From fae0f3dde83b7a54441f7a5bb0fc45d354fe81ce Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Mon, 17 Feb 2025 18:10:33 +0800 Subject: [PATCH 01/14] [tests] fix `EsmModelIntegrationTest::test_inference_bitsandbytes` (#36225) fix failed test --- tests/models/esm/test_modeling_esm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/esm/test_modeling_esm.py b/tests/models/esm/test_modeling_esm.py index 7504ec2462eb..7be71c22c783 100644 --- a/tests/models/esm/test_modeling_esm.py +++ b/tests/models/esm/test_modeling_esm.py @@ -335,13 +335,13 @@ def test_inference_no_head(self): def test_inference_bitsandbytes(self): model = EsmForMaskedLM.from_pretrained("facebook/esm2_t36_3B_UR50D", load_in_8bit=True) - input_ids = torch.tensor([[0, 6, 4, 13, 5, 4, 16, 12, 11, 7, 2]]) + input_ids = torch.tensor([[0, 6, 4, 13, 5, 4, 16, 12, 11, 7, 2]]).to(model.device) # Just test if inference works with torch.no_grad(): _ = model(input_ids)[0] model = EsmForMaskedLM.from_pretrained("facebook/esm2_t36_3B_UR50D", load_in_4bit=True) - input_ids = torch.tensor([[0, 6, 4, 13, 5, 4, 16, 12, 11, 7, 2]]) + input_ids = torch.tensor([[0, 6, 4, 13, 5, 4, 16, 12, 11, 7, 2]]).to(model.device) # Just test if inference works _ = model(input_ids)[0] From 23d6095e8f1ea3b47f2744e2ff84e570046dd87a Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 17 Feb 2025 11:49:07 +0100 Subject: [PATCH 02/14] Fix `LlavaForConditionalGenerationModelTest::test_config` after #36077 (#36230) fix Co-authored-by: ydshieh --- tests/models/llava/test_modeling_llava.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index b47423a02ec7..347b147c769a 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -198,6 +198,13 @@ def setUp(self): ) def test_config(self): + # overwritten from `tests/test_configuration_common.py::ConfigTester` after #36077 + # TODO: avoid overwritten once there is a better fix for #36077 + def check_config_can_be_init_without_params(): + config = self.config_tester.config_class() + self.config_tester.parent.assertIsNotNone(config) + + self.config_tester.check_config_can_be_init_without_params = check_config_can_be_init_without_params self.config_tester.run_common_tests() # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs From 936aeb70abe14cfbc70fb4d4d0c7b1864a21cc54 Mon Sep 17 00:00:00 2001 From: ivarflakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Mon, 17 Feb 2025 11:50:49 +0100 Subject: [PATCH 03/14] AMD DeepSpeed image additional HIP dependencies (#36195) * Add hipsolver and hipblastlt as dependencies * Upgrade torch libs with rocm6.2.4 index --- .../Dockerfile | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile index a8edb8ff03eb..f70b15494100 100644 --- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile @@ -2,10 +2,10 @@ FROM rocm/dev-ubuntu-22.04:6.2.4 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive -ARG PYTORCH='2.5.1' -ARG TORCH_VISION='0.20.0' -ARG TORCH_AUDIO='2.5.0' -ARG ROCM='6.2' +ARG PYTORCH='2.6.0' +ARG TORCH_VISION='0.21.0' +ARG TORCH_AUDIO='2.6.0' +ARG ROCM='6.2.4' RUN apt update && \ apt install -y 
--no-install-recommends \ @@ -16,9 +16,11 @@ RUN apt update && \ python-is-python3 \ rocrand-dev \ rocthrust-dev \ + rocblas-dev \ + hipsolver-dev \ hipsparse-dev \ hipblas-dev \ - rocblas-dev && \ + hipblaslt-dev && \ apt clean && \ rm -rf /var/lib/apt/lists/* From dad513e0c2a93c6f261be73dd0f648acb8a25c2b Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 17 Feb 2025 13:55:03 +0000 Subject: [PATCH 04/14] [generate] remove cache v4.47 deprecations (#36212) --- src/transformers/cache_utils.py | 26 +++++-------------- src/transformers/generation/utils.py | 13 ++++------ tests/models/phimoe/test_modeling_phimoe.py | 2 ++ .../qwen2_5_vl/test_modeling_qwen2_5_vl.py | 5 ++++ 4 files changed, 18 insertions(+), 28 deletions(-) diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index 427e1d4e3aea..07d4654c35aa 100644 --- a/src/transformers/cache_utils.py +++ b/src/transformers/cache_utils.py @@ -363,8 +363,7 @@ class DynamicCache(Cache): ``` """ - @deprecate_kwarg("num_hidden_layers", version="4.47.0") - def __init__(self, num_hidden_layers: Optional[int] = None) -> None: + def __init__(self) -> None: super().__init__() self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen self.key_cache: List[torch.Tensor] = [] @@ -466,10 +465,7 @@ def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]: return legacy_cache @classmethod - @deprecate_kwarg("num_hidden_layers", version="4.47.0") - def from_legacy_cache( - cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, num_hidden_layers: int = None - ) -> "DynamicCache": + def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache": """Converts a cache in the legacy cache format into an equivalent `DynamicCache`. Used for backward compatibility.""" cache = cls() @@ -495,10 +491,7 @@ def crop(self, max_length: int): self.key_cache[idx] = self.key_cache[idx][..., :max_length, :] self.value_cache[idx] = self.value_cache[idx][..., :max_length, :] - @deprecate_kwarg("num_hidden_layers", version="4.47.0") - def batch_split( - self, full_batch_size: int, split_size: int, num_hidden_layers: int = None - ) -> List["DynamicCache"]: + def batch_split(self, full_batch_size: int, split_size: int) -> List["DynamicCache"]: """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by `_split_model_inputs()` in `generation.utils`""" out = [] @@ -511,8 +504,7 @@ def batch_split( return out @classmethod - @deprecate_kwarg("num_hidden_layers", version="4.47.0") - def from_batch_splits(cls, splits: List["DynamicCache"], num_hidden_layers: int = None) -> "DynamicCache": + def from_batch_splits(cls, splits: List["DynamicCache"]) -> "DynamicCache": """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in `generation.utils`""" cache = cls() @@ -1527,10 +1519,7 @@ def crop(self, maximum_length: int): self.check_dynamic_cache(self.crop.__name__) self.self_attention_cache.crop(maximum_length) - @deprecate_kwarg("num_hidden_layers", version="4.47.0") - def batch_split( - self, full_batch_size: int, split_size: int, num_hidden_layers: int = None - ) -> "List[EncoderDecoderCache]": + def batch_split(self, full_batch_size: int, split_size: int) -> "List[EncoderDecoderCache]": """Split the current instance into a list of `DynamicCache` by the batch size. 
This will be used by `_split_model_inputs()` in `generation.utils`""" self.check_dynamic_cache(self.batch_split.__name__) @@ -1543,10 +1532,7 @@ def batch_split( return out @classmethod - @deprecate_kwarg("num_hidden_layers", version="4.47.0") - def from_batch_splits( - cls, splits: List["EncoderDecoderCache"], num_hidden_layers: int = None - ) -> "EncoderDecoderCache": + def from_batch_splits(cls, splits: List["EncoderDecoderCache"]) -> "EncoderDecoderCache": """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in `generation.utils`""" self_attention_cache = DynamicCache() diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index db8bbe50e508..9760b37dea3c 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -4520,7 +4520,7 @@ def _ranking_fast( return selected_idx -def _split(data, full_batch_size: int, num_hidden_layers: int, split_size: int = None): +def _split(data, full_batch_size: int, split_size: int = None): """ Takes care of three cases: 1. data is a tensor: e.g. last_hidden_state, pooler_output etc. split them on the batch_size dim @@ -4538,7 +4538,7 @@ def _split(data, full_batch_size: int, num_hidden_layers: int, split_size: int = elif isinstance(data, DynamicCache) or ( isinstance(data, EncoderDecoderCache) and isinstance(data.self_attention_cache, DynamicCache) ): - return data.batch_split(full_batch_size, split_size, num_hidden_layers) + return data.batch_split(full_batch_size, split_size) elif isinstance(data, tuple): # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example) if isinstance(data[0], tuple): @@ -4591,11 +4591,9 @@ def _split_model_inputs( keys_to_ignore = ["cache_position", "encoder_outputs", "logits_to_keep"] non_bool_keys = [k for k in keys if not isinstance(model_input[k], bool) and k not in keys_to_ignore] - num_hidden_layers = config.get_text_config().num_hidden_layers - # we split the tensors and tuples of tensors data_split_list = [ - {k: _split(model_input[k], full_batch_size, num_hidden_layers, split_size)[i] for k in non_bool_keys} + {k: _split(model_input[k], full_batch_size, split_size)[i] for k in non_bool_keys} for i in range(full_batch_size // split_size) ] # bool values are the same and replicated for each split @@ -4632,7 +4630,6 @@ def stack_model_outputs(model_outputs: List[ModelOutput], config: PretrainedConf # Infer the class from the first object in the list model_output_cls = type(model_outputs[0]) - num_hidden_layers = config.get_text_config().num_hidden_layers # Ensure all objects are of the same type if not all(isinstance(obj, model_output_cls) for obj in model_outputs): @@ -4649,9 +4646,9 @@ def _concat(data): return torch.cat(data, dim=0) # New cache format elif isinstance(data[0], DynamicCache): - return DynamicCache.from_batch_splits(data, num_hidden_layers=num_hidden_layers) + return DynamicCache.from_batch_splits(data) elif isinstance(data[0], EncoderDecoderCache): - return EncoderDecoderCache.from_batch_splits(data, num_hidden_layers=num_hidden_layers) + return EncoderDecoderCache.from_batch_splits(data) elif isinstance(data[0], tuple): # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example) if isinstance(data[0][0], tuple): diff --git a/tests/models/phimoe/test_modeling_phimoe.py b/tests/models/phimoe/test_modeling_phimoe.py index 40448a0a85e8..b3dc1eba6826 100644 --- a/tests/models/phimoe/test_modeling_phimoe.py +++ 
b/tests/models/phimoe/test_modeling_phimoe.py @@ -22,6 +22,7 @@ from transformers import PhimoeConfig, StaticCache, is_torch_available, set_seed from transformers.testing_utils import ( + is_flaky, require_torch, slow, torch_device, @@ -449,6 +450,7 @@ def test_model_rope_scaling_from_config(self, scaling_type): self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) @parameterized.expand([("longrope",)]) + @is_flaky() # TODO (joao): unify rope tests in the mixin def test_model_rope_scaling_short_long_factor(self, scaling_type): config, _ = self.model_tester.prepare_config_and_inputs_for_common() n_factors = config.hidden_size // config.num_key_value_heads // 2 diff --git a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py index cfcfd3a620c9..dcb0816a0d0a 100644 --- a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py +++ b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py @@ -27,6 +27,7 @@ is_vision_available, ) from transformers.testing_utils import ( + is_flaky, require_flash_attn, require_torch, require_torch_gpu, @@ -347,6 +348,10 @@ def test_generate_from_inputs_embeds_with_static_cache(self): def test_generate_compile_fullgraph(self): pass + @is_flaky() # TODO (joao/raushan): Investigate why this test is flaky on this model + def test_prompt_lookup_decoding_matches_greedy_search(self): + super().test_prompt_lookup_decoding_matches_greedy_search() + @require_torch class Qwen2_5_VLIntegrationTest(unittest.TestCase): From 7ec35bc3bdc160b9461b271f822980a292ef893b Mon Sep 17 00:00:00 2001 From: ivarflakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Mon, 17 Feb 2025 14:57:50 +0100 Subject: [PATCH 05/14] Add missing atol to torch.testing.assert_close where rtol is specified (#36234) --- tests/models/informer/test_modeling_informer.py | 2 +- tests/models/patchtst/test_modeling_patchtst.py | 2 +- .../test_modeling_time_series_transformer.py | 2 +- tests/models/wavlm/test_modeling_wavlm.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py index 5415717cd4b8..4551abd2146e 100644 --- a/tests/models/informer/test_modeling_informer.py +++ b/tests/models/informer/test_modeling_informer.py @@ -546,4 +546,4 @@ def test_seq_to_seq_generation(self): expected_slice = torch.tensor([3400.8005, 4289.2637, 7101.9209], device=torch_device) mean_prediction = outputs.sequences.mean(dim=1) - torch.testing.assert_close(mean_prediction[0, -3:], expected_slice, rtol=1e-1) + torch.testing.assert_close(mean_prediction[0, -3:], expected_slice, rtol=1e-1, atol=1e-1) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 0f6f019dc3ef..0956386f0d3f 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -385,4 +385,4 @@ def test_regression_generation(self): device=torch_device, ) mean_prediction = outputs.sequences.mean(dim=1) - torch.testing.assert_close(mean_prediction[-5:], expected_slice, rtol=TOLERANCE) + torch.testing.assert_close(mean_prediction[-5:], expected_slice, rtol=TOLERANCE, atol=TOLERANCE) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index c886bb08856c..8dcdfd8ae7d9 100644 --- 
a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -554,4 +554,4 @@ def test_seq_to_seq_generation(self): expected_slice = torch.tensor([2825.2749, 3584.9207, 6763.9951], device=torch_device) mean_prediction = outputs.sequences.mean(dim=1) - torch.testing.assert_close(mean_prediction[0, -3:], expected_slice, rtol=1e-1) + torch.testing.assert_close(mean_prediction[0, -3:], expected_slice, rtol=1e-1, atol=1e-1) diff --git a/tests/models/wavlm/test_modeling_wavlm.py b/tests/models/wavlm/test_modeling_wavlm.py index ed02c6aa1419..cf20726ff3e4 100644 --- a/tests/models/wavlm/test_modeling_wavlm.py +++ b/tests/models/wavlm/test_modeling_wavlm.py @@ -549,7 +549,7 @@ def test_inference_large(self): [[[0.2122, 0.0500], [0.2118, 0.0563]], [[0.1353, 0.1818], [0.2453, 0.0595]]] ) - torch.testing.assert_close(hidden_states_slice, EXPECTED_HIDDEN_STATES_SLICE, rtol=5e-2) + torch.testing.assert_close(hidden_states_slice, EXPECTED_HIDDEN_STATES_SLICE, rtol=5e-2, atol=5e-2) def test_inference_diarization(self): model = WavLMForAudioFrameClassification.from_pretrained("microsoft/wavlm-base-plus-sd").to(torch_device) From c877c9fa5bc5ef32310dd2ba5115e0e9d9862f95 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 17 Feb 2025 15:21:20 +0100 Subject: [PATCH 06/14] v4.45.0-dev0 --- examples/flax/question-answering/run_qa.py | 2 +- .../speech-recognition/run_flax_speech_recognition_seq2seq.py | 2 +- examples/flax/text-classification/run_flax_glue.py | 2 +- examples/flax/token-classification/run_flax_ner.py | 2 +- .../pytorch/audio-classification/run_audio_classification.py | 2 +- examples/pytorch/contrastive-image-text/run_clip.py | 2 +- .../pytorch/image-classification/run_image_classification.py | 2 +- .../image-classification/run_image_classification_no_trainer.py | 2 +- examples/pytorch/image-pretraining/run_mae.py | 2 +- examples/pytorch/image-pretraining/run_mim.py | 2 +- examples/pytorch/image-pretraining/run_mim_no_trainer.py | 2 +- .../pytorch/instance-segmentation/run_instance_segmentation.py | 2 +- .../run_instance_segmentation_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_clm.py | 2 +- examples/pytorch/language-modeling/run_clm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_fim.py | 2 +- examples/pytorch/language-modeling/run_fim_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_mlm.py | 2 +- examples/pytorch/language-modeling/run_mlm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_plm.py | 2 +- examples/pytorch/multiple-choice/run_swag.py | 2 +- examples/pytorch/multiple-choice/run_swag_no_trainer.py | 2 +- examples/pytorch/object-detection/run_object_detection.py | 2 +- .../pytorch/object-detection/run_object_detection_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa.py | 2 +- examples/pytorch/question-answering/run_qa_beam_search.py | 2 +- .../pytorch/question-answering/run_qa_beam_search_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa_no_trainer.py | 2 +- examples/pytorch/question-answering/run_seq2seq_qa.py | 2 +- .../pytorch/semantic-segmentation/run_semantic_segmentation.py | 2 +- .../run_semantic_segmentation_no_trainer.py | 2 +- .../pytorch/speech-recognition/run_speech_recognition_ctc.py | 2 +- .../speech-recognition/run_speech_recognition_ctc_adapter.py | 2 +- .../speech-recognition/run_speech_recognition_seq2seq.py | 2 +- examples/pytorch/summarization/run_summarization.py | 2 +- 
examples/pytorch/summarization/run_summarization_no_trainer.py | 2 +- examples/pytorch/text-classification/run_classification.py | 2 +- examples/pytorch/text-classification/run_glue.py | 2 +- examples/pytorch/text-classification/run_glue_no_trainer.py | 2 +- examples/pytorch/text-classification/run_xnli.py | 2 +- examples/pytorch/token-classification/run_ner.py | 2 +- examples/pytorch/token-classification/run_ner_no_trainer.py | 2 +- examples/pytorch/translation/run_translation.py | 2 +- examples/pytorch/translation/run_translation_no_trainer.py | 2 +- examples/tensorflow/contrastive-image-text/run_clip.py | 2 +- .../tensorflow/image-classification/run_image_classification.py | 2 +- examples/tensorflow/multiple-choice/run_swag.py | 2 +- examples/tensorflow/question-answering/run_qa.py | 2 +- examples/tensorflow/summarization/run_summarization.py | 2 +- examples/tensorflow/text-classification/run_glue.py | 2 +- examples/tensorflow/translation/run_translation.py | 2 +- setup.py | 2 +- src/transformers/__init__.py | 2 +- 53 files changed, 53 insertions(+), 53 deletions(-) diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index 692a43f9d23c..20db5fd02fea 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -61,7 +61,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py index ae106c726431..81a7d49765c2 100644 --- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py +++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risk. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt") diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index 0c4edda3bd4a..71fdd16f3e92 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index bc83fb53e212..44f43f3f64a5 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
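
For context on the many identical one-line changes in this patch: every example script opens with the guard shown in the surrounding hunks, and the real helper comes from `transformers.utils`. A minimal sketch of what such a guard does, assuming only that it compares the installed package version against a required minimum and raises when the install is too old (this is an illustration, not the actual helper):

    # version_guard.py - illustrative sketch of a minimal-version check
    from importlib.metadata import version as installed_version

    from packaging.version import parse


    def check_min_version(min_version: str, package: str = "transformers") -> None:
        """Raise ImportError if `package` is older than `min_version` (sketch, not the real helper)."""
        current = parse(installed_version(package))
        if current < parse(min_version):
            raise ImportError(
                f"This example requires {package}>={min_version}, but {current} is installed. "
                f"Upgrade with: pip install -U {package}"
            )


    # Mirrors the call each example script makes; the hunks in this patch simply
    # bump the string passed here to the new dev version.
    check_min_version("4.50.0.dev0")
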
-check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index d9cb0187e4d8..986fd94fa7fb 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -45,7 +45,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index f8170bb416dd..28880b91d86f 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 32d85b7d98da..36157533130b 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 0dc3e11f08ab..a20803cede42 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index 6308e250f5c3..6e7244f43300 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 3721dc267c22..9b94e9140ed2 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index 0d8496bb3fed..f7624a3d3989 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py index 306e8085f676..aa5114b35c95 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py @@ -46,7 +46,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py index e9cd01610bb7..d36163a307b8 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py @@ -52,7 +52,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 54098f5a7dd6..60b8ea1f6680 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index e35e4e7d907e..456062a3c3f7 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index e94690eaa7fc..a7c84467ae1c 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -58,7 +58,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index 64e340a62a76..c8aae5e0e6c4 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index fde2980d3ab5..9cac0b363236 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -54,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 40265efcfdae..a98f31bcdbac 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") logger = get_logger(__name__) require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 6ef17ebb9b6a..1ec9182c9457 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index cc632480fbdd..79f103b0f1a3 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 4119342163d6..8dcd54908ff9 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -54,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index 095b41a6a491..464688054b31 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt") diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py index b7ca051949e1..6d537ea4dbf6 100644 --- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py +++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") logging.basicConfig(level=logging.INFO) logger = get_logger(__name__) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 81fcc7b8b70b..80dc2dc405ae 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index 4b199a9e8990..66c4abd4c48d 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 312d8b389dd6..11871e0bf630 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index da448c37f2c4..d0462178ef66 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index cd6204c467c2..8442edcc2189 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index 0551b5f61f10..09812e6308a1 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 8a0d9de748f9..03c78b482e7a 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 70fc035fabb8..47352e22bccb 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index a6e3d9f7f33f..062fddffb18e 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 6d3950802c83..0d222ff61a3f 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index 93036a7e03ac..1b28275e419a 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index c2201840cacc..b598183f26d3 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index fef77f4108a3..cfee8ba50bdc 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 9f1c93f3df2e..b5d5aeb77063 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 61bd746f0782..2421da77eed2 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index 0c7d2c44b878..7927c03a69c6 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 81340dc2eef8..21a5e0c98d7c 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 5b8c4c80ef89..8fe01b08f871 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index 1d8ff8d05e53..4fbf11790a43 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index 502132143046..f062303b5882 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index bc656ba6ff1c..2f95376b8286 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version( "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt" diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index 6e4be8dcb056..f79b8896b22d 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index 4c9a3ad78913..893051bc67d0 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index 46f0470d1c56..55ce7e878e86 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -62,7 +62,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 3fd823ec8c08..cadfec2f981e 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -53,7 +53,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index 0fbe3790e058..b2097698a212 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 148ee55f26c3..0c41e8eb015d 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -56,7 +56,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.49.0.dev0") +check_min_version("4.50.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/setup.py b/setup.py index 39ddbf7f1852..cb77a167442f 100644 --- a/setup.py +++ b/setup.py @@ -437,7 +437,7 @@ def run(self): setup( name="transformers", - version="4.49.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.50.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 72728b2e2797..f9dcee68c2eb 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). 
-__version__ = "4.49.0.dev0" +__version__ = "4.50.0.dev0" from typing import TYPE_CHECKING From 55493f13906acaa6fc1b90601098c50c3d0cb6a5 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 17 Feb 2025 14:59:22 +0000 Subject: [PATCH 07/14] [tests] remove tf/flax tests in `/generation` (#36235) --- tests/generation/test_flax_logits_process.py | 343 --------- tests/generation/test_flax_utils.py | 313 -------- tests/generation/test_framework_agnostic.py | 688 ------------------ tests/generation/test_tf_logits_process.py | 487 ------------- tests/generation/test_tf_utils.py | 245 ------- tests/generation/test_utils.py | 452 ++++++++++-- tests/models/bart/test_modeling_flax_bart.py | 3 +- .../test_modeling_flax_blenderbot.py | 3 +- .../test_modeling_flax_blenderbot_small.py | 3 +- .../models/bloom/test_modeling_flax_bloom.py | 3 +- .../models/gemma/test_modeling_flax_gemma.py | 3 +- tests/models/gpt2/test_modeling_flax_gpt2.py | 3 +- .../gpt_neo/test_modeling_flax_gpt_neo.py | 3 +- tests/models/gptj/test_modeling_flax_gptj.py | 3 +- .../models/llama/test_modeling_flax_llama.py | 3 +- .../longt5/test_modeling_flax_longt5.py | 3 +- .../marian/test_modeling_flax_marian.py | 3 +- .../models/mbart/test_modeling_flax_mbart.py | 3 +- .../mistral/test_modeling_flax_mistral.py | 3 +- .../mistral/test_modeling_tf_mistral.py | 3 +- tests/models/opt/test_modeling_flax_opt.py | 3 +- .../test_modeling_tf_speech_to_text.py | 76 -- tests/models/t5/test_modeling_flax_t5.py | 3 +- .../whisper/test_modeling_tf_whisper.py | 121 --- tests/models/xglm/test_modeling_flax_xglm.py | 3 +- tests/test_modeling_tf_common.py | 315 -------- 26 files changed, 428 insertions(+), 2663 deletions(-) delete mode 100644 tests/generation/test_flax_logits_process.py delete mode 100644 tests/generation/test_flax_utils.py delete mode 100644 tests/generation/test_framework_agnostic.py delete mode 100644 tests/generation/test_tf_logits_process.py delete mode 100644 tests/generation/test_tf_utils.py diff --git a/tests/generation/test_flax_logits_process.py b/tests/generation/test_flax_logits_process.py deleted file mode 100644 index bd5f8f648cbb..000000000000 --- a/tests/generation/test_flax_logits_process.py +++ /dev/null @@ -1,343 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Team Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a clone of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -import numpy as np - -from transformers import is_flax_available -from transformers.testing_utils import require_flax - -from ..test_modeling_flax_common import ids_tensor - - -if is_flax_available(): - import jax - import jax.numpy as jnp - - from transformers.generation import ( - FlaxForcedBOSTokenLogitsProcessor, - FlaxForcedEOSTokenLogitsProcessor, - FlaxLogitsProcessorList, - FlaxMinLengthLogitsProcessor, - FlaxNoRepeatNGramLogitsProcessor, - FlaxTemperatureLogitsWarper, - FlaxTopKLogitsWarper, - FlaxTopPLogitsWarper, - ) - - -@require_flax -class LogitsProcessorTest(unittest.TestCase): - def _get_uniform_logits(self, batch_size: int, length: int): - scores = jnp.ones((batch_size, length)) / length - return scores - - def test_temperature_dist_warper(self): - input_ids = None - length = 20 - - scores = self._get_uniform_logits(batch_size=2, length=length) - - # tweak scores to not be uniform anymore - scores = scores.at[1, 5].set((1 / length) + 0.1) # peak, 1st batch - scores = scores.at[1, 10].set((1 / length) - 0.4) # valley, 1st batch - - # compute softmax - probs = jax.nn.softmax(scores, axis=-1) - - temp_dist_warper_sharper = FlaxTemperatureLogitsWarper(temperature=0.5) - temp_dist_warper_smoother = FlaxTemperatureLogitsWarper(temperature=1.3) - - warped_prob_sharp = jax.nn.softmax(temp_dist_warper_sharper(input_ids, scores.copy(), cur_len=None), axis=-1) - warped_prob_smooth = jax.nn.softmax(temp_dist_warper_smoother(input_ids, scores.copy(), cur_len=None), axis=-1) - - # uniform distribution stays uniform - self.assertTrue(jnp.allclose(probs[0, :], warped_prob_sharp[0, :], atol=1e-3)) - self.assertTrue(jnp.allclose(probs[0, :], warped_prob_smooth[0, :], atol=1e-3)) - - # sharp peaks get higher, valleys get lower - self.assertLess(probs[1, :].max(), warped_prob_sharp[1, :].max()) - self.assertGreater(probs[1, :].min(), warped_prob_sharp[1, :].min()) - - # smooth peaks get lower, valleys get higher - self.assertGreater(probs[1, :].max(), warped_prob_smooth[1, :].max()) - self.assertLess(probs[1, :].min(), warped_prob_smooth[1, :].min()) - - def test_top_k_dist_warper(self): - input_ids = None - vocab_size = 10 - batch_size = 2 - - # create ramp distribution - ramp_logits = np.broadcast_to(np.arange(vocab_size)[None, :], (batch_size, vocab_size)).copy() - ramp_logits[1:, : vocab_size // 2] = ramp_logits[1:, : vocab_size // 2] + vocab_size - - top_k_warp = FlaxTopKLogitsWarper(3) - - scores = top_k_warp(input_ids, ramp_logits, cur_len=None) - - # check that correct tokens are filtered - self.assertListEqual(jnp.isinf(scores[0]).tolist(), 7 * [True] + 3 * [False]) - self.assertListEqual(jnp.isinf(scores[1]).tolist(), 2 * [True] + 3 * [False] + 5 * [True]) - - # check special case - length = 5 - top_k_warp_safety_check = FlaxTopKLogitsWarper(top_k=1, filter_value=0.0, min_tokens_to_keep=3) - - ramp_logits = np.broadcast_to(np.arange(length)[None, :], (batch_size, length)).copy() - scores = top_k_warp_safety_check(input_ids, ramp_logits, cur_len=None) - - # min_tokens overwrites k: 3 tokens are kept => 2 tokens are nullified - self.assertListEqual((scores == 0.0).sum(axis=-1).tolist(), [2, 2]) - - def test_top_p_dist_warper(self): - input_ids = None - vocab_size = 10 - batch_size = 2 - - # create distribution and take log (inverse to Softmax as taken in TopPLogitsWarper) - dist = np.log(np.array([[0.3, 0.1, 0.1, 0.5], [0.15, 0.3, 0.3, 0.25]])) - - top_p_warp = FlaxTopPLogitsWarper(0.8) - filtered_dist = np.exp(top_p_warp(input_ids, dist, cur_len=None)) - - # 
dist should be filtered to keep min num values so that sum is >= top_p - # exp (-inf) => 0 - EXPECTED_FILTERED_DIST = np.array([[0.3, 0.0, 0.0, 0.5], [0.0, 0.3, 0.3, 0.25]]) - self.assertTrue(np.allclose(filtered_dist, EXPECTED_FILTERED_DIST, atol=1e-3)) - - # check edge cases with negative and extreme logits - ramp_logits = np.broadcast_to(np.arange(vocab_size)[None, :], (batch_size, vocab_size)).copy() - ( - vocab_size // 2 - ) - - # make ramp_logits more extreme - ramp_logits[1] = ramp_logits[1] * 100.0 - - # make sure at least 2 tokens are kept - top_p_warp = FlaxTopPLogitsWarper(0.9, min_tokens_to_keep=2, filter_value=0.0) - filtered_dist = top_p_warp(input_ids, ramp_logits, cur_len=None) - - # first batch should keep three tokens, second batch would keep only 1, but due to `min_tokens_to_keep=2` keeps 2. - self.assertListEqual((filtered_dist != 0.0).sum(axis=-1).tolist(), [3, 2]) - - def test_min_length_dist_processor(self): - vocab_size = 20 - batch_size = 4 - eos_token_id = 0 - - min_dist_processor = FlaxMinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id) - - # check that min length is applied at length 5 - input_ids = ids_tensor((batch_size, 20), vocab_size=20) - cur_len = 5 - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_before_min_length = min_dist_processor(input_ids, scores, cur_len=cur_len) - self.assertListEqual(scores_before_min_length[:, eos_token_id].tolist(), 4 * [-float("inf")]) - - # check that min length is not applied anymore at length 15 - scores = self._get_uniform_logits(batch_size, vocab_size) - cur_len = 15 - scores_before_min_length = min_dist_processor(input_ids, scores, cur_len=cur_len) - self.assertFalse(jnp.isinf(scores_before_min_length).any()) - - def test_forced_bos_token_logits_processor(self): - vocab_size = 20 - batch_size = 4 - bos_token_id = 0 - - logits_processor = FlaxForcedBOSTokenLogitsProcessor(bos_token_id=bos_token_id) - - # check that all scores are -inf except the bos_token_id score - input_ids = ids_tensor((batch_size, 1), vocab_size=20) - cur_len = 1 - scores = self._get_uniform_logits(batch_size, vocab_size) - scores = logits_processor(input_ids, scores, cur_len=cur_len) - self.assertTrue(jnp.isneginf(scores[:, bos_token_id + 1 :]).all()) - self.assertListEqual(scores[:, bos_token_id].tolist(), 4 * [0]) # score for bos_token_id shold be zero - - # check that bos_token_id is not forced if current length is greater than 1 - cur_len = 3 - scores = self._get_uniform_logits(batch_size, vocab_size) - scores = logits_processor(input_ids, scores, cur_len=cur_len) - self.assertFalse(jnp.isinf(scores).any()) - - def test_forced_eos_token_logits_processor(self): - vocab_size = 20 - batch_size = 4 - eos_token_id = 0 - max_length = 5 - - logits_processor = FlaxForcedEOSTokenLogitsProcessor(max_length=max_length, eos_token_id=eos_token_id) - - # check that all scores are -inf except the eos_token_id when max_length is reached - input_ids = ids_tensor((batch_size, 4), vocab_size=20) - cur_len = 4 - scores = self._get_uniform_logits(batch_size, vocab_size) - scores = logits_processor(input_ids, scores, cur_len=cur_len) - self.assertTrue(jnp.isneginf(scores[:, eos_token_id + 1 :]).all()) - self.assertListEqual(scores[:, eos_token_id].tolist(), 4 * [0]) # score for eos_token_id should be zero - - # check that eos_token_id is not forced if max_length is not reached - cur_len = 3 - scores = self._get_uniform_logits(batch_size, vocab_size) - scores = logits_processor(input_ids, scores, cur_len=cur_len) - 
self.assertFalse(jnp.isinf(scores).any()) - - def test_no_repeat_ngram_dist_processor(self): - vocab_size = 3 - batch_size = 2 - - cur_len = 4 - input_ids = np.array([[1, 1, 2, 1], [0, 1, 0, 1]], dtype="i4") - scores = self._get_uniform_logits(batch_size, vocab_size) - - no_repeat_proc_2_gram = FlaxNoRepeatNGramLogitsProcessor(2) - no_repeat_proc_3_gram = FlaxNoRepeatNGramLogitsProcessor(3) - - filtered_scores_2_gram = no_repeat_proc_2_gram(input_ids, scores, cur_len=cur_len) - filtered_scores_3_gram = no_repeat_proc_3_gram(input_ids, scores, cur_len=cur_len) - - # 2-gram would forbid 2nd and 3rd token (1,2) at 1st batch and 1st token (0) at 2nd batch - self.assertListEqual(jnp.isinf(filtered_scores_2_gram).tolist(), [[False, True, True], [True, False, False]]) - - # 3-gram would forbid no token at 1st batch and 1st token (0) at 2nd batch - self.assertListEqual(jnp.isinf(filtered_scores_3_gram).tolist(), [[False, False, False], [True, False, False]]) - - def test_processor_list(self): - batch_size = 4 - sequence_length = 10 - vocab_size = 15 - eos_token_id = 2 - bos_token_id = 1 - max_length = 15 - - # dummy input_ids and scores - input_ids = ids_tensor((batch_size, sequence_length), vocab_size) - input_ids_comp = input_ids.copy() - - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_comp = scores.copy() - - # instantiate all dist processors - temp_dist_warp = FlaxTemperatureLogitsWarper(temperature=0.5) - top_k_warp = FlaxTopKLogitsWarper(3) - top_p_warp = FlaxTopPLogitsWarper(0.8) - no_repeat_proc = FlaxNoRepeatNGramLogitsProcessor(2) - - # instantiate all logits processors - min_dist_proc = FlaxMinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id) - bos_dist_proc = FlaxForcedBOSTokenLogitsProcessor(bos_token_id=bos_token_id) - eos_dist_proc = FlaxForcedEOSTokenLogitsProcessor(max_length=max_length, eos_token_id=eos_token_id) - - cur_len = 10 - - # no processor list - scores = temp_dist_warp(input_ids, scores, cur_len=cur_len) - scores = top_k_warp(input_ids, scores, cur_len=cur_len) - scores = top_p_warp(input_ids, scores, cur_len=cur_len) - scores = min_dist_proc(input_ids, scores, cur_len=cur_len) - scores = bos_dist_proc(input_ids, scores, cur_len=cur_len) - scores = eos_dist_proc(input_ids, scores, cur_len=cur_len) - scores = no_repeat_proc(input_ids, scores, cur_len=cur_len) - - # with processor list - processor = FlaxLogitsProcessorList( - [ - temp_dist_warp, - top_k_warp, - top_p_warp, - min_dist_proc, - bos_dist_proc, - eos_dist_proc, - no_repeat_proc, - ] - ) - scores_comp = processor(input_ids, scores_comp, cur_len=cur_len) - - # scores should be equal - self.assertTrue(jnp.allclose(scores, scores_comp, atol=1e-3)) - - # input_ids should never be changed - self.assertListEqual(input_ids.tolist(), input_ids_comp.tolist()) - - def test_processor_list_jitted(self): - batch_size = 4 - sequence_length = 10 - vocab_size = 15 - eos_token_id = 2 - bos_token_id = 1 - max_length = 15 - - # dummy input_ids and scores - input_ids = ids_tensor((batch_size, sequence_length), vocab_size) - input_ids_comp = input_ids.copy() - - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_comp = scores.copy() - - # instantiate all dist processors - temp_dist_warp = FlaxTemperatureLogitsWarper(temperature=0.5) - top_k_warp = FlaxTopKLogitsWarper(3) - top_p_warp = FlaxTopPLogitsWarper(0.8) - no_repeat_proc = FlaxNoRepeatNGramLogitsProcessor(2) - - # instantiate all logits processors - min_dist_proc = FlaxMinLengthLogitsProcessor(min_length=10, 
eos_token_id=eos_token_id) - bos_dist_proc = FlaxForcedBOSTokenLogitsProcessor(bos_token_id=bos_token_id) - eos_dist_proc = FlaxForcedEOSTokenLogitsProcessor(max_length=max_length, eos_token_id=eos_token_id) - - cur_len = 10 - - # no processor list - def run_no_processor_list(input_ids, scores, cur_len): - scores = temp_dist_warp(input_ids, scores, cur_len=cur_len) - scores = top_k_warp(input_ids, scores, cur_len=cur_len) - scores = top_p_warp(input_ids, scores, cur_len=cur_len) - scores = min_dist_proc(input_ids, scores, cur_len=cur_len) - scores = bos_dist_proc(input_ids, scores, cur_len=cur_len) - scores = eos_dist_proc(input_ids, scores, cur_len=cur_len) - scores = no_repeat_proc(input_ids, scores, cur_len=cur_len) - return scores - - # with processor list - def run_processor_list(input_ids, scores, cur_len): - processor = FlaxLogitsProcessorList( - [ - temp_dist_warp, - top_k_warp, - top_p_warp, - min_dist_proc, - bos_dist_proc, - eos_dist_proc, - no_repeat_proc, - ] - ) - scores = processor(input_ids, scores, cur_len=cur_len) - return scores - - jitted_run_no_processor_list = jax.jit(run_no_processor_list) - jitted_run_processor_list = jax.jit(run_processor_list) - - scores = jitted_run_no_processor_list(input_ids, scores, cur_len) - scores_comp = jitted_run_processor_list(input_ids, scores_comp, cur_len) - - # scores should be equal - self.assertTrue(jnp.allclose(scores, scores_comp, atol=1e-3)) - - # input_ids should never be changed - self.assertListEqual(input_ids.tolist(), input_ids_comp.tolist()) diff --git a/tests/generation/test_flax_utils.py b/tests/generation/test_flax_utils.py deleted file mode 100644 index 302617c6688d..000000000000 --- a/tests/generation/test_flax_utils.py +++ /dev/null @@ -1,313 +0,0 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
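The Flax processors and warpers exercised in the removed tests above are plain callables with the signature `(input_ids, scores, cur_len)`, and `FlaxLogitsProcessorList` simply applies them in order, so the whole chain stays jit-compatible. A minimal, illustrative sketch of that pattern (batch size, vocab size, temperature, and cutoff values are arbitrary, not taken from the tests):

```python
import jax
import jax.numpy as jnp

from transformers.generation import (
    FlaxLogitsProcessorList,
    FlaxTemperatureLogitsWarper,
    FlaxTopKLogitsWarper,
    FlaxTopPLogitsWarper,
)

batch_size, vocab_size, cur_len = 2, 16, 10
input_ids = jnp.zeros((batch_size, cur_len), dtype=jnp.int32)  # dummy ids; unused by these warpers
scores = jnp.ones((batch_size, vocab_size)) / vocab_size       # uniform logits

processors = FlaxLogitsProcessorList(
    [
        FlaxTemperatureLogitsWarper(temperature=0.7),
        FlaxTopKLogitsWarper(top_k=5),
        FlaxTopPLogitsWarper(top_p=0.9),
    ]
)

# The chained call is a pure function of its inputs, so it can be jit-compiled,
# which is what the jitted processor-list test above checks.
warp = jax.jit(lambda ids, s, length: processors(ids, s, cur_len=length))
warped_scores = warp(input_ids, scores, cur_len)
```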
- -import random -import unittest - -import numpy as np - -import transformers -from transformers import is_flax_available, is_torch_available -from transformers.testing_utils import is_pt_flax_cross_test, require_flax - - -if is_flax_available(): - import os - - import jax.numpy as jnp - from jax import jit - - from transformers import AutoTokenizer, FlaxAutoModelForCausalLM - from transformers.modeling_flax_pytorch_utils import load_flax_weights_in_pytorch_model - - os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "0.12" # assumed parallelism: 8 - - -if is_torch_available(): - import torch - - -def ids_tensor(shape, vocab_size, rng=None): - """Creates a random int32 tensor of the shape within the vocab size.""" - if rng is None: - rng = random.Random() - - total_dims = 1 - for dim in shape: - total_dims *= dim - - values = [] - for _ in range(total_dims): - values.append(rng.randint(0, vocab_size - 1)) - - output = np.array(values, dtype=jnp.int32).reshape(shape) - - return output - - -def random_attention_mask(shape, rng=None): - attn_mask = ids_tensor(shape, vocab_size=2, rng=rng) - # make sure that at least one token is attended to for each batch - attn_mask[:, -1] = 1 - return attn_mask - - -@require_flax -class FlaxGenerationTesterMixin: - model_tester = None - - def _get_input_ids_and_config(self): - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - - # cut to half length & take max batch_size 3 - max_batch_size = 2 - sequence_length = inputs["input_ids"].shape[-1] // 2 - input_ids = inputs["input_ids"][:max_batch_size, :sequence_length] - - attention_mask = jnp.ones_like(input_ids) - attention_mask = attention_mask[:max_batch_size, :sequence_length] - - # generate max 5 tokens - max_length = input_ids.shape[-1] + 5 - if config.eos_token_id is not None and config.pad_token_id is None: - # hack to allow generate for models such as GPT2 as is done in `generate()` - config.pad_token_id = config.eos_token_id - return config, input_ids, attention_mask, max_length - - @is_pt_flax_cross_test - def test_greedy_generate_pt_fx(self): - config, input_ids, _, max_length = self._get_input_ids_and_config() - config.do_sample = False - config.max_length = max_length - config.decoder_start_token_id = 0 - - for model_class in self.all_generative_model_classes: - flax_model = model_class(config) - - pt_model_class_name = model_class.__name__[4:] # Skip the "Flax" at the beginning - pt_model_class = getattr(transformers, pt_model_class_name) - pt_model = pt_model_class(config).eval() - pt_model = load_flax_weights_in_pytorch_model(pt_model, flax_model.params) - - # Generate max 5 tokens only otherwise seems to be numerical error accumulation - pt_model.generation_config.max_length = 5 - flax_model.generation_config.max_length = 5 - - flax_generation_outputs = flax_model.generate(input_ids).sequences - pt_generation_outputs = pt_model.generate(torch.tensor(input_ids, dtype=torch.long)) - - if flax_generation_outputs.shape[-1] > pt_generation_outputs.shape[-1]: - flax_generation_outputs = flax_generation_outputs[:, : pt_generation_outputs.shape[-1]] - - self.assertListEqual(pt_generation_outputs.numpy().tolist(), flax_generation_outputs.tolist()) - - def test_greedy_generate(self): - config, input_ids, _, max_length = self._get_input_ids_and_config() - config.do_sample = False - config.max_length = max_length - - for model_class in self.all_generative_model_classes: - model = model_class(config) - - generation_outputs = model.generate(input_ids).sequences - 
self.assertEqual(generation_outputs.shape[-1], max_length) - - jit_generate = jit(model.generate) - jit_generation_outputs = jit_generate(input_ids).sequences - - self.assertListEqual(generation_outputs.tolist(), jit_generation_outputs.tolist()) - - def test_sample_generate(self): - config, input_ids, _, max_length = self._get_input_ids_and_config() - config.do_sample = True - config.max_length = max_length - - for model_class in self.all_generative_model_classes: - model = model_class(config) - - generation_outputs = model.generate(input_ids).sequences - self.assertEqual(generation_outputs.shape[-1], max_length) - - jit_generate = jit(model.generate) - jit_generation_outputs = jit_generate(input_ids).sequences - - self.assertListEqual(generation_outputs.tolist(), jit_generation_outputs.tolist()) - - def test_beam_search_generate(self): - config, input_ids, _, max_length = self._get_input_ids_and_config() - config.do_sample = False - config.max_length = max_length - config.num_beams = 2 - - for model_class in self.all_generative_model_classes: - model = model_class(config) - - generation_outputs = model.generate(input_ids).sequences - self.assertEqual(generation_outputs.shape[-1], max_length) - - jit_generate = jit(model.generate) - jit_generation_outputs = jit_generate(input_ids).sequences - - self.assertListEqual(generation_outputs.tolist(), jit_generation_outputs.tolist()) - - def test_beam_search_generate_num_return_sequences(self): - config, input_ids, _, max_length = self._get_input_ids_and_config() - config.do_sample = False - config.max_length = max_length - config.num_beams = 2 - config.num_return_sequences = 2 - - for model_class in self.all_generative_model_classes: - model = model_class(config) - - generation_outputs = model.generate(input_ids).sequences - self.assertEqual(generation_outputs.shape[0], input_ids.shape[0] * config.num_return_sequences) - - def test_sample_generate_logits_warper(self): - config, input_ids, _, max_length = self._get_input_ids_and_config() - config.do_sample = True - config.max_length = max_length - config.temperature = 0.8 - config.top_k = 10 - config.top_p = 0.3 - config.min_length = 1 - config.forced_bos_token_id = 8 - config.forced_eos_token_id = 9 - - for model_class in self.all_generative_model_classes: - model = model_class(config) - - generation_outputs = model.generate(input_ids).sequences - self.assertEqual(generation_outputs.shape[-1], max_length) - - jit_generate = jit(model.generate) - jit_generation_outputs = jit_generate(input_ids).sequences - - self.assertListEqual(generation_outputs.tolist(), jit_generation_outputs.tolist()) - - def test_greedy_generate_logits_warper(self): - config, input_ids, _, max_length = self._get_input_ids_and_config() - config.max_length = max_length - config.min_length = 1 - config.forced_bos_token_id = 8 - config.forced_eos_token_id = 9 - - for model_class in self.all_generative_model_classes: - model = model_class(config) - - generation_outputs = model.generate(input_ids).sequences - self.assertEqual(generation_outputs.shape[-1], max_length) - - jit_generate = jit(model.generate) - jit_generation_outputs = jit_generate(input_ids).sequences - - self.assertListEqual(generation_outputs.tolist(), jit_generation_outputs.tolist()) - - def test_beam_search_generate_logits_warper(self): - config, input_ids, _, max_length = self._get_input_ids_and_config() - config.max_length = max_length - config.num_beams = 2 - config.min_length = 1 - config.forced_bos_token_id = 8 - config.forced_eos_token_id = 9 - - for 
model_class in self.all_generative_model_classes: - model = model_class(config) - - generation_outputs = model.generate(input_ids).sequences - self.assertEqual(generation_outputs.shape[-1], max_length) - - jit_generate = jit(model.generate) - jit_generation_outputs = jit_generate(input_ids).sequences - - self.assertListEqual(generation_outputs.tolist(), jit_generation_outputs.tolist()) - - def test_greedy_generate_attn_mask(self): - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - - # pad attention mask on the left - attention_mask = attention_mask.at[(0, 0)].set(0) - - config.do_sample = False - config.max_length = max_length - - for model_class in self.all_generative_model_classes: - model = model_class(config) - - generation_outputs = model.generate(input_ids, attention_mask=attention_mask).sequences - self.assertEqual(generation_outputs.shape[-1], max_length) - - jit_generate = jit(model.generate) - jit_generation_outputs = jit_generate(input_ids, attention_mask=attention_mask).sequences - - self.assertListEqual(generation_outputs.tolist(), jit_generation_outputs.tolist()) - - def test_sample_generate_attn_mask(self): - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - - # pad attention mask on the left - attention_mask = attention_mask.at[(0, 0)].set(0) - - config.do_sample = True - config.max_length = max_length - - for model_class in self.all_generative_model_classes: - model = model_class(config) - - generation_outputs = model.generate(input_ids, attention_mask=attention_mask).sequences - self.assertEqual(generation_outputs.shape[-1], max_length) - - jit_generate = jit(model.generate) - jit_generation_outputs = jit_generate(input_ids, attention_mask=attention_mask).sequences - - self.assertListEqual(generation_outputs.tolist(), jit_generation_outputs.tolist()) - - def test_beam_search_generate_attn_mask(self): - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - - # pad attention mask on the left - attention_mask = attention_mask.at[(0, 0)].set(0) - - config.num_beams = 2 - config.max_length = max_length - - for model_class in self.all_generative_model_classes: - model = model_class(config) - - generation_outputs = model.generate(input_ids, attention_mask=attention_mask).sequences - self.assertEqual(generation_outputs.shape[-1], max_length) - - jit_generate = jit(model.generate) - jit_generation_outputs = jit_generate(input_ids, attention_mask=attention_mask).sequences - - self.assertListEqual(generation_outputs.tolist(), jit_generation_outputs.tolist()) - - -@require_flax -class FlaxGenerationIntegrationTests(unittest.TestCase): - def test_validate_generation_inputs(self): - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-bert") - model = FlaxAutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-bert-flax-only") - - encoder_input_str = "Hello world" - input_ids = tokenizer(encoder_input_str, return_tensors="np").input_ids - - # typos are quickly detected (the correct argument is `do_sample`) - with self.assertRaisesRegex(ValueError, "do_samples"): - model.generate(input_ids, do_samples=True) - - # arbitrary arguments that will not be used anywhere are also not accepted - with self.assertRaisesRegex(ValueError, "foo"): - fake_model_kwargs = {"foo": "bar"} - model.generate(input_ids, **fake_model_kwargs) diff --git a/tests/generation/test_framework_agnostic.py b/tests/generation/test_framework_agnostic.py deleted file mode 100644 index 
634824c2b38e..000000000000 --- a/tests/generation/test_framework_agnostic.py +++ /dev/null @@ -1,688 +0,0 @@ -""" -Framework agnostic tests for generate()-related methods. -""" - -import numpy as np - -from transformers import AutoTokenizer -from transformers.testing_utils import slow, torch_device - - -class GenerationIntegrationTestsMixin: - # To be populated by the child classes - framework_dependent_parameters = { - "AutoModelForCausalLM": None, - "AutoModelForSpeechSeq2Seq": None, - "AutoModelForSeq2SeqLM": None, - "AutoModelForVision2Seq": None, - "LogitsProcessorList": None, - "MinLengthLogitsProcessor": None, - "create_tensor_fn": None, - "floats_tensor": None, - "return_tensors": None, - "set_seed": None, - } - - def test_validate_generation_inputs(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - create_tensor_fn = self.framework_dependent_parameters["create_tensor_fn"] - - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-t5") - - encoder_input_str = "Hello world" - input_ids = tokenizer(encoder_input_str, return_tensors=return_tensors).input_ids - - # typos are quickly detected (the correct argument is `do_sample`) - with self.assertRaisesRegex(ValueError, "do_samples"): - model.generate(input_ids, do_samples=True) - - # arbitrary arguments that will not be used anywhere are also not accepted - with self.assertRaisesRegex(ValueError, "foo"): - fake_model_kwargs = {"foo": "bar"} - model.generate(input_ids, **fake_model_kwargs) - - # however, valid model_kwargs are accepted - valid_model_kwargs = {"attention_mask": create_tensor_fn(np.zeros_like(input_ids))} - model.generate(input_ids, **valid_model_kwargs) - - def test_custom_logits_processor(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - logits_processor_list_cls = self.framework_dependent_parameters["LogitsProcessorList"] - min_length_logits_processor_cls = self.framework_dependent_parameters["MinLengthLogitsProcessor"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - bart_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - bart_model = model_cls.from_pretrained("hf-internal-testing/tiny-random-bart", min_length=1) - input_ids = bart_tokenizer(article, return_tensors=return_tensors).input_ids - - logits_processor = logits_processor_list_cls() - logits_processor.append(min_length_logits_processor_cls(min_length=10, eos_token_id=0)) - # it should not be allowed to both define `min_length` via config and `logits_processor` list - with self.assertRaises(ValueError): - bart_model.generate(input_ids, logits_processor=logits_processor) - - bart_model.config.min_length = None - bart_model.generate(input_ids, logits_processor=logits_processor) - - def test_max_new_tokens_encoder_decoder(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - bart_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - - bart_model = model_cls.from_pretrained("hf-internal-testing/tiny-random-bart") - input_ids = 
bart_tokenizer(article, return_tensors=return_tensors).input_ids - if is_pt: - bart_model = bart_model.to(torch_device) - input_ids = input_ids.to(torch_device) - - self.assertEqual(list(input_ids.shape), [1, 29]) - - max_new_tokens = 3 - bart_model.config.max_length = 20 - bart_model.config.eos_token_id = None - - # Encoder decoder call - outputs = bart_model.generate(input_ids, max_new_tokens=max_new_tokens) - # 1 BOS + 3 new tokens - self.assertEqual(list(outputs.shape), [1, 4]) - - # Decoder only call - outputs = bart_model.generate(decoder_input_ids=input_ids, max_new_tokens=max_new_tokens) - # 1 BOS + 29 (input length) + 3 new tokens - self.assertEqual(list(outputs.shape), [1, 33]) - - # Encoder decoder call > 20 - outputs = bart_model.generate(max_new_tokens=max_new_tokens + 20) - - # 1 BOS + 20 + 3 new tokens - self.assertEqual(list(outputs.shape), [1, 24]) - - def test_max_new_tokens_decoder_only(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - article = """Justin Timberlake.""" - gpt2_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - - gpt2_model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") - input_ids = gpt2_tokenizer(article, return_tensors=return_tensors).input_ids - if is_pt: - gpt2_model = gpt2_model.to(torch_device) - input_ids = input_ids.to(torch_device) - - self.assertEqual(list(input_ids.shape), [1, 9]) - - max_new_tokens = 3 - gpt2_model.config.max_length = 20 - - # call < 20 - outputs = gpt2_model.generate(input_ids, max_new_tokens=max_new_tokens) - - # 9 input_ids + 3 new tokens - self.assertEqual(list(outputs.shape), [1, 12]) - - # call > 20 - outputs = gpt2_model.generate(max_new_tokens=max_new_tokens + 20) - - # 1 BOS token + 23 new tokens - self.assertEqual(list(outputs.shape), [1, 24]) - - def test_encoder_decoder_generate_with_inputs_embeds(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-bart", max_length=5) - model.config.eos_token_id = None - input_ids = tokenizer(article, return_tensors=return_tensors).input_ids - - inputs_embeds = model.get_input_embeddings()(input_ids) - - output_sequences = model.generate(inputs_embeds=inputs_embeds) - - # make sure model generated correctly until `max_length` - self.assertEqual(output_sequences.shape, (1, 5)) - - def test_transition_scores_greedy_search(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - articles = ["Justin Timberlake", "Michael Phelps"] - tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2", padding_side="left") - tokenizer.pad_token = tokenizer.eos_token - - model = model_cls.from_pretrained("distilbert/distilgpt2") - model.generation_config.eos_token_id = None - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - outputs = model.generate( - input_ids=input_ids, - 
max_new_tokens=5, - pad_token_id=tokenizer.eos_token_id, - return_dict_in_generate=True, - output_scores=True, - ) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - - expected_scores = np.array( - [ - [-57.8844, -60.45698, -70.16364, -65.50791, -66.35648], - [-54.417572, -60.216614, -62.661243, -58.621933, -58.298683], - ] - ) - self.assertTrue(np.allclose(transition_scores, expected_scores, atol=1e-3)) - - def test_transition_scores_greedy_search_normalized(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - articles = ["Justin Timberlake", "Michael Phelps"] - tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2", padding_side="left") - tokenizer.pad_token = tokenizer.eos_token - - model = model_cls.from_pretrained("distilbert/distilgpt2") - model.generation_config.eos_token_id = None - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - outputs = model.generate( - input_ids=input_ids, - max_new_tokens=5, - pad_token_id=tokenizer.eos_token_id, - return_dict_in_generate=True, - output_scores=True, - ) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - - expected_scores = np.array( - [ - [-2.538938, -2.2694316, -2.1580915, -1.572299, -2.6719835], - [-1.8826028, -2.2461371, -1.7556462, -2.9644494, -1.7996008], - ] - ) - self.assertTrue(np.allclose(transition_scores, expected_scores, atol=1e-3)) - - def test_transition_scores_beam_search_encoder_decoder(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - articles = [ - "Justin Timberlake and Jessica Biel, welcome to parenthood.", - "Michael Phelps is arguably the most decorated Olympian of all time.", - ] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - - model = model_cls.from_pretrained( - "hf-internal-testing/tiny-random-bart", - max_length=10, - num_beams=4, - num_return_sequences=2, - eos_token_id=None, - return_dict_in_generate=True, - output_scores=True, - length_penalty=0.0, - ) - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - outputs = model.generate(input_ids=input_ids) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() - - self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores, atol=1e-3)) - - def test_transition_scores_beam_search_encoder_decoder_with_eos(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - articles = [ - "Justin Timberlake and Jessica Biel, welcome to parenthood.", - "Michael Phelps is arguably 
the most decorated Olympian of all time.", - ] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - - model = model_cls.from_pretrained( - "hf-internal-testing/tiny-random-bart", - max_length=10, - num_beams=4, - num_return_sequences=2, - return_dict_in_generate=True, - output_scores=True, - length_penalty=0.0, - ) - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - outputs = model.generate(input_ids=input_ids) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() - - self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores, atol=1e-3)) - - def test_transition_scores_beam_search_decoder_only(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - articles = [ - "Justin Timberlake", - "Michael Phelps", - ] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - tokenizer.pad_token = tokenizer.eos_token - - model = model_cls.from_pretrained( - "hf-internal-testing/tiny-random-gpt2", - max_length=10, - num_beams=4, - num_return_sequences=2, - pad_token_id=tokenizer.eos_token_id, - eos_token_id=None, - return_dict_in_generate=True, - output_scores=True, - length_penalty=0.0, - ) - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - outputs = model.generate(input_ids=input_ids) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() - - self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores, atol=1e-3)) - - def test_transition_scores_beam_sample_encoder_decoder(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - articles = [ - "Justin Timberlake and Jessica Biel, welcome to parenthood.", - "Michael Phelps is arguably the most decorated Olympian of all time.", - ] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - - model = model_cls.from_pretrained( - "hf-internal-testing/tiny-random-bart", - do_sample=True, - max_length=10, - num_beams=4, - num_return_sequences=2, - eos_token_id=None, - return_dict_in_generate=True, - output_scores=True, - length_penalty=0.0, - ) - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - outputs = model.generate(input_ids=input_ids) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() - - self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), 
outputs.sequences_scores, atol=1e-3)) - - @slow - def test_transition_scores_early_stopping(self): - # This is an aggressive test that makes sure that `beam_search's` - # transition scores are computed correctly for varying `num_return_sequences`, `num_beams` and `batch_size > 1` - # 2 x input_ids for "question: How are you? \n context: I had a long day, " - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - create_tensor_fn = self.framework_dependent_parameters["create_tensor_fn"] - is_pt = not model_cls.__name__.startswith("TF") - - input_ids = create_tensor_fn(2 * [[822, 10, 571, 33, 25, 58, 2625, 10, 27, 141, 3, 9, 307, 239, 6, 1]]) - model = model_cls.from_pretrained("google-t5/t5-small") - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - outputs = model.generate( - input_ids, - max_length=10, - return_dict_in_generate=True, - output_scores=True, - forced_eos_token_id=model.config.eos_token_id, - num_beams=4, - do_sample=False, - num_return_sequences=3, - length_penalty=0.0, - ) - - transition_scores = model.compute_transition_scores( - sequences=outputs.sequences, scores=outputs.scores, beam_indices=outputs.beam_indices - ) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() - - self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores)) - - def test_encoder_decoder_generate_attention_mask(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - articles = ["Timberlake", "Jessica Biel, welcome to parenthood among other things"] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - # need extreme generation values here to force this test - # to fail when `attention_mask` is not correctly treated in generate - model = model_cls.from_pretrained( - "hf-internal-testing/tiny-random-bart", max_length=50, num_beams=5, num_return_sequences=5 - ) - model.config.eos_token_id = None - input_ids = tokenizer(articles[0], return_tensors=return_tensors).input_ids - input_ids_batched = tokenizer(articles, padding=True, return_tensors=return_tensors).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - input_ids_batched = input_ids_batched.to(torch_device) - - output_sequences_batched = model.generate( - input_ids=input_ids_batched, return_dict_in_generate=True, output_scores=True - ) - output_sequences = model.generate(input_ids=input_ids, return_dict_in_generate=True, output_scores=True) - - batched_out = output_sequences_batched.sequences_scores - out = output_sequences.sequences_scores - if is_pt: - batched_out = batched_out.cpu().numpy() - out = out.cpu().numpy() - - diff = np.abs(np.sum(batched_out[:5]) - np.sum(out)) - self.assertTrue(diff < 1e-4) - - def test_generate_input_ids_as_kwarg(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - article = """I need input_ids to generate""" - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2", max_length=15) - input_ids = tokenizer(article, return_tensors=return_tensors).input_ids - if is_pt: - 
model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - output_sequences_kwargs = model.generate(input_ids=input_ids) - output_sequences = model.generate(input_ids) - if is_pt: - output_sequences_kwargs = output_sequences_kwargs.cpu().numpy() - output_sequences = output_sequences.cpu().numpy() - - self.assertTrue(np.array_equal(output_sequences, output_sequences_kwargs)) - self.assertEqual(output_sequences.shape, (1, 15)) - - def test_generate_input_ids_as_encoder_kwarg(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-bart", max_length=5) - model.config.eos_token_id = None - input_ids = tokenizer(article, return_tensors=return_tensors).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - output_sequences_kwargs = model.generate(input_ids=input_ids) - output_sequences = model.generate(input_ids) - if is_pt: - output_sequences_kwargs = output_sequences_kwargs.cpu().numpy() - output_sequences = output_sequences.cpu().numpy() - - self.assertTrue(np.array_equal(output_sequences, output_sequences_kwargs)) - self.assertEqual(output_sequences.shape, (1, 5)) - - def test_generate_inputs_and_encoder_kwargs(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - article = """I need input_ids to generate""" - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2", max_length=10) - input_ids = tokenizer(article, return_tensors=return_tensors).input_ids - with self.assertRaises(ValueError): - model.generate(input_ids, input_ids=input_ids) - - def test_generate_too_many_encoder_kwargs(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - article = """I need input_ids to generate""" - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-bart", max_length=10) - input_ids = tokenizer(article, return_tensors=return_tensors).input_ids - with self.assertRaises(ValueError): - model.generate(input_ids=input_ids, inputs_embeds=input_ids) - - def test_generate_input_features_as_encoder_kwarg(self): - model_cls = self.framework_dependent_parameters["AutoModelForSpeechSeq2Seq"] - floats_tensor = self.framework_dependent_parameters["floats_tensor"] - is_pt = not model_cls.__name__.startswith("TF") - - input_features = floats_tensor((3, 80, 60)) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-WhisperForConditionalGeneration") - if is_pt: - input_features.to(torch_device) - model = model.to(torch_device) - - output_sequences_kwargs = model.generate(input_features=input_features, max_length=5) - output_sequences = model.generate(input_features, max_length=5) - if is_pt: - output_sequences_kwargs = output_sequences_kwargs.cpu().numpy() - output_sequences = output_sequences.cpu().numpy() - - self.assertTrue(np.array_equal(output_sequences, 
output_sequences_kwargs)) - self.assertEqual(output_sequences.shape, (3, 5)) - - def test_generate_pixel_values_as_encoder_kwarg(self): - model_cls = self.framework_dependent_parameters["AutoModelForVision2Seq"] - floats_tensor = self.framework_dependent_parameters["floats_tensor"] - is_pt = not model_cls.__name__.startswith("TF") - - pixel_values = floats_tensor((2, 3, 30, 30)) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2") - model.generation_config.eos_token_id = None - if is_pt: - pixel_values = pixel_values.to(torch_device) - model = model.to(torch_device) - - output_sequences_kwargs = model.generate(pixel_values=pixel_values, max_length=5) - output_sequences = model.generate(pixel_values, max_length=5) - if is_pt: - output_sequences_kwargs = output_sequences_kwargs.cpu().numpy() - output_sequences = output_sequences.cpu().numpy() - - self.assertTrue(np.array_equal(output_sequences, output_sequences_kwargs)) - self.assertEqual(output_sequences.shape, (2, 5)) - - def test_generate_encoder_outputs_attention_mask(self): - model_cls = self.framework_dependent_parameters["AutoModelForSpeechSeq2Seq"] - floats_tensor = self.framework_dependent_parameters["floats_tensor"] - create_tensor_fn = self.framework_dependent_parameters["create_tensor_fn"] - is_pt = not model_cls.__name__.startswith("TF") - - input_features = floats_tensor((3, 80, 60)) - attention_mask = create_tensor_fn(np.ones(input_features.shape)) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-WhisperForConditionalGeneration") - if is_pt: - input_features = input_features.to(torch_device) - attention_mask = attention_mask.to(torch_device) - model = model.to(torch_device) - - encoder = model.get_encoder() - encoder_outputs = encoder(input_features) - - output_sequences_no_mask = model.generate(encoder_outputs=encoder_outputs) - output_sequences_with_mask = model.generate(encoder_outputs=encoder_outputs, attention_mask=attention_mask) - if is_pt: - output_sequences_no_mask = output_sequences_no_mask.cpu().numpy() - output_sequences_with_mask = output_sequences_with_mask.cpu().numpy() - - self.assertTrue(np.array_equal(output_sequences_no_mask, output_sequences_with_mask)) - - def test_eos_token_id_int_and_list_greedy_search(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - generation_kwargs = { - "do_sample": False, - "num_beams": 1, - } - expectation = 13 - - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - text = """Hello, my dog is cute and""" - tokens = tokenizer(text, return_tensors=return_tensors) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") - if is_pt: - model = model.to(torch_device) - tokens = tokens.to(torch_device) - - eos_token_id = 873 - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - eos_token_id = [873, 198] - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - def test_eos_token_id_int_and_list_contrastive_search(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") 
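As the greedy-search variant above exercises, `generate()` accepts `eos_token_id` either as a single id or as a list of ids and stops as soon as any of them is produced. A minimal PyTorch sketch using the same tiny checkpoint and prompt as the test (the token ids 873 and 198 are the illustrative values from the test, not meaningful vocabulary entries):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")

tokens = tokenizer("Hello, my dog is cute and", return_tensors="pt")

# A single end-of-sequence id ...
out_single = model.generate(**tokens, do_sample=False, num_beams=1, eos_token_id=873)
# ... or a list of ids; generation stops on whichever id appears first.
out_list = model.generate(**tokens, do_sample=False, num_beams=1, eos_token_id=[873, 198])
```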
- - generation_kwargs = { - "do_sample": False, - "num_beams": 1, - "penalty_alpha": 0.6, - "top_k": 4, - } - expectation = 17 - - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - text = """Hello, my dog is cute and""" - tokens = tokenizer(text, return_tensors=return_tensors) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") - if is_pt: - model = model.to(torch_device) - tokens = tokens.to(torch_device) - - eos_token_id = 225 - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - eos_token_id = [225, 198] - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - def test_eos_token_id_int_and_list_beam_search(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - generation_kwargs = { - "do_sample": False, - "num_beams": 3, - } - expectation = 13 - - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - text = """Hello, my dog is cute and""" - tokens = tokenizer(text, return_tensors=return_tensors) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") - if is_pt: - model = model.to(torch_device) - tokens = tokens.to(torch_device) - - eos_token_id = 873 - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - unpadded_correct_condition = expectation == len(generated_tokens[0]) - padded_correct_condition = expectation < len(generated_tokens[0]) and all( - token == model.config.pad_token_id for token in generated_tokens[0][expectation:] - ) - self.assertTrue(unpadded_correct_condition or padded_correct_condition) - - eos_token_id = [873, 198] - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - unpadded_correct_condition = expectation == len(generated_tokens[0]) - padded_correct_condition = expectation < len(generated_tokens[0]) and all( - token == model.config.pad_token_id for token in generated_tokens[0][expectation:] - ) - self.assertTrue(unpadded_correct_condition or padded_correct_condition) - - def test_generate_vision2text_conditioning(self): - model_cls = self.framework_dependent_parameters["AutoModelForVision2Seq"] - floats_tensor = self.framework_dependent_parameters["floats_tensor"] - create_tensor_fn = self.framework_dependent_parameters["create_tensor_fn"] - is_pt = not model_cls.__name__.startswith("TF") - - pixel_values = floats_tensor((2, 3, 30, 30)) - conditioning_input = create_tensor_fn([[10], [10]]) # this should be the 2nd output token, after the BOS token - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2") - if is_pt: - pixel_values = pixel_values.to(torch_device) - model = model.to(torch_device) - conditioning_input = conditioning_input.to(torch_device) - - # we can condition on decoder_input_ids (expected decoder input) and input_ids (which we pipe internally as - # decoder_input_ids, if the encoder is not a model with text input) - output_sequences_decoder_input_ids = model.generate( - pixel_values, max_length=5, decoder_input_ids=conditioning_input - ) - output_sequences_input_ids = model.generate(pixel_values, max_length=5, input_ids=conditioning_input) - if is_pt: - 
output_sequences_decoder_input_ids = output_sequences_decoder_input_ids.cpu().numpy() - output_sequences_input_ids = output_sequences_input_ids.cpu().numpy() - conditioning_input = conditioning_input.cpu().numpy() - - self.assertTrue(np.array_equal(output_sequences_decoder_input_ids, output_sequences_input_ids)) - self.assertTrue(np.array_equal(output_sequences_decoder_input_ids[:, 1:2], conditioning_input)) diff --git a/tests/generation/test_tf_logits_process.py b/tests/generation/test_tf_logits_process.py deleted file mode 100644 index f06f5695b1ce..000000000000 --- a/tests/generation/test_tf_logits_process.py +++ /dev/null @@ -1,487 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a clone of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from __future__ import annotations - -import unittest - -import numpy as np -from parameterized import parameterized - -from transformers import is_tf_available -from transformers.testing_utils import require_tf - - -if is_tf_available(): - import tensorflow as tf - - from transformers.generation import ( - TFForcedBOSTokenLogitsProcessor, - TFForcedEOSTokenLogitsProcessor, - TFForceTokensLogitsProcessor, - TFLogitsProcessorList, - TFMinLengthLogitsProcessor, - TFNoBadWordsLogitsProcessor, - TFNoRepeatNGramLogitsProcessor, - TFRepetitionPenaltyLogitsProcessor, - TFSuppressTokensAtBeginLogitsProcessor, - TFSuppressTokensLogitsProcessor, - TFTemperatureLogitsWarper, - TFTopKLogitsWarper, - TFTopPLogitsWarper, - ) - - from ..test_modeling_tf_common import ids_tensor - - -@require_tf -class TFLogitsProcessorTest(unittest.TestCase): - def _get_uniform_logits(self, batch_size: int, length: int): - scores = tf.ones((batch_size, length), dtype=tf.float32) / length - return scores - - @parameterized.expand([(False,), (True,)]) - def test_min_length_dist_processor(self, use_xla): - vocab_size = 20 - batch_size = 4 - eos_token_id = 0 - - min_dist_processor = TFMinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id) - if use_xla: - min_dist_processor = tf.function(min_dist_processor, jit_compile=True) - - # check that min length is applied at length 5 - cur_len = 5 - input_ids = ids_tensor((batch_size, cur_len), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_before_min_length = min_dist_processor(input_ids, scores, cur_len) - self.assertListEqual(scores_before_min_length[:, eos_token_id].numpy().tolist(), 4 * [-float("inf")]) - - # check that min length is not applied anymore at length 15 - cur_len = 15 - input_ids = ids_tensor((batch_size, cur_len), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_before_min_length = min_dist_processor(input_ids, scores, cur_len) - self.assertFalse(tf.math.reduce_any(tf.math.is_inf(scores_before_min_length)).numpy()) - - @parameterized.expand([(False,), (True,)]) - def test_temperature_dist_warper(self, use_xla): - input_ids = None - cur_len = None - length = 20 - - scores = self._get_uniform_logits(batch_size=2, length=length) - - # 
tweak scores to not be uniform anymore - scores = scores.numpy() - scores[1, 5] = (1 / length) + 0.1 # peak, 1st batch - scores[1, 10] = (1 / length) - 0.4 # valley, 1st batch - scores = tf.convert_to_tensor(scores) - - # compute softmax - probs = tf.nn.softmax(scores, axis=-1) - - temp_dist_warper_sharper = TFTemperatureLogitsWarper(temperature=0.5) - temp_dist_warper_smoother = TFTemperatureLogitsWarper(temperature=1.3) - if use_xla: - temp_dist_warper_sharper = tf.function(temp_dist_warper_sharper, jit_compile=True) - temp_dist_warper_smoother = tf.function(temp_dist_warper_smoother, jit_compile=True) - - warped_prob_sharp = tf.nn.softmax(temp_dist_warper_sharper(input_ids, tf.identity(scores), cur_len), axis=-1) - warped_prob_smooth = tf.nn.softmax(temp_dist_warper_smoother(input_ids, tf.identity(scores), cur_len), axis=-1) - - # uniform distribution stays uniform - tf.debugging.assert_near(probs[0, :], warped_prob_sharp[0, :], atol=1e-3) - tf.debugging.assert_near(probs[0, :], warped_prob_smooth[0, :], atol=1e-3) - - # sharp peaks get higher, valleys get lower - self.assertLess(tf.math.reduce_max(probs[1, :]), tf.math.reduce_max(warped_prob_sharp[1, :])) - self.assertGreater(tf.math.reduce_min(probs[1, :]), tf.math.reduce_min(warped_prob_sharp[1, :])) - - # smooth peaks get lower, valleys get higher - self.assertGreater(tf.math.reduce_max(probs[1, :]), tf.math.reduce_max(warped_prob_smooth[1, :])) - self.assertLess(tf.math.reduce_min(probs[1, :]), tf.math.reduce_min(warped_prob_smooth[1, :])) - - @parameterized.expand([(False,), (True,)]) - def test_repetition_penalty_dist_process(self, use_xla): - vocab_size = 10 - cur_len = 2 - - input_ids = tf.constant([[0, 1], [5, 0]], dtype=tf.int32) - self.assertEqual(cur_len, input_ids.shape[1]) - - scores = self._get_uniform_logits(batch_size=2, length=vocab_size) - - mask = tf.cast(tf.constant([[1] + 9 * [0], 10 * [0]]), tf.bool) - scores = tf.where(mask, -1 / vocab_size, scores) - mask = tf.cast(tf.constant([10 * [0], 5 * [0] + [1] + 4 * [0]]), tf.bool) - scores = tf.where(mask, 4 / vocab_size, scores) - rep_penalty_proc = TFRepetitionPenaltyLogitsProcessor(penalty=2.0) - if use_xla: - rep_penalty_proc = tf.function(rep_penalty_proc, jit_compile=True) - - scores = rep_penalty_proc(input_ids, tf.identity(scores), cur_len) - - # check that values were correctly changed (negative scores for used tokens should increase, others - # should decrease) - self.assertAlmostEqual(scores[0, 0].numpy(), -(1 / vocab_size) * 2) - self.assertAlmostEqual(scores[0, 1].numpy(), (1 / vocab_size) / 2) - self.assertAlmostEqual(scores[0, 2].numpy(), (1 / vocab_size)) # unused tokens should see no change - - self.assertAlmostEqual(scores[1, 0].numpy(), (1 / vocab_size) / 2) - self.assertAlmostEqual(scores[1, 5].numpy(), (4 / vocab_size) / 2) - self.assertAlmostEqual(scores[0, 2].numpy(), (1 / vocab_size)) # unused tokens should see no change - - @parameterized.expand([(False,), (True,)]) - def test_top_k_dist_warper(self, use_xla): - input_ids = None - cur_len = None - vocab_size = 10 - batch_size = 2 - - # create ramp distribution - ramp_logits = np.broadcast_to(np.arange(vocab_size, dtype=np.float32), (batch_size, vocab_size)).copy() - ramp_logits[1:, : vocab_size // 2] = ramp_logits[1:, : vocab_size // 2] + vocab_size - - top_k_warp = TFTopKLogitsWarper(3) - if use_xla: - top_k_warp = tf.function(top_k_warp, jit_compile=True) - - scores = top_k_warp(input_ids, ramp_logits, cur_len) - - # check that correct tokens are filtered - 
self.assertListEqual(tf.math.is_inf(scores[0]).numpy().tolist(), 7 * [True] + 3 * [False]) - self.assertListEqual(tf.math.is_inf(scores[1]).numpy().tolist(), 2 * [True] + 3 * [False] + 5 * [True]) - - # check special cases - length = 5 - - logits = self._get_uniform_logits(batch_size=batch_size, length=length) - top_k_warp_safety_check = TFTopKLogitsWarper(top_k=1, filter_value=0.0, min_tokens_to_keep=3) - if use_xla: - top_k_warp_safety_check = tf.function(top_k_warp_safety_check, jit_compile=True) - - scores = top_k_warp_safety_check(input_ids, logits, cur_len) - # uniform dist is not changed - self.assertListEqual(tf.math.reduce_sum(tf.where(scores == 0.0, 1, 0), axis=-1).numpy().tolist(), [0, 0]) - - ramp_logits = np.broadcast_to(np.arange(length, dtype=np.float32), (batch_size, length)).copy() - scores = top_k_warp_safety_check(input_ids, ramp_logits, cur_len) - - # min_tokens overwrites k: 3 tokens are kept => 2 tokens are nullified - self.assertListEqual(tf.math.reduce_sum(tf.where(scores == 0.0, 1, 0), axis=-1).numpy().tolist(), [2, 2]) - - @parameterized.expand([(False,), (True,)]) - def test_top_p_dist_warper(self, use_xla): - input_ids = None - cur_len = None - vocab_size = 10 - batch_size = 2 - - # create distribution and take log (inverse to Softmax as taken in TFTopPLogitsWarper) - dist = np.log(np.array([[0.3, 0.1, 0.1, 0.5], [0.15, 0.3, 0.3, 0.25]], dtype=np.float32)) - - # top_p should have been 0.8 to test the edge case of top_p being exactly equal to sum of some token prob - # However, due to the numerical instability of softmax in TF we choose this as the edge case - # top_p as 0.8 passes when use_xla is True and fails when False. Refer PR #18984. - top_p_warp = TFTopPLogitsWarper(0.79999995) - if use_xla: - top_p_warp = tf.function(top_p_warp, jit_compile=True) - filtered_dist = tf.exp(top_p_warp(input_ids, dist, cur_len)) - - # dist should be filtered to keep min num values so that sum is >= top_p - # exp (-inf) => 0 - EXPECTED_FILTERED_DIST = tf.constant([[0.3, 0.0, 0.0, 0.5], [0.0, 0.3, 0.3, 0.25]], dtype=tf.float32) - tf.debugging.assert_near(filtered_dist, EXPECTED_FILTERED_DIST, atol=1e-3) - - # check edge cases with negative and extreme logits - ramp_logits = np.broadcast_to( - np.arange(vocab_size, dtype=np.float32)[None, :], (batch_size, vocab_size) - ).copy() - (vocab_size // 2) - - # make ramp_logits more extreme - ramp_logits[1] = ramp_logits[1] * 100.0 - - # make sure at least 2 tokens are kept - top_p_warp = TFTopPLogitsWarper(0.9, min_tokens_to_keep=2, filter_value=0.0) - if use_xla: - top_p_warp = tf.function(top_p_warp, jit_compile=True) - filtered_dist = top_p_warp(input_ids, ramp_logits, cur_len) - - # first batch should keep three tokens, second batch would keep only 1, but due to `min_tokens_to_keep=2` keeps - # 2. 
- self.assertListEqual( - tf.math.reduce_sum(tf.where(filtered_dist != 0.0, 1, 0), axis=-1).numpy().tolist(), [3, 2] - ) - - def test_no_repeat_ngram_dist_processor(self): - vocab_size = 3 - batch_size = 2 - cur_len = 4 - - input_ids = tf.constant([[1, 1, 2, 1], [0, 1, 0, 1]], dtype=tf.int32) - self.assertEqual(cur_len, input_ids.shape[1]) - - scores = self._get_uniform_logits(batch_size, vocab_size) - - no_repeat_proc_2_gram = TFNoRepeatNGramLogitsProcessor(2) - no_repeat_proc_3_gram = TFNoRepeatNGramLogitsProcessor(3) - - filtered_scores_2_gram = no_repeat_proc_2_gram(input_ids, tf.identity(scores), cur_len) - filtered_scores_3_gram = no_repeat_proc_3_gram(input_ids, tf.identity(scores), cur_len) - - # 2-gram would forbid 2nd and 3rd token (1,2) at 1st batch and 1st token (0) at 2nd batch - self.assertListEqual( - tf.math.is_inf(filtered_scores_2_gram).numpy().tolist(), [[False, True, True], [True, False, False]] - ) - - # 3-gram would forbid no token at 1st batch and 1st token (0) at 2nd batch - self.assertListEqual( - tf.math.is_inf(filtered_scores_3_gram).numpy().tolist(), [[False, False, False], [True, False, False]] - ) - - @parameterized.expand([(False,), (True,)]) - def test_no_bad_words_dist_processor(self, use_xla): - vocab_size = 5 - batch_size = 2 - eos_token_id = 4 - cur_len = 4 - - input_ids = tf.constant([[0, 1, 3, 1], [0, 1, 0, 1]], dtype=tf.int32) - self.assertEqual(cur_len, input_ids.shape[1]) - - bad_word_tokens = [[1], [4], [1, 0], [0, 1, 2], [1, 3, 1, 3]] - scores = self._get_uniform_logits(batch_size, vocab_size) - - no_bad_words_dist_proc = TFNoBadWordsLogitsProcessor(bad_words_ids=bad_word_tokens, eos_token_id=eos_token_id) - if use_xla: - no_bad_words_dist_proc = tf.function(no_bad_words_dist_proc, jit_compile=True) - - filtered_scores = no_bad_words_dist_proc(input_ids, tf.identity(scores), cur_len) - - # batch 1: 1st, 2nd, and 4th (0, 1, 3) token are forbidden - # batch 2: 1st, 2nd, and 3rd (0, 1, 2) token are forbidden - self.assertListEqual( - tf.math.is_inf(filtered_scores).numpy().tolist(), - [[True, True, False, True, True], [True, True, True, False, True]], - ) - - @parameterized.expand([(False,), (True,)]) - def test_forced_bos_token_logits_processor(self, use_xla): - vocab_size = 20 - batch_size = 4 - bos_token_id = 0 - - logits_processor = TFForcedBOSTokenLogitsProcessor(bos_token_id=bos_token_id) - if use_xla: - logits_processor = tf.function(logits_processor, jit_compile=True) - - # check that all scores are -inf except the bos_token_id score - cur_len = 1 - input_ids = ids_tensor((batch_size, cur_len), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores = logits_processor(input_ids, scores, cur_len) - self.assertTrue( - tf.math.reduce_all(tf.math.is_inf(scores[:, bos_token_id + 1 :]) & (scores[:, bos_token_id + 1 :] < 0)) - ) - self.assertListEqual(scores[:, bos_token_id].numpy().tolist(), 4 * [0]) # score for bos_token_id shold be zero - - # check that bos_token_id is not forced if current length is greater than 1 - cur_len = 4 - input_ids = ids_tensor((batch_size, cur_len), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores = logits_processor(input_ids, scores, cur_len) - self.assertFalse(tf.math.reduce_any(tf.math.is_inf((scores)))) - - @parameterized.expand([(False,), (True,)]) - def test_forced_eos_token_logits_processor(self, use_xla): - vocab_size = 20 - batch_size = 4 - eos_token_id = 0 - max_length = 5 - - logits_processor = 
TFForcedEOSTokenLogitsProcessor(max_length=max_length, eos_token_id=eos_token_id) - if use_xla: - logits_processor = tf.function(logits_processor, jit_compile=True) - - # check that all scores are -inf except the eos_token_id when max_length-1 is reached - cur_len = 4 - input_ids = ids_tensor((batch_size, cur_len), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores = logits_processor(input_ids, scores, cur_len) - self.assertTrue( - tf.math.reduce_all(tf.math.is_inf(scores[:, eos_token_id + 1 :]) & (scores[:, eos_token_id + 1 :] < 0)) - ) - self.assertListEqual( - scores[:, eos_token_id].numpy().tolist(), 4 * [0] - ) # score for eos_token_id should be zero - - # check that eos_token_id is not forced if max_length-1 is not reached - cur_len = 3 - input_ids = ids_tensor((batch_size, cur_len), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores = logits_processor(input_ids, scores, cur_len) - self.assertFalse(tf.math.reduce_any(tf.math.is_inf((scores)))) - - @parameterized.expand([(False,), (True,)]) - def test_suppress_tokens_at_begin_logits_processor(self, use_xla): - vocab_size = 20 - batch_size = 4 - - begin_suppress_tokens = [1, 2, 3] - begin_index = 5 - - logits_processor = TFSuppressTokensAtBeginLogitsProcessor( - begin_suppress_tokens=begin_suppress_tokens, begin_index=begin_index - ) - if use_xla: - logits_processor = tf.function(logits_processor, jit_compile=True) - - # Check that no scores are suppressed if begin_index is not reached - cur_len = 4 - input_ids = tf.convert_to_tensor([[11, 17, 15, 8], [14, 0, 19, 5], [13, 11, 18, 19], [11, 12, 16, 15]]) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores = logits_processor(input_ids, scores, cur_len) - self.assertFalse(tf.math.reduce_any(tf.math.is_inf((scores)))) - - # Check that scores are suppressed if begin_index is reached - cur_len = 5 - input_ids = tf.convert_to_tensor([[5, 5, 5, 0, 17], [18, 1, 9, 14, 17], [18, 6, 8, 15, 19], [8, 12, 17, 1, 2]]) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores = logits_processor(input_ids, scores, cur_len) - self.assertTrue(tf.math.reduce_all(tf.math.is_inf(tf.gather(scores, begin_suppress_tokens, axis=1)))) - - @parameterized.expand([(False,), (True,)]) - def test_suppress_tokens_logits_processor(self, use_xla): - vocab_size = 20 - batch_size = 4 - - suppress_tokens = [1, 3, 5] - keep_tokens = [i for i in range(vocab_size) if i not in suppress_tokens] - - logits_processor = TFSuppressTokensLogitsProcessor(suppress_tokens=suppress_tokens) - if use_xla: - logits_processor = tf.function(logits_processor, jit_compile=True) - - # Check that suppress_tokens are suppressed and others are not - cur_len = 5 - input_ids = tf.convert_to_tensor([[0, 10, 19, 6, 3], [17, 4, 8, 17, 2], [7, 1, 11, 6, 15], [5, 8, 13, 16, 0]]) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores = logits_processor(input_ids, scores, cur_len) - self.assertTrue(tf.math.reduce_all(tf.math.is_inf(tf.gather(scores, suppress_tokens, axis=1)))) - self.assertFalse(tf.math.reduce_any(tf.math.is_inf(tf.gather(scores, keep_tokens, axis=1)))) - - @parameterized.expand([(False,), (True,)]) - def test_force_tokens_logits_processor(self, use_xla): - vocab_size = 20 - batch_size = 4 - - force_token_map = {1: 2, 3: 2} - - logits_processor = TFForceTokensLogitsProcessor(force_token_map=force_token_map) - if use_xla: - logits_processor = tf.function(logits_processor, jit_compile=True) - - # check that if the cur_len is contained 
in the force_token_map, the logits are the same - # for all tokens except the one the force_token_map points to - cur_len = 1 - input_ids = tf.convert_to_tensor([[11], [7], [5], [15]]) - ids_tensor((batch_size, cur_len), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores = logits_processor(input_ids, scores, cur_len) - tf.debugging.assert_near(tf.gather(scores, [force_token_map[cur_len]], axis=1), 0.0) - - non_forced_inds = [i for i in range(vocab_size) if i != force_token_map[cur_len]] - self.assertTrue( - tf.math.reduce_all( - tf.experimental.numpy.isclose( - tf.gather(scores, [non_forced_inds], axis=1), - tf.constant(scores.dtype.min), - ) - ) - ) - - # check that if the cur_len is not contained in the force_token_map, the logits are not modified - cur_len = 2 - input_ids = tf.convert_to_tensor([[2, 19], [19, 15], [4, 9], [7, 6]]) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores = logits_processor(input_ids, scores, cur_len) - self.assertFalse(tf.math.reduce_any(tf.math.is_inf((scores)))) - - @parameterized.expand([(False,), (True,)]) - def test_processor_list(self, use_xla): - # TODO (Joao): reintroduce TFNoRepeatNGramLogitsProcessor when it gets compatible with XLA - batch_size = 4 - cur_len = 10 - vocab_size = 15 - eos_token_id = 0 - - # dummy input_ids and scores - input_ids = ids_tensor((batch_size, cur_len), vocab_size) - input_ids_comp = tf.identity(input_ids) - - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_comp = tf.identity(scores) - - # instantiate all dist processors - min_dist_proc = TFMinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id) - temp_dist_warp = TFTemperatureLogitsWarper(temperature=0.5) - rep_penalty_proc = TFRepetitionPenaltyLogitsProcessor(penalty=2.0) - top_k_warp = TFTopKLogitsWarper(3) - top_p_warp = TFTopPLogitsWarper(0.8) - # no_repeat_proc = TFNoRepeatNGramLogitsProcessor(2) - no_bad_words_dist_proc = TFNoBadWordsLogitsProcessor(bad_words_ids=[[1]], eos_token_id=eos_token_id) - if use_xla: - min_dist_proc = tf.function(min_dist_proc, jit_compile=True) - temp_dist_warp = tf.function(temp_dist_warp, jit_compile=True) - rep_penalty_proc = tf.function(rep_penalty_proc, jit_compile=True) - top_k_warp = tf.function(top_k_warp, jit_compile=True) - top_p_warp = tf.function(top_p_warp, jit_compile=True) - # no_repeat_proc = tf.function(no_repeat_proc, jit_compile=True) - no_bad_words_dist_proc = tf.function(no_bad_words_dist_proc, jit_compile=True) - - # no processor list - scores = min_dist_proc(input_ids, scores, cur_len) - scores = temp_dist_warp(input_ids, scores, cur_len) - scores = rep_penalty_proc(input_ids, scores, cur_len) - scores = top_k_warp(input_ids, scores, cur_len) - scores = top_p_warp(input_ids, scores, cur_len) - # scores = no_repeat_proc(input_ids, scores, cur_len) - scores = no_bad_words_dist_proc(input_ids, scores, cur_len) - - # with processor list - processor = TFLogitsProcessorList( - [ - min_dist_proc, - temp_dist_warp, - rep_penalty_proc, - top_k_warp, - top_p_warp, - # no_repeat_proc, - no_bad_words_dist_proc, - ] - ) - scores_comp = processor(input_ids, scores_comp, cur_len) - - # remove inf - scores = tf.where(tf.math.is_inf(scores), -1e9, scores) - scores_comp = tf.where(tf.math.is_inf(scores_comp), -1e9, scores_comp) - - # scores should be equal - tf.debugging.assert_near(scores, scores_comp, atol=1e-3) - - # input_ids should never be changed - self.assertListEqual(input_ids.numpy().tolist(), input_ids_comp.numpy().tolist()) diff --git 
a/tests/generation/test_tf_utils.py b/tests/generation/test_tf_utils.py deleted file mode 100644 index f40ceebef76f..000000000000 --- a/tests/generation/test_tf_utils.py +++ /dev/null @@ -1,245 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import os -import tempfile -import unittest - -import numpy as np -from huggingface_hub import hf_hub_download - -from transformers import is_tensorflow_text_available, is_tf_available -from transformers.testing_utils import require_tensorflow_text, require_tf, slow - -from ..test_modeling_tf_common import floats_tensor -from .test_framework_agnostic import GenerationIntegrationTestsMixin - - -if is_tf_available(): - import tensorflow as tf - - from transformers import ( - AutoTokenizer, - TFAutoModelForCausalLM, - TFAutoModelForSeq2SeqLM, - TFAutoModelForSpeechSeq2Seq, - TFAutoModelForVision2Seq, - TFBartForConditionalGeneration, - TFLogitsProcessorList, - TFMinLengthLogitsProcessor, - ) - from transformers.modeling_tf_utils import keras - -if is_tensorflow_text_available(): - import tensorflow_text as text - - -@require_tf -class TFGenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTestsMixin): - # setting framework_dependent_parameters needs to be gated, just like its contents' imports - if is_tf_available(): - framework_dependent_parameters = { - "AutoModelForCausalLM": TFAutoModelForCausalLM, - "AutoModelForSpeechSeq2Seq": TFAutoModelForSpeechSeq2Seq, - "AutoModelForSeq2SeqLM": TFAutoModelForSeq2SeqLM, - "AutoModelForVision2Seq": TFAutoModelForVision2Seq, - "LogitsProcessorList": TFLogitsProcessorList, - "MinLengthLogitsProcessor": TFMinLengthLogitsProcessor, - "create_tensor_fn": tf.convert_to_tensor, - "floats_tensor": floats_tensor, - "return_tensors": "tf", - } - - @slow - def test_generate_tf_function_export_fixed_input_length(self): - # TF-only test: tf.saved_model export - test_model = TFAutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") - input_length = 2 - max_new_tokens = 2 - - class DummyModel(tf.Module): - def __init__(self, model): - super(DummyModel, self).__init__() - self.model = model - - @tf.function( - input_signature=( - tf.TensorSpec((None, input_length), tf.int32, name="input_ids"), - tf.TensorSpec((None, input_length), tf.int32, name="attention_mask"), - ), - jit_compile=True, - ) - def serving(self, input_ids, attention_mask): - outputs = self.model.generate( - input_ids=input_ids, - attention_mask=attention_mask, - max_new_tokens=max_new_tokens, - return_dict_in_generate=True, - ) - return {"sequences": outputs["sequences"]} - - dummy_input_ids = [[2, 0], [102, 103]] - dummy_attention_masks = [[1, 0], [1, 1]] - dummy_model = DummyModel(model=test_model) - with tempfile.TemporaryDirectory() as tmp_dir: - tf.saved_model.save(dummy_model, tmp_dir, signatures={"serving_default": dummy_model.serving}) - serving_func = tf.saved_model.load(tmp_dir).signatures["serving_default"] - for
batch_size in range(1, len(dummy_input_ids) + 1): - inputs = { - "input_ids": tf.constant(dummy_input_ids[:batch_size]), - "attention_mask": tf.constant(dummy_attention_masks[:batch_size]), - } - tf_func_outputs = serving_func(**inputs)["sequences"] - tf_model_outputs = test_model.generate(**inputs, max_new_tokens=max_new_tokens) - tf.debugging.assert_equal(tf_func_outputs, tf_model_outputs) - - @slow - def test_generate_tf_function_export_fixed_batch_size(self): - # TF-only test: tf.saved_model export - test_model = TFAutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") - batch_size = 1 - max_new_tokens = 2 - - class DummyModel(tf.Module): - def __init__(self, model): - super(DummyModel, self).__init__() - self.model = model - - @tf.function( - input_signature=( - tf.TensorSpec((batch_size, None), tf.int32, name="input_ids"), - tf.TensorSpec((batch_size, None), tf.int32, name="attention_mask"), - ), - jit_compile=True, - ) - def serving(self, input_ids, attention_mask): - outputs = self.model.generate( - input_ids=input_ids, - attention_mask=attention_mask, - max_new_tokens=max_new_tokens, - return_dict_in_generate=True, - ) - return {"sequences": outputs["sequences"]} - - dummy_input_ids = [[2], [102, 103]] - dummy_attention_masks = [[1], [1, 1]] - dummy_model = DummyModel(model=test_model) - with tempfile.TemporaryDirectory() as tmp_dir: - tf.saved_model.save(dummy_model, tmp_dir, signatures={"serving_default": dummy_model.serving}) - serving_func = tf.saved_model.load(tmp_dir).signatures["serving_default"] - for input_row in range(len(dummy_input_ids)): - inputs = { - "input_ids": tf.constant([dummy_input_ids[input_row]]), - "attention_mask": tf.constant([dummy_attention_masks[input_row]]), - } - tf_func_outputs = serving_func(**inputs)["sequences"] - tf_model_outputs = test_model.generate(**inputs, max_new_tokens=max_new_tokens) - tf.debugging.assert_equal(tf_func_outputs, tf_model_outputs) - - @slow - @require_tensorflow_text - def test_generate_tf_function_export_with_tf_tokenizer(self): - # TF-only test: tf.saved_model export - with tempfile.TemporaryDirectory() as tmp_dir: - # file needed to load the TF tokenizer - hf_hub_download(repo_id="google/flan-t5-small", filename="spiece.model", local_dir=tmp_dir) - - class CompleteSentenceTransformer(keras.layers.Layer): - def __init__(self): - super().__init__() - self.tokenizer = text.SentencepieceTokenizer( - model=tf.io.gfile.GFile(os.path.join(tmp_dir, "spiece.model"), "rb").read() - ) - self.model = TFAutoModelForSeq2SeqLM.from_pretrained("hf-internal-testing/tiny-random-t5") - - def call(self, inputs, *args, **kwargs): - tokens = self.tokenizer.tokenize(inputs) - input_ids, attention_mask = text.pad_model_inputs( - tokens, max_seq_length=64, pad_value=self.model.config.pad_token_id - ) - outputs = self.model.generate(input_ids=input_ids, attention_mask=attention_mask) - return self.tokenizer.detokenize(outputs) - - complete_model = CompleteSentenceTransformer() - inputs = keras.layers.Input(shape=(1,), dtype=tf.string, name="inputs") - outputs = complete_model(inputs) - keras_model = keras.Model(inputs, outputs) - keras_model.save(tmp_dir) - - def test_eos_token_id_int_and_list_top_k_top_sampling(self): - # Has PT equivalent: this test relies on random sampling - generation_kwargs = { - "do_sample": True, - "num_beams": 1, - "top_p": 0.7, - "top_k": 10, - "temperature": 0.7, - } - expectation = 14 - - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - text = """Hello, my dog 
is cute and""" - tokens = tokenizer(text, return_tensors="tf") - model = TFAutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") - - eos_token_id = 638 - # forces the generation to happen on CPU, to avoid GPU-related quirks - with tf.device(":/CPU:0"): - tf.random.set_seed(0) - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - eos_token_id = [638, 198] - with tf.device(":/CPU:0"): - tf.random.set_seed(0) - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - def test_model_kwarg_encoder_signature_filtering(self): - # Has PT equivalent: ample use of framework-specific code - bart_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - article = """Hugging Face is a technology company based in New York and Paris.""" - input_ids = bart_tokenizer(article, return_tensors="tf").input_ids - bart_model = TFBartForConditionalGeneration.from_pretrained("hf-internal-testing/tiny-random-bart") - output = bart_model.generate(input_ids).numpy() - - # Let's create a fake model that has a different signature. In particular, this fake model accepts "foo" as an - # argument. Because "foo" is not in the encoder signature and doesn't start with "decoder_", it will be part of - # the encoder kwargs prior to signature filtering, which would lead to an exception. But filtering kicks in and - # saves the day. - class FakeBart(TFBartForConditionalGeneration): - def call(self, input_ids, foo=None, **kwargs): - return super().call(input_ids, **kwargs) - - bart_model = FakeBart.from_pretrained("hf-internal-testing/tiny-random-bart") - fake_output = bart_model.generate(input_ids, foo="bar").numpy() - self.assertTrue(np.array_equal(output, fake_output)) - - # Encoder signature filtering only kicks in if it doesn't accept wildcard kwargs. The following test will fail - # because it doesn't do signature filtering. 
- class FakeEncoder(bart_model.model.encoder.__class__): - def call(self, input_ids, **kwargs): - return super().call(input_ids, **kwargs) - - fake_encoder = FakeEncoder(bart_model.config, bart_model.model.shared) - bart_model.model.encoder = fake_encoder - - # Normal generation still works (the output will be different because the encoder weights are different) - fake_output = bart_model.generate(input_ids).numpy() - with self.assertRaises(ValueError): - # FakeEncoder.call() accepts **kwargs -> no filtering -> value error due to unexpected input "foo" - bart_model.generate(input_ids, foo="bar") diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 3b9700dc20c9..9b63e42946a0 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -49,7 +49,6 @@ from transformers.utils import is_ipex_available from ..test_modeling_common import floats_tensor, ids_tensor -from .test_framework_agnostic import GenerationIntegrationTestsMixin if is_torch_available(): @@ -2783,24 +2782,9 @@ def test_speculative_sampling_target_distribution(self): @pytest.mark.generate @require_torch -class GenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTestsMixin): - # setting framework_dependent_parameters needs to be gated, just like its contents' imports - if is_torch_available(): - framework_dependent_parameters = { - "AutoModelForCausalLM": AutoModelForCausalLM, - "AutoModelForSpeechSeq2Seq": AutoModelForSpeechSeq2Seq, - "AutoModelForSeq2SeqLM": AutoModelForSeq2SeqLM, - "AutoModelForVision2Seq": AutoModelForVision2Seq, - "LogitsProcessorList": LogitsProcessorList, - "MinLengthLogitsProcessor": MinLengthLogitsProcessor, - "create_tensor_fn": torch.tensor, - "floats_tensor": floats_tensor, - "return_tensors": "pt", - } - +class GenerationIntegrationTests(unittest.TestCase): @slow def test_diverse_beam_search(self): - # PT-only test: TF doesn't have a diverse beam search implementation article = """Justin Timberlake and Jessica Biel, welcome to parenthood. The celebrity couple announced the arrival of their son, Silas Randall Timberlake, in statements to People. "Silas was the middle name of Timberlake's maternal grandfather Bill Bomar, who died in 2012, while Randall is the musician's own middle name, as well as his father's first," People reports. @@ -2834,7 +2818,6 @@ def test_diverse_beam_search(self): ) def test_max_length_if_input_embeds(self): - # PT-only test: TF doesn't have StoppingCriteria article = "Today a dragon flew over Paris." model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") @@ -2848,7 +2831,6 @@ def test_max_length_if_input_embeds(self): self.assertEqual(out_gen.shape[-1], input_len + out_gen_embeds.shape[-1]) def test_min_length_if_input_embeds(self): - # PT-only test: TF doesn't have StoppingCriteria article = "Today a dragon flew over Paris." 
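The `*_if_input_embeds` tests in this hunk rely on the fact that a decoder-only model can generate from `inputs_embeds` alone, in which case the returned sequence contains only the newly generated tokens (there are no prompt ids to echo back). A minimal usage sketch, assuming the same tiny GPT-2 checkpoint these tests already use:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
    model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")

    input_ids = tokenizer("Today a dragon flew over Paris.", return_tensors="pt").input_ids
    inputs_embeds = model.get_input_embeddings()(input_ids)

    out_from_ids = model.generate(input_ids, max_new_tokens=5)
    out_from_embeds = model.generate(inputs_embeds=inputs_embeds, max_new_tokens=5)

    # with ids the prompt is echoed back; with embeddings only the new tokens are returned
    print(out_from_ids.shape[-1], out_from_embeds.shape[-1])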
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") @@ -2862,7 +2844,6 @@ def test_min_length_if_input_embeds(self): self.assertEqual(out_gen.shape[-1], input_len + out_gen_embeds.shape[-1]) def test_custom_stopping_criteria_overload_error(self): - # PT-only test: TF doesn't have StoppingCriteria article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" bart_tokenizer = BartTokenizer.from_pretrained("sshleifer/bart-tiny-random") bart_model = BartForConditionalGeneration.from_pretrained("sshleifer/bart-tiny-random").to(torch_device) @@ -2876,7 +2857,6 @@ def test_custom_stopping_criteria_overload_error(self): bart_model.generate(input_ids, stopping_criteria=stopping_criteria, max_length=32) def test_custom_stopping_criteria(self): - # PT-only test: TF doesn't have StoppingCriteria article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" bart_tokenizer = BartTokenizer.from_pretrained("sshleifer/bart-tiny-random") bart_model = BartForConditionalGeneration.from_pretrained("sshleifer/bart-tiny-random").to(torch_device) @@ -2900,7 +2880,6 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwa # TODO (joao): replace `stop_sequence` in the pipeline by the more recent `generate` functionality def test_stop_sequence_stopping_criteria(self): - # PT-only test: TF doesn't have StoppingCriteria prompt = """Hello I believe in""" generator = pipeline("text-generation", model="hf-internal-testing/tiny-random-bart") output = generator(prompt) @@ -2913,7 +2892,6 @@ def test_stop_sequence_stopping_criteria(self): self.assertEqual(output, [{"generated_text": "Hello I believe in we"}]) def test_generate_non_nlp_input_ids_as_kwarg(self): - # PT-only test: AFAIK there's no non-NLP model architecture in TF that supports `input_ids` as its only input model = ImageGPTForCausalImageModeling.from_pretrained( "hf-internal-testing/tiny-random-imagegpt", max_length=10 ).to(torch_device) @@ -2926,7 +2904,6 @@ def test_generate_non_nlp_input_ids_as_kwarg(self): self.assertEqual(output_sequences.shape, (3, 10)) def test_generate_input_values_as_encoder_kwarg(self): - # PT-only test: AFAIK there's no generate-capable architecture in TF that supports `input_values` as its input input_values = floats_tensor((2, 250)) model = SpeechEncoderDecoderModel.from_pretrained("hf-internal-testing/tiny-random-speech-encoder-decoder") model = model.to(torch_device) @@ -2937,7 +2914,6 @@ def test_generate_input_values_as_encoder_kwarg(self): self.assertEqual(output_sequences.shape, (2, 5)) def test_transition_scores_group_beam_search_encoder_decoder(self): - # PT-only test: TF doesn't have group beam search articles = [ "Justin Timberlake and Jessica Biel, welcome to parenthood.", "Michael Phelps is arguably the most decorated Olympian of all time.", @@ -3067,7 +3043,6 @@ def test_synthid_text_watermark_generation_mean_expected_bias(self): @slow def test_beam_search_example_integration(self): - # PT-only test: TF doesn't have a BeamSearchScorer # exactly the example provided in the docstrings of beam search, which previously # failed after directly copying from it. 
Refer to PR #15555 tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base") @@ -3094,7 +3069,6 @@ def test_beam_search_example_integration(self): @slow def test_constrained_beam_search(self): - # PT-only test: TF doesn't have constrained beam search model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2").to(torch_device) tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") @@ -3132,7 +3106,6 @@ def test_constrained_beam_search(self): @slow def test_constrained_beam_search_mixed(self): - # PT-only test: TF doesn't have constrained beam search model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2").to(torch_device) tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") @@ -3173,7 +3146,6 @@ def test_constrained_beam_search_mixed(self): @slow def test_constrained_beam_search_mixed_mixin(self): - # PT-only test: TF doesn't have constrained beam search model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2").to(torch_device) tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") @@ -3251,7 +3223,6 @@ def test_cfg_mixin(self): @slow def test_constrained_beam_search_example_translation_mixin(self): - # PT-only test: TF doesn't have constrained beam search tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base") model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base") @@ -3276,7 +3247,6 @@ def test_constrained_beam_search_example_translation_mixin(self): @slow def test_constrained_beam_search_example_integration(self): - # PT-only test: TF doesn't have constrained beam search tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base") model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base") @@ -3345,7 +3315,6 @@ def test_per_row_stopping_criteria(self): self.assertListEqual(out_text, expected_out) def test_constrained_beam_search_mixin_type_checks(self): - # PT-only test: TF doesn't have constrained beam search tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/t5-tiny-random") model = AutoModelForSeq2SeqLM.from_pretrained("patrickvonplaten/t5-tiny-random") @@ -3386,7 +3355,6 @@ def test_constrained_beam_search_mixin_type_checks(self): model.generate(input_ids, force_words_ids=[[[-1]]]) def test_batched_decoder_start_id(self): - # PT-only test: TF doesn't support batched_decoder_start_id articles = [ "Justin Timberlake and Jessica Biel, welcome to parenthood.", "Michael Phelps is arguably the most decorated Olympian of all time.", @@ -3435,7 +3403,6 @@ def test_decoder_start_id_from_config(self): outputs = bart_model.generate(input_ids, generation_config=GenerationConfig(do_sample=False)) def test_contrastive_search_batched(self): - # PT-only test: TF doesn't have constrained beam search # Tests that contrastive search works with batched inputs (i.e. has the same output as for non-batched inputs) articles = ["Foo", "Bar Baz"] tokenizer = BartTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") @@ -3461,7 +3428,6 @@ def test_contrastive_search_batched(self): self.assertTrue(max_score_diff < 1e-5) def test_logits_processor_not_inplace(self): - # PT-only test: TF fixes were not made article = "Today a dragon flew over Paris." 
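Several of the constrained-beam-search tests touched in this hunk drive generation through the `force_words_ids` argument of `generate`. A minimal usage sketch along the lines of the documented T5 translation example (illustrative; the prompt, forced word, and decoding settings here are assumptions, not taken from this diff):

    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
    model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

    input_ids = tokenizer("translate English to German: How old are you?", return_tensors="pt").input_ids
    # every returned beam must contain the forced word(s) somewhere in the output
    force_words_ids = tokenizer(["Sie"], add_special_tokens=False).input_ids

    outputs = model.generate(
        input_ids,
        force_words_ids=force_words_ids,
        num_beams=5,
        num_return_sequences=1,
        remove_invalid_values=True,
    )
    print(tokenizer.batch_decode(outputs, skip_special_tokens=True))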
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") @@ -3572,7 +3538,6 @@ def test_default_max_length_warning(self): self.assertEqual(len(warning_list), 0) def test_length_warning_assisted_generation(self): - # PT-only test: TF doesn't support assisted decoding yet. model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) assistant = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") @@ -3604,7 +3569,6 @@ def test_default_assisted_generation(self): self.assertEqual(config.is_assistant, False) def test_generated_length_assisted_generation(self): - # PT-only test: TF doesn't support assisted decoding yet. model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) assistant = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") @@ -3639,7 +3603,6 @@ def test_generated_length_assisted_generation(self): self.assertTrue(out.shape[-1] <= (input_length + 7)) def test_model_kwarg_assisted_decoding_decoder_only(self): - # PT-only test: TF doesn't support assisted decoding yet. model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") model.generation_config.pad_token_id = tokenizer.eos_token_id @@ -3839,7 +3802,6 @@ def test_return_unprocessed_logit_scores(self): @slow @require_torch_multi_gpu def test_assisted_decoding_in_different_gpu(self): - # PT-only test: TF doesn't support assisted decoding yet. model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM").to("cuda:0") assistant = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM").to( "cuda:1" @@ -3863,7 +3825,6 @@ def test_assisted_decoding_in_different_gpu(self): @slow @require_torch_accelerator def test_assisted_decoding_model_in_gpu_assistant_in_cpu(self): - # PT-only test: TF doesn't support assisted decoding yet. model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM").to( torch_device ) @@ -3887,7 +3848,6 @@ def test_assisted_decoding_model_in_gpu_assistant_in_cpu(self): self.assertTrue(input_length <= out.shape[-1] <= input_length + 20) def test_special_tokens_fall_back_to_model_default(self): - # PT-only test: TF doesn't support assisted decoding yet. 
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM").to( torch_device ) @@ -4367,6 +4327,416 @@ def test_max_time(self): duration = datetime.datetime.now() - start self.assertGreater(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) + def test_validate_generation_inputs(self): + """Tests validation of inputs to `generate`""" + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") + model = AutoModelForSeq2SeqLM.from_pretrained("hf-internal-testing/tiny-random-t5") + + encoder_input_str = "Hello world" + input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + + # typos are quickly detected (the correct argument is `do_sample`) + with self.assertRaisesRegex(ValueError, "do_samples"): + model.generate(input_ids, do_samples=True) + + # arbitrary arguments that will not be used anywhere are also not accepted + with self.assertRaisesRegex(ValueError, "foo"): + fake_model_kwargs = {"foo": "bar"} + model.generate(input_ids, **fake_model_kwargs) + + # however, valid model_kwargs are accepted + valid_model_kwargs = {"attention_mask": torch.tensor(np.zeros_like(input_ids))} + model.generate(input_ids, **valid_model_kwargs) + + def test_custom_logits_processor(self): + """Tests that custom logits processors can be used in `generate`, and that redundant arguments are caught.""" + bart_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") + article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" + bart_model = AutoModelForSeq2SeqLM.from_pretrained("hf-internal-testing/tiny-random-bart", min_length=1) + input_ids = bart_tokenizer(article, return_tensors="pt").input_ids + + logits_processor = LogitsProcessorList() + logits_processor.append(MinLengthLogitsProcessor(min_length=10, eos_token_id=0)) + + # it should not be allowed to both define `min_length` via config and `logits_processor` list + with self.assertRaises(ValueError): + bart_model.generate(input_ids, logits_processor=logits_processor, min_length=10) + bart_model.generate(input_ids, logits_processor=logits_processor) + + def test_transition_scores_greedy_search(self): + """Test that `compute_transition_scores` is working as expected with gready search""" + articles = ["Justin Timberlake", "Michael Phelps"] + tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2", padding_side="left") + tokenizer.pad_token = tokenizer.eos_token + + model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2") + model.generation_config.eos_token_id = None + input_ids = tokenizer(articles, return_tensors="pt", padding=True).input_ids + model = model.to(torch_device) + input_ids = input_ids.to(torch_device) + + outputs = model.generate( + input_ids=input_ids, + max_new_tokens=5, + pad_token_id=tokenizer.eos_token_id, + return_dict_in_generate=True, + output_scores=True, + ) + + transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores) + transition_scores = transition_scores.cpu().numpy() + + expected_scores = np.array( + [ + [-57.8844, -60.45698, -70.16364, -65.50791, -66.35648], + [-54.417572, -60.216614, -62.661243, -58.621933, -58.298683], + ] + ) + self.assertTrue(np.allclose(transition_scores, expected_scores, atol=1e-3)) + + def test_transition_scores_greedy_search_normalized(self): + """ + Test that `compute_transition_scores` is working as expected with gready search, with `normalize_logits=True` + """ + articles = ["Justin Timberlake", "Michael Phelps"] + tokenizer = 
AutoTokenizer.from_pretrained("distilbert/distilgpt2", padding_side="left") + tokenizer.pad_token = tokenizer.eos_token + + model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2") + model.generation_config.eos_token_id = None + input_ids = tokenizer(articles, return_tensors="pt", padding=True).input_ids + model = model.to(torch_device) + input_ids = input_ids.to(torch_device) + + outputs = model.generate( + input_ids=input_ids, + max_new_tokens=5, + pad_token_id=tokenizer.eos_token_id, + return_dict_in_generate=True, + output_scores=True, + ) + + transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) + transition_scores = transition_scores.cpu().numpy() + + expected_scores = np.array( + [ + [-2.538938, -2.2694316, -2.1580915, -1.572299, -2.6719835], + [-1.8826028, -2.2461371, -1.7556462, -2.9644494, -1.7996008], + ] + ) + self.assertTrue(np.allclose(transition_scores, expected_scores, atol=1e-3)) + + def test_transition_scores_beam_search_encoder_decoder(self): + """ + Test that `compute_transition_scores` is working as expected with beam search and encoder-decoder models + """ + articles = [ + "Justin Timberlake and Jessica Biel, welcome to parenthood.", + "Michael Phelps is arguably the most decorated Olympian of all time.", + ] + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") + model = AutoModelForSeq2SeqLM.from_pretrained("hf-internal-testing/tiny-random-bart") + input_ids = tokenizer(articles, return_tensors="pt", padding=True).input_ids + model = model.to(torch_device) + input_ids = input_ids.to(torch_device) + + outputs = model.generate( + input_ids=input_ids, + max_length=10, + num_beams=4, + num_return_sequences=2, + eos_token_id=None, + return_dict_in_generate=True, + output_scores=True, + length_penalty=0.0, + ) + + transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) + transition_scores = transition_scores.cpu().numpy() + outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() + + self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores, atol=1e-3)) + + def test_transition_scores_beam_search_encoder_decoder_with_eos(self): + """ + Test that `compute_transition_scores` is working as expected with beam search and encoder-decoder models, when + an EOS token is defined + """ + articles = [ + "Justin Timberlake and Jessica Biel, welcome to parenthood.", + "Michael Phelps is arguably the most decorated Olympian of all time.", + ] + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") + + model = AutoModelForSeq2SeqLM.from_pretrained("hf-internal-testing/tiny-random-bart") + input_ids = tokenizer(articles, return_tensors="pt", padding=True).input_ids + model = model.to(torch_device) + input_ids = input_ids.to(torch_device) + + outputs = model.generate( + input_ids=input_ids, + max_length=10, + num_beams=4, + num_return_sequences=2, + return_dict_in_generate=True, + output_scores=True, + length_penalty=0.0, + ) + + transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) + transition_scores = transition_scores.cpu().numpy() + outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() + + self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores, atol=1e-3)) + + def test_transition_scores_beam_search_decoder_only(self): + """ + Test that `compute_transition_scores` is 
working as expected with beam search and decoder-only models + """ + articles = [ + "Justin Timberlake", + "Michael Phelps", + ] + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") + tokenizer.pad_token = tokenizer.eos_token + + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") + input_ids = tokenizer(articles, return_tensors="pt", padding=True).input_ids + model = model.to(torch_device) + input_ids = input_ids.to(torch_device) + + outputs = model.generate( + input_ids=input_ids, + max_length=10, + num_beams=4, + num_return_sequences=2, + pad_token_id=tokenizer.eos_token_id, + eos_token_id=None, + return_dict_in_generate=True, + output_scores=True, + length_penalty=0.0, + ) + + transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) + transition_scores = transition_scores.cpu().numpy() + outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() + + self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores, atol=1e-3)) + + @slow + def test_transition_scores_early_stopping(self): + """ + Test that `compute_transition_scores` is working as expected with beam search and early stopping + + This is an aggressive test that makes sure that `beam_search's` + transition scores are computed correctly for varying `num_return_sequences`, `num_beams` and `batch_size > 1` + 2 x input_ids for "question: How are you? \n context: I had a long day, " + """ + input_ids = torch.tensor(2 * [[822, 10, 571, 33, 25, 58, 2625, 10, 27, 141, 3, 9, 307, 239, 6, 1]]) + model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small") + model = model.to(torch_device) + input_ids = input_ids.to(torch_device) + + outputs = model.generate( + input_ids, + max_length=10, + return_dict_in_generate=True, + output_scores=True, + forced_eos_token_id=model.config.eos_token_id, + num_beams=4, + do_sample=False, + num_return_sequences=3, + length_penalty=0.0, + ) + + transition_scores = model.compute_transition_scores( + sequences=outputs.sequences, scores=outputs.scores, beam_indices=outputs.beam_indices + ) + transition_scores = transition_scores.cpu().numpy() + outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() + + self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores)) + + def test_encoder_decoder_generate_attention_mask(self): + """ + Test that `generate` automagically creates the correct `attention_mask` for encoder-decoder models (which + has a different keyword) + """ + articles = ["Timberlake", "Jessica Biel, welcome to parenthood among other things"] + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") + # need extreme generation values here to force this test + # to fail when `attention_mask` is not correctly treated in generate + model = AutoModelForSeq2SeqLM.from_pretrained( + "hf-internal-testing/tiny-random-bart", + ) + model.config.eos_token_id = None + input_ids = tokenizer(articles[0], return_tensors="pt").input_ids + input_ids_batched = tokenizer(articles, padding=True, return_tensors="pt").input_ids + model = model.to(torch_device) + input_ids = input_ids.to(torch_device) + input_ids_batched = input_ids_batched.to(torch_device) + + generate_kwargs = { + "return_dict_in_generate": True, + "output_scores": True, + "max_length": 50, + "num_beams": 5, + "num_return_sequences": 5, + } + + output_sequences_batched = model.generate(input_ids=input_ids_batched, **generate_kwargs) + 
output_sequences = model.generate(input_ids=input_ids, **generate_kwargs) + + batched_out = output_sequences_batched.sequences_scores + out = output_sequences.sequences_scores + batched_out = batched_out.cpu().numpy() + out = out.cpu().numpy() + + diff = np.abs(np.sum(batched_out[:5]) - np.sum(out)) + self.assertTrue(diff < 1e-4) + + def test_generate_input_ids_as_kwarg(self): + """Test that `input_ids` work equally as a positional and keyword argument in decoder-only models""" + article = "I need input_ids to generate" + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2", max_length=15) + input_ids = tokenizer(article, return_tensors="pt").input_ids + model = model.to(torch_device) + input_ids = input_ids.to(torch_device) + + output_sequences_kwargs = model.generate(input_ids=input_ids) + output_sequences = model.generate(input_ids) + output_sequences_kwargs = output_sequences_kwargs.cpu().numpy() + output_sequences = output_sequences.cpu().numpy() + + self.assertTrue(np.array_equal(output_sequences, output_sequences_kwargs)) + self.assertEqual(output_sequences.shape, (1, 15)) + + def test_generate_input_ids_as_encoder_kwarg(self): + """Test that `input_ids` work equally as a positional and keyword argument in encoder-decoder models""" + article = "Justin Timberlake and Jessica Biel, welcome to parenthood." + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") + model = AutoModelForSeq2SeqLM.from_pretrained("hf-internal-testing/tiny-random-bart") + model.config.eos_token_id = None + input_ids = tokenizer(article, return_tensors="pt").input_ids + model = model.to(torch_device) + input_ids = input_ids.to(torch_device) + + output_sequences_kwargs = model.generate(input_ids=input_ids, max_length=5) + output_sequences = model.generate(input_ids, max_length=5) + output_sequences_kwargs = output_sequences_kwargs.cpu().numpy() + output_sequences = output_sequences.cpu().numpy() + + self.assertTrue(np.array_equal(output_sequences, output_sequences_kwargs)) + self.assertEqual(output_sequences.shape, (1, 5)) + + def test_generate_inputs_and_encoder_kwargs(self): + """ + Test that an exception is thrown if the main tensor (`input_ids` in LLMs) is passed as both a positional and + keyword argument + """ + article = "I need input_ids to generate" + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2", max_length=10) + input_ids = tokenizer(article, return_tensors="pt").input_ids + with self.assertRaises(ValueError): + model.generate(input_ids, input_ids=input_ids) + + def test_generate_too_many_encoder_kwargs(self): + """Test that passing redundant inputs results in an exception (`input_ids` and `inputs_embeds` in LLMs)""" + article = "I need input_ids to generate" + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") + model = AutoModelForSeq2SeqLM.from_pretrained("hf-internal-testing/tiny-random-bart", max_length=10) + input_ids = tokenizer(article, return_tensors="pt").input_ids + with self.assertRaises(ValueError): + model.generate(input_ids=input_ids, inputs_embeds=input_ids) + + def test_generate_input_features_as_encoder_kwarg(self): + """Test that non-`input_ids` main model inputs are correctly handled as positional arguments""" + input_features = floats_tensor((3, 80, 60)) + model =
AutoModelForSpeechSeq2Seq.from_pretrained( + "hf-internal-testing/tiny-random-WhisperForConditionalGeneration" + ) + input_features.to(torch_device) + model = model.to(torch_device) + + output_sequences_kwargs = model.generate(input_features=input_features, max_length=5) + output_sequences = model.generate(input_features, max_length=5) + output_sequences_kwargs = output_sequences_kwargs.cpu().numpy() + output_sequences = output_sequences.cpu().numpy() + + self.assertTrue(np.array_equal(output_sequences, output_sequences_kwargs)) + self.assertEqual(output_sequences.shape, (3, 5)) + + def test_generate_encoder_outputs_attention_mask(self): + """Test that `generate` can handle attention masks when the encoder outputs are passed""" + input_features = floats_tensor((3, 80, 60)) + attention_mask = torch.randint(0, 2, input_features.shape).to(torch_device) + model = AutoModelForSpeechSeq2Seq.from_pretrained( + "hf-internal-testing/tiny-random-WhisperForConditionalGeneration" + ) + input_features = input_features.to(torch_device) + attention_mask = attention_mask.to(torch_device) + model = model.to(torch_device) + + encoder = model.get_encoder() + encoder_outputs = encoder(input_features) + + output_sequences_no_mask = model.generate(encoder_outputs=encoder_outputs) + output_sequences_with_mask = model.generate(encoder_outputs=encoder_outputs, attention_mask=attention_mask) + output_sequences_no_mask = output_sequences_no_mask.cpu().numpy() + output_sequences_with_mask = output_sequences_with_mask.cpu().numpy() + + self.assertFalse(np.array_equal(output_sequences_no_mask, output_sequences_with_mask)) + + def test_eos_token_id_int_and_list_greedy_search(self): + """Test that `generate` can handle multiple EOS tokens""" + generation_kwargs = { + "do_sample": False, + "num_beams": 1, + } + expectation = 13 + + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") + text = """Hello, my dog is cute and""" + tokens = tokenizer(text, return_tensors="pt") + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") + model = model.to(torch_device) + tokens = tokens.to(torch_device) + + eos_token_id = 873 + generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) + self.assertTrue(expectation == len(generated_tokens[0])) + + eos_token_id = [873, 198] + generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) + self.assertTrue(expectation == len(generated_tokens[0])) + + def test_generate_vision2text_conditioning(self): + """Test that `decoder_input_ids` can be used to condition the generation in vision-to-text models""" + pixel_values = floats_tensor((2, 3, 30, 30)) + conditioning_input = torch.tensor([[10], [10]]) # this should be the 2nd output token, after the BOS token + model = AutoModelForVision2Seq.from_pretrained( + "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2" + ) + pixel_values = pixel_values.to(torch_device) + model = model.to(torch_device) + conditioning_input = conditioning_input.to(torch_device) + + # we can condition on decoder_input_ids (expected decoder input) and input_ids (which we pipe internally as + # decoder_input_ids, if the encoder is not a model with text input) + output_sequences_decoder_input_ids = model.generate( + pixel_values, max_length=5, decoder_input_ids=conditioning_input + ) + output_sequences_input_ids = model.generate(pixel_values, max_length=5, input_ids=conditioning_input) + output_sequences_decoder_input_ids = 
output_sequences_decoder_input_ids.cpu().numpy() + output_sequences_input_ids = output_sequences_input_ids.cpu().numpy() + conditioning_input = conditioning_input.cpu().numpy() + + self.assertTrue(np.array_equal(output_sequences_decoder_input_ids, output_sequences_input_ids)) + self.assertTrue(np.array_equal(output_sequences_decoder_input_ids[:, 1:2], conditioning_input)) + @require_torch class TokenHealingTestCase(unittest.TestCase): diff --git a/tests/models/bart/test_modeling_flax_bart.py b/tests/models/bart/test_modeling_flax_bart.py index 87603ce127b3..df2c689133e8 100644 --- a/tests/models/bart/test_modeling_flax_bart.py +++ b/tests/models/bart/test_modeling_flax_bart.py @@ -19,7 +19,6 @@ from transformers import BartConfig, BartTokenizer, is_flax_available from transformers.testing_utils import require_flax, slow -from ...generation.test_flax_utils import FlaxGenerationTesterMixin from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask @@ -324,7 +323,7 @@ def test_shift_tokens_right(self): @require_flax -class FlaxBartModelTest(FlaxModelTesterMixin, unittest.TestCase, FlaxGenerationTesterMixin): +class FlaxBartModelTest(FlaxModelTesterMixin, unittest.TestCase): is_encoder_decoder = True all_model_classes = ( ( diff --git a/tests/models/blenderbot/test_modeling_flax_blenderbot.py b/tests/models/blenderbot/test_modeling_flax_blenderbot.py index d5d9c3d7cfb7..09af037e9ca2 100644 --- a/tests/models/blenderbot/test_modeling_flax_blenderbot.py +++ b/tests/models/blenderbot/test_modeling_flax_blenderbot.py @@ -20,7 +20,6 @@ from transformers import BlenderbotConfig, is_flax_available from transformers.testing_utils import jax_device, require_flax, slow -from ...generation.test_flax_utils import FlaxGenerationTesterMixin from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor @@ -309,7 +308,7 @@ def test_shift_tokens_right(self): @require_flax -class FlaxBlenderbotModelTest(FlaxModelTesterMixin, unittest.TestCase, FlaxGenerationTesterMixin): +class FlaxBlenderbotModelTest(FlaxModelTesterMixin, unittest.TestCase): is_encoder_decoder = True all_model_classes = ( ( diff --git a/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py index f09b7f05ce3a..7818dd67c7c3 100644 --- a/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py @@ -20,7 +20,6 @@ from transformers import BlenderbotSmallConfig, is_flax_available from transformers.testing_utils import require_flax, slow -from ...generation.test_flax_utils import FlaxGenerationTesterMixin from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor @@ -308,7 +307,7 @@ def test_shift_tokens_right(self): @require_flax -class FlaxBlenderbotSmallModelTest(FlaxModelTesterMixin, unittest.TestCase, FlaxGenerationTesterMixin): +class FlaxBlenderbotSmallModelTest(FlaxModelTesterMixin, unittest.TestCase): is_encoder_decoder = True all_model_classes = ( ( diff --git a/tests/models/bloom/test_modeling_flax_bloom.py b/tests/models/bloom/test_modeling_flax_bloom.py index dffee6793652..60b865a45fe2 100644 --- a/tests/models/bloom/test_modeling_flax_bloom.py +++ b/tests/models/bloom/test_modeling_flax_bloom.py @@ -18,7 +18,6 @@ from transformers import BloomConfig, BloomTokenizerFast, is_flax_available from transformers.testing_utils import require_flax, slow -from ...generation.test_flax_utils import 
FlaxGenerationTesterMixin from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor @@ -169,7 +168,7 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input @require_flax -class FlaxBloomModelTest(FlaxModelTesterMixin, unittest.TestCase, FlaxGenerationTesterMixin): +class FlaxBloomModelTest(FlaxModelTesterMixin, unittest.TestCase): all_model_classes = (FlaxBloomModel, FlaxBloomForCausalLM) if is_flax_available() else () def setUp(self): diff --git a/tests/models/gemma/test_modeling_flax_gemma.py b/tests/models/gemma/test_modeling_flax_gemma.py index 3a56cbfb6b54..e8582268fee9 100644 --- a/tests/models/gemma/test_modeling_flax_gemma.py +++ b/tests/models/gemma/test_modeling_flax_gemma.py @@ -18,7 +18,6 @@ from transformers import AutoTokenizer, GemmaConfig, is_flax_available from transformers.testing_utils import require_flax, require_read_token, slow -from ...generation.test_flax_utils import FlaxGenerationTesterMixin from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor @@ -174,7 +173,7 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input @require_flax -class FlaxGemmaModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase): +class FlaxGemmaModelTest(FlaxModelTesterMixin, unittest.TestCase): all_model_classes = (FlaxGemmaModel, FlaxGemmaForCausalLM) if is_flax_available() else () def setUp(self): diff --git a/tests/models/gpt2/test_modeling_flax_gpt2.py b/tests/models/gpt2/test_modeling_flax_gpt2.py index 2e98930e4c69..3b80cd52bdef 100644 --- a/tests/models/gpt2/test_modeling_flax_gpt2.py +++ b/tests/models/gpt2/test_modeling_flax_gpt2.py @@ -22,7 +22,6 @@ from transformers import GPT2Config, GPT2Tokenizer, is_flax_available, is_torch_available from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow -from ...generation.test_flax_utils import FlaxGenerationTesterMixin from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask @@ -209,7 +208,7 @@ def check_bool_attention_mask_in_generation(self, model_class_name, config, inpu @require_flax -class FlaxGPT2ModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase): +class FlaxGPT2ModelTest(FlaxModelTesterMixin, unittest.TestCase): all_model_classes = (FlaxGPT2Model, FlaxGPT2LMHeadModel) if is_flax_available() else () def setUp(self): diff --git a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py b/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py index 490d58c8d112..6875a46299fc 100644 --- a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py +++ b/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py @@ -22,7 +22,6 @@ from transformers import GPT2Tokenizer, GPTNeoConfig, is_flax_available, is_torch_available from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow -from ...generation.test_flax_utils import FlaxGenerationTesterMixin from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask @@ -181,7 +180,7 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input @require_flax -class FlaxGPTNeoModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase): +class FlaxGPTNeoModelTest(FlaxModelTesterMixin, unittest.TestCase): all_model_classes = (FlaxGPTNeoModel, FlaxGPTNeoForCausalLM) if is_flax_available() else () def setUp(self): diff --git a/tests/models/gptj/test_modeling_flax_gptj.py b/tests/models/gptj/test_modeling_flax_gptj.py 
index ece207ee5dbb..09f2aa99d7ea 100644 --- a/tests/models/gptj/test_modeling_flax_gptj.py +++ b/tests/models/gptj/test_modeling_flax_gptj.py @@ -22,7 +22,6 @@ from transformers import GPT2Tokenizer, GPTJConfig, is_flax_available, is_torch_available from transformers.testing_utils import is_pt_flax_cross_test, require_flax, tooslow -from ...generation.test_flax_utils import FlaxGenerationTesterMixin from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask @@ -178,7 +177,7 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input @require_flax -class FlaxGPTJModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase): +class FlaxGPTJModelTest(FlaxModelTesterMixin, unittest.TestCase): all_model_classes = (FlaxGPTJModel, FlaxGPTJForCausalLM) if is_flax_available() else () def setUp(self): diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py index da326e797d61..e4d9418bec1c 100644 --- a/tests/models/llama/test_modeling_flax_llama.py +++ b/tests/models/llama/test_modeling_flax_llama.py @@ -20,7 +20,6 @@ from transformers import LlamaConfig, is_flax_available, is_tokenizers_available from transformers.testing_utils import require_flax, slow -from ...generation.test_flax_utils import FlaxGenerationTesterMixin from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor @@ -174,7 +173,7 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input @require_flax -class FlaxLlamaModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase): +class FlaxLlamaModelTest(FlaxModelTesterMixin, unittest.TestCase): all_model_classes = (FlaxLlamaModel, FlaxLlamaForCausalLM) if is_flax_available() else () def setUp(self): diff --git a/tests/models/longt5/test_modeling_flax_longt5.py b/tests/models/longt5/test_modeling_flax_longt5.py index f779ceefc5bd..fa8673ec439a 100644 --- a/tests/models/longt5/test_modeling_flax_longt5.py +++ b/tests/models/longt5/test_modeling_flax_longt5.py @@ -28,7 +28,6 @@ slow, ) -from ...generation.test_flax_utils import FlaxGenerationTesterMixin from ...test_configuration_common import ConfigTester from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor @@ -235,7 +234,7 @@ def prepare_config_and_inputs_for_common(self): @require_flax -class FlaxLongT5ModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase): +class FlaxLongT5ModelTest(FlaxModelTesterMixin, unittest.TestCase): all_model_classes = (FlaxLongT5Model, FlaxLongT5ForConditionalGeneration) if is_flax_available() else () is_encoder_decoder = True diff --git a/tests/models/marian/test_modeling_flax_marian.py b/tests/models/marian/test_modeling_flax_marian.py index 9f15291754de..4353bd173235 100644 --- a/tests/models/marian/test_modeling_flax_marian.py +++ b/tests/models/marian/test_modeling_flax_marian.py @@ -21,7 +21,6 @@ from transformers.testing_utils import require_flax, require_sentencepiece, require_tokenizers, slow from transformers.utils import cached_property -from ...generation.test_flax_utils import FlaxGenerationTesterMixin from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor @@ -228,7 +227,7 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input @require_flax -class FlaxMarianModelTest(FlaxModelTesterMixin, unittest.TestCase, FlaxGenerationTesterMixin): +class FlaxMarianModelTest(FlaxModelTesterMixin, unittest.TestCase): 
is_encoder_decoder = True all_model_classes = (FlaxMarianModel, FlaxMarianMTModel) if is_flax_available() else () diff --git a/tests/models/mbart/test_modeling_flax_mbart.py b/tests/models/mbart/test_modeling_flax_mbart.py index 6e0230646051..bacecec8571e 100644 --- a/tests/models/mbart/test_modeling_flax_mbart.py +++ b/tests/models/mbart/test_modeling_flax_mbart.py @@ -21,7 +21,6 @@ from transformers.testing_utils import require_flax, require_sentencepiece, require_tokenizers, slow from transformers.utils import cached_property -from ...generation.test_flax_utils import FlaxGenerationTesterMixin from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor @@ -330,7 +329,7 @@ def test_shift_tokens_right(self): @require_flax -class FlaxMBartModelTest(FlaxModelTesterMixin, unittest.TestCase, FlaxGenerationTesterMixin): +class FlaxMBartModelTest(FlaxModelTesterMixin, unittest.TestCase): is_encoder_decoder = True all_model_classes = ( ( diff --git a/tests/models/mistral/test_modeling_flax_mistral.py b/tests/models/mistral/test_modeling_flax_mistral.py index c78a402fa90c..14c91be258f9 100644 --- a/tests/models/mistral/test_modeling_flax_mistral.py +++ b/tests/models/mistral/test_modeling_flax_mistral.py @@ -20,7 +20,6 @@ from transformers import MistralConfig, is_flax_available, is_tokenizers_available from transformers.testing_utils import require_flax, slow -from ...generation.test_flax_utils import FlaxGenerationTesterMixin from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor @@ -185,7 +184,7 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input @require_flax -class FlaxMistralModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase): +class FlaxMistralModelTest(FlaxModelTesterMixin, unittest.TestCase): all_model_classes = (FlaxMistralModel, FlaxMistralForCausalLM) if is_flax_available() else () def setUp(self): diff --git a/tests/models/mistral/test_modeling_tf_mistral.py b/tests/models/mistral/test_modeling_tf_mistral.py index 448b40fc44c8..dd4eff6ba908 100644 --- a/tests/models/mistral/test_modeling_tf_mistral.py +++ b/tests/models/mistral/test_modeling_tf_mistral.py @@ -24,7 +24,6 @@ slow, ) -from ...generation.test_tf_utils import TFGenerationIntegrationTests from ...test_configuration_common import ConfigTester from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask from ...test_pipeline_mixin import PipelineTesterMixin @@ -244,7 +243,7 @@ def prepare_config_and_inputs_for_common(self): @require_tf -class TFMistralModelTest(TFModelTesterMixin, TFGenerationIntegrationTests, PipelineTesterMixin, unittest.TestCase): +class TFMistralModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( (TFMistralModel, TFMistralForCausalLM, TFMistralForSequenceClassification) if is_tf_available() else () ) diff --git a/tests/models/opt/test_modeling_flax_opt.py b/tests/models/opt/test_modeling_flax_opt.py index c5c3d10f23c5..0b943de559da 100644 --- a/tests/models/opt/test_modeling_flax_opt.py +++ b/tests/models/opt/test_modeling_flax_opt.py @@ -19,7 +19,6 @@ from transformers import OPTConfig, is_flax_available from transformers.testing_utils import require_flax, require_sentencepiece, slow -from ...generation.test_flax_utils import FlaxGenerationTesterMixin from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor @@ -203,7 +202,7 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input @require_flax 
-class FlaxOPTModelTest(FlaxModelTesterMixin, unittest.TestCase, FlaxGenerationTesterMixin): +class FlaxOPTModelTest(FlaxModelTesterMixin, unittest.TestCase): all_model_classes = (FlaxOPTModel, FlaxOPTForCausalLM) if is_flax_available() else () def setUp(self): diff --git a/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py b/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py index ef9a2b33bc24..a07f32350d95 100644 --- a/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py @@ -416,82 +416,6 @@ def test_resize_embeddings_untied(self): def test_generate_without_input_ids(self): pass - # overwritten from parent due to the inability to work when non-text inputs are not passed AND because the input is - # `input_features` - def test_lm_head_model_random_no_beam_search_generate(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_features = inputs_dict.get("input_features", None) - - # iterate over all generative models - for model_class in self.all_generative_model_classes: - model = model_class(config) - - if config.bos_token_id is None: - # if bos token id is not defined model needs input_features - with self.assertRaises(AssertionError): - model.generate(do_sample=True, max_length=5) - # num_return_sequences = 1 - self._check_generated_ids(model.generate(input_features, do_sample=True)) - - with self.assertRaises(ValueError): - # generating multiple sequences when no beam search generation - # is not allowed as it would always generate the same sequences - model.generate(input_features, do_sample=False, num_return_sequences=2) - - # num_return_sequences > 1, sample - self._check_generated_ids(model.generate(input_features, do_sample=True, num_return_sequences=2)) - - # check bad words tokens language generation - # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] - output_tokens = model.generate( - input_features, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2 - ) - # only count generated tokens - generated_ids = output_tokens[:, input_features.shape[-1] :] - self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) - - # overwritten from parent due to the inability to work when non-text inputs are not passed AND because the input is - # `input_features` - def test_lm_head_model_random_beam_search_generate(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_features = inputs_dict.get("input_features", None) - - for model_class in self.all_generative_model_classes: - model = model_class(config) - - if config.bos_token_id is None: - # if bos token id is not defined model needs input_ids, num_return_sequences = 1 - self._check_generated_ids(model.generate(input_features, do_sample=True, num_beams=2)) - - with self.assertRaises(ValueError): - # generating more sequences than having beams leads is not possible - model.generate(input_features, do_sample=False, num_return_sequences=3, num_beams=2) - - # num_return_sequences > 1, sample - self._check_generated_ids( - model.generate( - input_features, - do_sample=True, - num_beams=2, - num_return_sequences=2, - ) - ) - # num_return_sequences > 1, greedy - self._check_generated_ids( - model.generate(input_features, do_sample=False, num_beams=2, num_return_sequences=2) - ) - - # check bad words 
tokens language generation - # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] - output_tokens = model.generate( - input_features, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2 - ) - # only count generated tokens - generated_ids = output_tokens[:, input_features.shape[-1] :] - self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) - # overwritten from parent -- the input is `input_features`, not `input_ids` def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/t5/test_modeling_flax_t5.py b/tests/models/t5/test_modeling_flax_t5.py index 516fb5c6d528..963bf91716d1 100644 --- a/tests/models/t5/test_modeling_flax_t5.py +++ b/tests/models/t5/test_modeling_flax_t5.py @@ -27,7 +27,6 @@ slow, ) -from ...generation.test_flax_utils import FlaxGenerationTesterMixin from ...test_configuration_common import ConfigTester from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor @@ -227,7 +226,7 @@ def prepare_config_and_inputs_for_common(self): @require_flax -class FlaxT5ModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase): +class FlaxT5ModelTest(FlaxModelTesterMixin, unittest.TestCase): all_model_classes = (FlaxT5Model, FlaxT5ForConditionalGeneration) if is_flax_available() else () is_encoder_decoder = True diff --git a/tests/models/whisper/test_modeling_tf_whisper.py b/tests/models/whisper/test_modeling_tf_whisper.py index 7aacf5171921..6e47a1b40b64 100644 --- a/tests/models/whisper/test_modeling_tf_whisper.py +++ b/tests/models/whisper/test_modeling_tf_whisper.py @@ -524,127 +524,6 @@ def test_attention_outputs(self): [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length], ) - def test_generate_without_input_ids(self): - pass - - # overwritten from parent due to the inability to work when non-text inputs are not passed AND because the input is - # `input_features` - def test_lm_head_model_random_no_beam_search_generate(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_features = inputs_dict.get("input_features", None) - - # iterate over all generative models - for model_class in self.all_generative_model_classes: - model = model_class(config) - - if config.bos_token_id is None: - # if bos token id is not defined model needs input_features - with self.assertRaises(AssertionError): - model.generate(do_sample=True, max_length=5) - # num_return_sequences = 1 - self._check_generated_ids(model.generate(input_features, do_sample=True)) - - with self.assertRaises(ValueError): - # generating multiple sequences when no beam search generation - # is not allowed as it would always generate the same sequences - model.generate(input_features, do_sample=False, num_return_sequences=2) - - # num_return_sequences > 1, sample - self._check_generated_ids(model.generate(input_features, do_sample=True, num_return_sequences=2)) - - # check bad words tokens language generation - # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] - output_tokens = model.generate( - input_features, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2 - ) - # only count generated tokens - generated_ids = 
output_tokens[:, input_features.shape[-1] :] - self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) - - # overwritten from parent due to the inability to work when non-text inputs are not passed AND because the input is - # `input_features` - def test_lm_head_model_random_beam_search_generate(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_features = inputs_dict.get("input_features", None) - - for model_class in self.all_generative_model_classes: - model = model_class(config) - - if config.bos_token_id is None: - # if bos token id is not defined model needs input_ids, num_return_sequences = 1 - self._check_generated_ids(model.generate(input_features, do_sample=True, num_beams=2)) - - with self.assertRaises(ValueError): - # generating more sequences than having beams leads is not possible - model.generate(input_features, do_sample=False, num_return_sequences=3, num_beams=2) - - # num_return_sequences > 1, sample - self._check_generated_ids( - model.generate( - input_features, - do_sample=True, - num_beams=2, - num_return_sequences=2, - ) - ) - # num_return_sequences > 1, greedy - self._check_generated_ids( - model.generate(input_features, do_sample=False, num_beams=2, num_return_sequences=2) - ) - - # check bad words tokens language generation - # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] - output_tokens = model.generate( - input_features, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2 - ) - # only count generated tokens - generated_ids = output_tokens[:, input_features.shape[-1] :] - self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) - - def test_generate_with_prompt_ids_and_task_and_language(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = TFWhisperForConditionalGeneration(config) - input_features = input_dict["input_features"] - prompt_ids = np.arange(5) - language = "<|de|>" - task = "translate" - lang_id = 6 - task_id = 7 - model.generation_config.__setattr__("lang_to_id", {language: lang_id}) - model.generation_config.__setattr__("task_to_id", {task: task_id}) - - output = model.generate(input_features, max_new_tokens=5, task=task, language=language, prompt_ids=prompt_ids) - - expected_output_start = [ - *prompt_ids.tolist(), - model.generation_config.decoder_start_token_id, - lang_id, - task_id, - ] - for row in output.numpy().tolist(): - self.assertListEqual(row[: len(expected_output_start)], expected_output_start) - - def test_generate_with_prompt_ids_and_forced_decoder_ids(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = TFWhisperForConditionalGeneration(config) - input_features = input_dict["input_features"] - prompt_ids = np.asarray(range(5)) - forced_decoder_ids = [(1, 6), (2, 7), (3, 8)] - - output = model.generate( - input_features, max_new_tokens=5, forced_decoder_ids=forced_decoder_ids, prompt_ids=prompt_ids - ) - - expected_output_start = [ - *prompt_ids.tolist(), - model.generation_config.decoder_start_token_id, - *[token for _rank, token in forced_decoder_ids], - ] - for row in output.numpy().tolist(): - self.assertListEqual(row[: len(expected_output_start)], expected_output_start) - def _load_datasamples(num_samples): ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", 
split="validation") diff --git a/tests/models/xglm/test_modeling_flax_xglm.py b/tests/models/xglm/test_modeling_flax_xglm.py index b34aee8f3fa8..8dcdb8ae0731 100644 --- a/tests/models/xglm/test_modeling_flax_xglm.py +++ b/tests/models/xglm/test_modeling_flax_xglm.py @@ -21,7 +21,6 @@ from transformers import XGLMConfig, XGLMTokenizer, is_flax_available, is_torch_available from transformers.testing_utils import is_pt_flax_cross_test, require_flax, require_sentencepiece, slow -from ...generation.test_flax_utils import FlaxGenerationTesterMixin from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask @@ -181,7 +180,7 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input @require_sentencepiece @require_flax -class FlaxXGLMModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase): +class FlaxXGLMModelTest(FlaxModelTesterMixin, unittest.TestCase): all_model_classes = (FlaxXGLMModel, FlaxXGLMForCausalLM) if is_flax_available() else () def setUp(self): diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 9dc712ab67b6..309b1976b500 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -70,16 +70,6 @@ TFAutoModelForSequenceClassification, TFSharedEmbeddings, ) - from transformers.generation import ( - TFBeamSampleDecoderOnlyOutput, - TFBeamSampleEncoderDecoderOutput, - TFBeamSearchDecoderOnlyOutput, - TFBeamSearchEncoderDecoderOutput, - TFGreedySearchDecoderOnlyOutput, - TFGreedySearchEncoderDecoderOutput, - TFSampleDecoderOnlyOutput, - TFSampleEncoderDecoderOutput, - ) from transformers.modeling_tf_utils import keras tf.config.experimental.enable_tensor_float_32_execution(False) @@ -1211,150 +1201,6 @@ def test_embeddings_out_of_bounds_raise_exception(self): with self.assertRaises(tf.errors.InvalidArgumentError): model(**prepared_inputs) - def test_lm_head_model_random_no_beam_search_generate(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict.get("input_ids", None) - - # iterate over all generative models - for model_class in self.all_generative_model_classes: - model = model_class(config) - - if config.bos_token_id is None: - # if bos token id is not defined model needs input_ids - with self.assertRaises(ValueError): - model.generate(do_sample=True, max_length=5) - # num_return_sequences = 1 - self._check_generated_ids(model.generate(input_ids, do_sample=True)) - elif model_class.__name__ not in ["TFSpeech2TextForConditionalGeneration"]: - # Models with non-text inputs won't work here; num_return_sequences = 1 - self._check_generated_ids(model.generate(do_sample=True, max_length=5)) - - with self.assertRaises(ValueError): - # generating multiple sequences when no beam search generation - # is not allowed as it would always generate the same sequences - model.generate(input_ids, do_sample=False, num_return_sequences=2) - - # num_return_sequences > 1, sample - self._check_generated_ids(model.generate(input_ids, do_sample=True, num_return_sequences=2)) - - # check bad words tokens language generation - # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] - output_tokens = model.generate( - input_ids, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2 - ) - # only count generated tokens - generated_ids = output_tokens[:, input_ids.shape[-1] :] - 
self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) - - def test_lm_head_model_no_beam_search_generate_dict_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict.get("input_ids", None) - if input_ids is None: - input_ids = inputs_dict.get("input_features", None) - - # iterate over all generative models - for model_class in self.all_generative_model_classes: - model = model_class(config) - output_greedy = model.generate( - input_ids, - do_sample=False, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - output_sample = model.generate( - input_ids, - do_sample=True, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - if model.config.is_encoder_decoder: - self.assertIsInstance(output_greedy, TFGreedySearchEncoderDecoderOutput) - self.assertIsInstance(output_sample, TFSampleEncoderDecoderOutput) - else: - self.assertIsInstance(output_greedy, TFGreedySearchDecoderOnlyOutput) - self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput) - - def test_lm_head_model_random_beam_search_generate(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict.get("input_ids", None) - - for model_class in self.all_generative_model_classes: - model = model_class(config) - - if config.bos_token_id is None: - # if bos token id is not defined model needs input_ids, num_return_sequences = 1 - self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2)) - else: - # num_return_sequences = 1 - self._check_generated_ids(model.generate(do_sample=True, max_length=5, num_beams=2)) - - with self.assertRaises(ValueError): - # generating more sequences than having beams leads is not possible - model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2) - - # num_return_sequences > 1, sample - self._check_generated_ids( - model.generate( - input_ids, - do_sample=True, - num_beams=2, - num_return_sequences=2, - ) - ) - # num_return_sequences > 1, greedy - self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2)) - - # check bad words tokens language generation - # create list of 1-seq bad token and list of 2-seq of bad tokens - bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] - output_tokens = model.generate( - input_ids, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2 - ) - # only count generated tokens - generated_ids = output_tokens[:, input_ids.shape[-1] :] - self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) - - def test_lm_head_model_beam_search_generate_dict_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict.get("input_ids", None) - if input_ids is None: - input_ids = inputs_dict.get("input_features", None) - - # iterate over all generative models - for model_class in self.all_generative_model_classes: - model = model_class(config) - output_beam_search = model.generate( - input_ids, - num_beams=2, - do_sample=False, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - output_beam_sample = model.generate( - input_ids, - num_beams=2, - do_sample=True, - output_scores=True, - 
output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - if model.config.is_encoder_decoder: - self.assertIsInstance(output_beam_search, TFBeamSearchEncoderDecoderOutput) - self.assertIsInstance(output_beam_sample, TFBeamSampleEncoderDecoderOutput) - else: - self.assertIsInstance(output_beam_search, TFBeamSearchDecoderOnlyOutput) - self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput) - def test_loss_computation(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: @@ -1574,40 +1420,6 @@ def test_int_support(self): if tensor_spec.dtype.is_integer: self.assertTrue(tensor_spec.dtype == tf.int32, "Input signatures should use tf.int32 for ints!") - def test_generate_with_headmasking(self): - attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_generative_model_classes: - model = model_class(config) - - # We want to test only encoder-decoder models - if not config.is_encoder_decoder: - continue - - head_masking = { - "head_mask": tf.zeros((config.encoder_layers, config.encoder_attention_heads)), - "decoder_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)), - "cross_attn_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)), - } - - signature = inspect.signature(model.call) - if set(head_masking.keys()) < {*signature.parameters.keys()}: - continue - - for attn_name, (name, mask) in zip(attention_names, head_masking.items()): - out = model.generate( - inputs_dict["input_ids"], - num_beams=1, - max_length=inputs_dict["input_ids"] + 5, - output_attentions=True, - return_dict_in_generate=True, - **{name: mask}, - ) - # We check the state of decoder_attentions and cross_attentions just from the last step - attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1] - self.assertEqual(sum([tf.reduce_sum(w).numpy() for w in attn_weights]), 0.0) - def test_load_with_mismatched_shapes(self): if not self.test_mismatched_shapes: return @@ -1717,133 +1529,6 @@ def test_dataset_conversion(self): model.compile(optimizer="sgd", run_eagerly=True) model.train_on_batch(test_batch, test_batch_labels) - def _test_xla_generate(self, **generate_kwargs): - def _generate_and_check_results(model, inputs, is_input_ids): - # make sure there are no pad tokens in prompt, which may trigger unwanted behavior - if is_input_ids: - if model.generation_config.pad_token_id is not None: - if config.pad_token_id == 0: - new_pad_token = model.generation_config.pad_token_id + 1 - else: - new_pad_token = model.generation_config.pad_token_id - 1 - else: - new_pad_token = None - inputs = tf.where(inputs != model.generation_config.pad_token_id, inputs, new_pad_token) - - generated = model.generate(inputs, **generate_kwargs).numpy() - generate_xla = tf.function(model.generate, jit_compile=True) - generated_xla = generate_xla(inputs, **generate_kwargs).numpy() - - # Due to numerical instability, let's fail the test only if there are more than 10% of input sequences give - # different outputs between XLA and non-XLA versions. If there are less than 10 examples, let's be strict - # and not allow any difference. 
- diff = [[], []] - for _generated, _generated_xla in zip(generated.tolist(), generated_xla.tolist()): - if _generated != _generated_xla: - diff[0].append(_generated) - diff[1].append(_generated_xla) - ratio = len(diff[0]) / len(generated) - if ratio > 0.1 or (len(diff[0]) > 0 and len(generated) < 10): - self.assertListEqual(diff[0], diff[1]) - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.eos_token_id = None # Generate until max length - config.do_sample = False - - # extract the input to the model - is_input_ids = "input_ids" in inputs_dict - is_input_features = "input_features" in inputs_dict - if not (is_input_ids or is_input_features): - raise ValueError("No valid generate input found in inputs_dict") - inputs = inputs_dict["input_ids"] if is_input_ids else inputs_dict["input_features"] - - # fix config for models with additional sequence-length limiting settings - seq_len = inputs.get_shape()[1] - for var_name in ["max_position_embeddings", "max_target_positions"]: - attr = getattr(config, var_name, None) - if attr is not None and attr < seq_len + generate_kwargs["max_new_tokens"]: - try: - setattr(config, var_name, seq_len + generate_kwargs["max_new_tokens"]) - except NotImplementedError: - # xlnet will raise an exception when trying to set - # max_position_embeddings. - pass - - model = model_class(config) - - if model.supports_xla_generation: - _generate_and_check_results(model, inputs, is_input_ids) - else: - with self.assertRaises(ValueError): - _generate_and_check_results(model, inputs, is_input_ids) - - def test_xla_generate_fast(self): - """ - Basic quick test for generate-compatible classes that confirms that XLA-generated tokens are the same as their - non XLA counterparts. - - Either the model supports XLA generation and passes the inner test, or it raises an appropriate exception - """ - self._test_xla_generate(num_beams=1, num_return_sequences=1, max_new_tokens=3) - - @slow - def test_xla_generate_contrastive(self): - """ - Slow and challenging version of `test_xla_generate_fast` for contrastive search -- contrastive search directly - manipulates the model cache and other outputs, and this test ensures that they are in a valid format that is - also supported by XLA. - - Either the model supports XLA generation and passes the inner test, or it raises an appropriate exception - """ - self._test_xla_generate(num_beams=1, num_return_sequences=1, max_new_tokens=16, penalty_alpha=0.5, top_k=4) - - @slow - def test_xla_generate_slow(self): - """ - Slow and challenging version of `test_xla_generate_fast` -- this test asks for several long sequences using - beam search, with and without XLA. The two outputs should match, and a failure in this test indicates that the - model may need further analysis if it is to be used for XLA generation. 
- - Either the model supports XLA generation and passes the inner test, or it raises an appropriate exception - """ - self._test_xla_generate(num_beams=8, num_return_sequences=2, max_new_tokens=128) - - def _generate_random_bad_tokens(self, num_bad_tokens, model): - # special tokens cannot be bad tokens - special_tokens = [] - if model.config.bos_token_id is not None: - special_tokens.append(model.config.bos_token_id) - if model.config.pad_token_id is not None: - special_tokens.append(model.config.pad_token_id) - if model.config.eos_token_id is not None: - special_tokens.append(model.config.eos_token_id) - - # create random bad tokens that are not special tokens - bad_tokens = [] - while len(bad_tokens) < num_bad_tokens: - token = tf.squeeze(ids_tensor((1, 1), self.model_tester.vocab_size), 0).numpy()[0] - if token not in special_tokens: - bad_tokens.append(token) - return bad_tokens - - def _check_generated_ids(self, output_ids): - for token_id in output_ids[0].numpy().tolist(): - self.assertGreaterEqual(token_id, 0) - self.assertLess(token_id, self.model_tester.vocab_size) - - def _check_match_tokens(self, generated_ids, bad_words_ids): - # for all bad word tokens - for bad_word_ids in bad_words_ids: - # for all slices in batch - for generated_ids_slice in generated_ids: - # for all word idx - for i in range(len(bad_word_ids), len(generated_ids_slice)): - # if tokens match - if generated_ids_slice[i - len(bad_word_ids) : i] == bad_word_ids: - return True - return False - def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): """Creates a random int32 tensor of the shape within the vocab size.""" From 77aa9fc0767439fab61108d960cec8ec37c6723e Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 17 Feb 2025 16:42:28 +0100 Subject: [PATCH 08/14] [generate] Fix encoder decoder models attention mask (#36018) --- src/transformers/generation/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 9760b37dea3c..3c7445284d9a 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -420,6 +420,7 @@ def prepare_inputs_for_generation( model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format) # 4. Create missing `position_ids` on the fly + encoder_attention_mask = attention_mask if self.config.is_encoder_decoder else None attention_mask = ( kwargs.pop("decoder_attention_mask", None) if self.config.is_encoder_decoder else attention_mask ) @@ -490,6 +491,9 @@ def prepare_inputs_for_generation( if attention_mask is not None: model_inputs[attention_mask_key] = attention_mask + if encoder_attention_mask is not None: + model_inputs["attention_mask"] = encoder_attention_mask + # 7. Forward ALL kwargs that are uninitialized (e.g. `use_cache`). for key, value in kwargs.items(): if key not in model_inputs: From 3e970dbbf12392a9230d15f8715d23e987313abb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Feb 2025 16:28:43 +0000 Subject: [PATCH 09/14] Bump transformers from 4.38.0 to 4.48.0 in /examples/research_projects/codeparrot/examples (#36237) Bump transformers in /examples/research_projects/codeparrot/examples Bumps [transformers](https://github.com/huggingface/transformers) from 4.38.0 to 4.48.0. 
- [Release notes](https://github.com/huggingface/transformers/releases) - [Commits](https://github.com/huggingface/transformers/compare/v4.38.0...v4.48.0) --- updated-dependencies: - dependency-name: transformers dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- examples/research_projects/codeparrot/examples/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/codeparrot/examples/requirements.txt b/examples/research_projects/codeparrot/examples/requirements.txt index 64ee5b508f77..c5e21ab98192 100644 --- a/examples/research_projects/codeparrot/examples/requirements.txt +++ b/examples/research_projects/codeparrot/examples/requirements.txt @@ -1,5 +1,5 @@ datasets==2.3.2 -transformers==4.38.0 +transformers==4.48.0 wandb==0.13.1 evaluate==0.2.2 scikit-learn==1.5.0 \ No newline at end of file From dae8708c36b31d401752937b6ff1b4ae1424ba68 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Mon, 17 Feb 2025 17:48:57 +0100 Subject: [PATCH 10/14] Add compressed tensor in quant dockerfile (#36239) add compressed_tensors in the dockerfile --- docker/transformers-quantization-latest-gpu/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index 700df877d10f..3887f37b34b3 100755 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -76,6 +76,9 @@ RUN python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git RUN python3 -m pip install --no-cache-dir flute-kernel==0.3.0 -i https://flute-ai.github.io/whl/cu118 RUN python3 -m pip install --no-cache-dir fast_hadamard_transform==1.0.4.post1 +# Add compressed-tensors for quantization testing +RUN python3 -m pip install --no-cache-dir compressed-tensors + # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. 
RUN cd transformers && python3 setup.py develop From 429f1a682dfa2b4b5672f8c703e8c607857a18db Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 17 Feb 2025 16:52:44 +0000 Subject: [PATCH 11/14] [tests] remove `test_export_to_onnx` (#36241) --- tests/models/fsmt/test_modeling_fsmt.py | 14 -------------- tests/models/longt5/test_modeling_longt5.py | 14 -------------- tests/models/mt5/test_modeling_mt5.py | 14 -------------- tests/models/pop2piano/test_modeling_pop2piano.py | 15 --------------- .../test_modeling_switch_transformers.py | 14 -------------- tests/models/t5/test_modeling_t5.py | 14 -------------- tests/models/umt5/test_modeling_umt5.py | 14 -------------- 7 files changed, 99 deletions(-) diff --git a/tests/models/fsmt/test_modeling_fsmt.py b/tests/models/fsmt/test_modeling_fsmt.py index 95789c844aec..f9bec05743ca 100644 --- a/tests/models/fsmt/test_modeling_fsmt.py +++ b/tests/models/fsmt/test_modeling_fsmt.py @@ -262,20 +262,6 @@ def test_save_load_missing_keys(self): model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) - @unittest.skip(reason="Test has a segmentation fault on torch 1.8.0") - def test_export_to_onnx(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - model = FSMTModel(config).to(torch_device) - with tempfile.TemporaryDirectory() as tmpdirname: - torch.onnx.export( - model, - (inputs_dict["input_ids"], inputs_dict["attention_mask"]), - f"{tmpdirname}/fsmt_test.onnx", - export_params=True, - opset_version=12, - input_names=["input_ids", "attention_mask"], - ) - def test_ensure_weights_are_shared(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/longt5/test_modeling_longt5.py b/tests/models/longt5/test_modeling_longt5.py index 38b159679f04..6ec347fe055d 100644 --- a/tests/models/longt5/test_modeling_longt5.py +++ b/tests/models/longt5/test_modeling_longt5.py @@ -627,20 +627,6 @@ def test_model_from_pretrained(self): model = LongT5Model.from_pretrained(model_name) self.assertIsNotNone(model) - @slow - def test_export_to_onnx(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - model = LongT5Model(config_and_inputs[0]).to(torch_device) - with tempfile.TemporaryDirectory() as tmpdirname: - torch.onnx.export( - model, - (config_and_inputs[1], config_and_inputs[3], config_and_inputs[2]), - f"{tmpdirname}/longt5_test.onnx", - export_params=True, - opset_version=14, - input_names=["input_ids", "decoder_input_ids"], - ) - def test_generate_with_head_masking(self): attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/mt5/test_modeling_mt5.py b/tests/models/mt5/test_modeling_mt5.py index 994d88444809..185bd149175e 100644 --- a/tests/models/mt5/test_modeling_mt5.py +++ b/tests/models/mt5/test_modeling_mt5.py @@ -871,20 +871,6 @@ def test_model_from_pretrained(self): model = MT5Model.from_pretrained(model_name) self.assertIsNotNone(model) - @unittest.skip(reason="Test has a segmentation fault on torch 1.8.0") - def test_export_to_onnx(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - model = MT5Model(config_and_inputs[0]).to(torch_device) - with tempfile.TemporaryDirectory() as tmpdirname: - torch.onnx.export( - model, - (config_and_inputs[1], config_and_inputs[3], config_and_inputs[2]), - f"{tmpdirname}/t5_test.onnx", - export_params=True, - 
opset_version=9, - input_names=["input_ids", "decoder_input_ids"], - ) - def test_generate_with_head_masking(self): attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/pop2piano/test_modeling_pop2piano.py b/tests/models/pop2piano/test_modeling_pop2piano.py index 50d25aaf5cca..e1f52770c044 100644 --- a/tests/models/pop2piano/test_modeling_pop2piano.py +++ b/tests/models/pop2piano/test_modeling_pop2piano.py @@ -26,7 +26,6 @@ from transformers.testing_utils import ( require_essentia, require_librosa, - require_onnx, require_scipy, require_torch, slow, @@ -611,20 +610,6 @@ def test_model_from_pretrained(self): model = Pop2PianoForConditionalGeneration.from_pretrained(model_name) self.assertIsNotNone(model) - @require_onnx - def test_export_to_onnx(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - model = Pop2PianoForConditionalGeneration(config_and_inputs[0]).to(torch_device) - with tempfile.TemporaryDirectory() as tmpdirname: - torch.onnx.export( - model, - (config_and_inputs[1], config_and_inputs[3], config_and_inputs[2]), - f"{tmpdirname}/Pop2Piano_test.onnx", - export_params=True, - opset_version=14, - input_names=["input_ids", "decoder_input_ids"], - ) - def test_pass_with_input_features(self): input_features = BatchFeature( { diff --git a/tests/models/switch_transformers/test_modeling_switch_transformers.py b/tests/models/switch_transformers/test_modeling_switch_transformers.py index 03de2f72d01d..cb62d364c158 100644 --- a/tests/models/switch_transformers/test_modeling_switch_transformers.py +++ b/tests/models/switch_transformers/test_modeling_switch_transformers.py @@ -709,20 +709,6 @@ def test_model_from_pretrained(self): model = SwitchTransformersModel.from_pretrained(model_name) self.assertIsNotNone(model) - @unittest.skip(reason="Test has a segmentation fault on torch 1.8.0") - def test_export_to_onnx(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - model = SwitchTransformersModel(config_and_inputs[0]).to(torch_device) - with tempfile.TemporaryDirectory() as tmpdirname: - torch.onnx.export( - model, - (config_and_inputs[1], config_and_inputs[3], config_and_inputs[2]), - f"{tmpdirname}/switch_transformers_test.onnx", - export_params=True, - opset_version=9, - input_names=["input_ids", "decoder_input_ids"], - ) - def test_generate_with_head_masking(self): attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index a0439550f8f0..03a6adb1a916 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -875,20 +875,6 @@ def test_model_from_pretrained(self): model = T5Model.from_pretrained(model_name) self.assertIsNotNone(model) - @unittest.skip(reason="Test has a segmentation fault on torch 1.8.0") - def test_export_to_onnx(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - model = T5Model(config_and_inputs[0]).to(torch_device) - with tempfile.TemporaryDirectory() as tmpdirname: - torch.onnx.export( - model, - (config_and_inputs[1], config_and_inputs[3], config_and_inputs[2]), - f"{tmpdirname}/t5_test.onnx", - export_params=True, - opset_version=9, - input_names=["input_ids", "decoder_input_ids"], - ) - def test_generate_with_head_masking(self): attention_names = 
["encoder_attentions", "decoder_attentions", "cross_attentions"] config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py index e9a5d7e64221..c274efcf938d 100644 --- a/tests/models/umt5/test_modeling_umt5.py +++ b/tests/models/umt5/test_modeling_umt5.py @@ -525,20 +525,6 @@ def test_with_sequence_classification_head(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_with_sequence_classification_head(*config_and_inputs) - @unittest.skip(reason="Test has a segmentation fault on torch 1.8.0") - def test_export_to_onnx(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - model = UMT5Model(config_and_inputs[0]).to(torch_device) - with tempfile.TemporaryDirectory() as tmpdirname: - torch.onnx.export( - model, - (config_and_inputs[1], config_and_inputs[3], config_and_inputs[2]), - f"{tmpdirname}/t5_test.onnx", - export_params=True, - opset_version=9, - input_names=["input_ids", "decoder_input_ids"], - ) - @unittest.skipIf(torch_device == "cpu", "Cant do half precision") def test_model_fp16_forward(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() From 626666c444208557041ef7edcda6b9e78eddfdee Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 17 Feb 2025 18:30:07 +0100 Subject: [PATCH 12/14] Au revoir flaky `test_fast_is_faster_than_slow` (#36240) * fix * fix * fix --------- Co-authored-by: ydshieh --- .../rt_detr/test_image_processing_rt_detr.py | 15 ++++++++++++++- tests/test_image_processing_common.py | 11 ++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/tests/models/rt_detr/test_image_processing_rt_detr.py b/tests/models/rt_detr/test_image_processing_rt_detr.py index e27c1838f940..41e26e2a1328 100644 --- a/tests/models/rt_detr/test_image_processing_rt_detr.py +++ b/tests/models/rt_detr/test_image_processing_rt_detr.py @@ -16,7 +16,14 @@ import requests -from transformers.testing_utils import require_torch, require_torch_gpu, require_torchvision, require_vision, slow +from transformers.testing_utils import ( + is_flaky, + require_torch, + require_torch_gpu, + require_torchvision, + require_vision, + slow, +) from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs @@ -427,3 +434,9 @@ def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self): ) # verify size torch.testing.assert_close(encoding_cpu["labels"][0]["size"], encoding_gpu["labels"][0]["size"].to("cpu")) + + @is_flaky( + description="Still flaky with a failing ratio of ~0.6% after #36240", + ) + def test_fast_is_faster_than_slow(self): + super().test_fast_is_faster_than_slow() diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py index 564e3c15041f..cd11c4ac01f0 100644 --- a/tests/test_image_processing_common.py +++ b/tests/test_image_processing_common.py @@ -223,9 +223,14 @@ def measure_time(image_processor, image): # Warmup for _ in range(5): _ = image_processor(image, return_tensors="pt") - start = time.time() - _ = image_processor(image, return_tensors="pt") - return time.time() - start + all_times = [] + for _ in range(10): + start = time.time() + _ = image_processor(image, return_tensors="pt") + all_times.append(time.time() - start) + # Take the average of the fastest 3 runs + 
avg_time = sum(sorted(all_times[:3])) / 3.0 + return avg_time dummy_images = torch.randint(0, 255, (4, 3, 224, 224), dtype=torch.uint8) image_processor_slow = self.image_processing_class(**self.image_processor_dict) From fdcfdbfd221a5b35694db6fb8620eaa729a01f57 Mon Sep 17 00:00:00 2001 From: andrewor14 Date: Tue, 18 Feb 2025 05:05:42 -0500 Subject: [PATCH 13/14] Fix TorchAoConfig not JSON serializable (#36206) **Summary:** TorchAoConfig optionally contains a `torchao.dtypes.Layout` object which is a dataclass and not JSON serializable, and so the following fails: ``` import json from torchao.dtypes import TensorCoreTiledLayout from transformers import TorchAoConfig config = TorchAoConfig("int4_weight_only", layout=TensorCoreTiledLayout()) config.to_json_string() json.dumps(config.to_dict()) ``` This also causes `quantized_model.save_pretrained(...)` to fail because the first step of this call is to JSON serialize the config. Fixes https://github.com/pytorch/ao/issues/1704. **Test Plan:** python tests/quantization/torchao_integration/test_torchao.py -k test_json_serializable Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com> Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/transformers/utils/quantization_config.py | 16 ++++++++++++++++ .../torchao_integration/test_torchao.py | 18 +++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index ec8a5ef70d4f..3fafca29b9c3 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -15,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import copy +import dataclasses import importlib.metadata import json import os @@ -1539,6 +1540,21 @@ def __repr__(self): config_dict = self.to_dict() return f"{self.__class__.__name__} {json.dumps(config_dict, indent=2, sort_keys=True)}\n" + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary, converting any `torchao.dtypes.Layout` + dataclasses to simple dicts. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. 
+ """ + d = super().to_dict() + if "quant_type_kwargs" in d and "layout" in d["quant_type_kwargs"]: + layout = d["quant_type_kwargs"]["layout"] + layout = dataclasses.asdict(layout) + d["quant_type_kwargs"]["layout"] = layout + return d + @dataclass class BitNetConfig(QuantizationConfigMixin): diff --git a/tests/quantization/torchao_integration/test_torchao.py b/tests/quantization/torchao_integration/test_torchao.py index d0263f45f180..1708550cf02b 100644 --- a/tests/quantization/torchao_integration/test_torchao.py +++ b/tests/quantization/torchao_integration/test_torchao.py @@ -31,8 +31,10 @@ import torch if is_torchao_available(): - from torchao.dtypes import AffineQuantizedTensor - from torchao.dtypes.affine_quantized_tensor import TensorCoreTiledLayoutType + from torchao.dtypes import ( + AffineQuantizedTensor, + TensorCoreTiledLayout, + ) def check_torchao_quantized(test_module, qlayer, batch_size=1, context_size=1024): @@ -40,7 +42,7 @@ def check_torchao_quantized(test_module, qlayer, batch_size=1, context_size=1024 test_module.assertTrue(isinstance(weight, AffineQuantizedTensor)) test_module.assertEqual(weight.quant_min, 0) test_module.assertEqual(weight.quant_max, 15) - test_module.assertTrue(isinstance(weight.layout_type, TensorCoreTiledLayoutType)) + test_module.assertTrue(isinstance(weight.layout, TensorCoreTiledLayout)) def check_forward(test_module, model, batch_size=1, context_size=1024): @@ -82,6 +84,16 @@ def test_repr(self): quantization_config = TorchAoConfig("int4_weight_only", modules_to_not_convert=["conv"], group_size=8) repr(quantization_config) + def test_json_serializable(self): + """ + Check that the config dict can be JSON serialized. + """ + quantization_config = TorchAoConfig("int4_weight_only", group_size=32, layout=TensorCoreTiledLayout()) + d = quantization_config.to_dict() + self.assertIsInstance(d["quant_type_kwargs"]["layout"], dict) + self.assertTrue("inner_k_tiles" in d["quant_type_kwargs"]["layout"]) + quantization_config.to_json_string(use_diff=False) + @require_torch_gpu @require_torchao From e6cc410d5b830e280cdc5097cb6ce6ea6a943e5e Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 18 Feb 2025 11:41:07 +0100 Subject: [PATCH 14/14] Remove flakiness in VLMs (#36242) * fix * nit * no logits processor needed * two more tests on assisted decoding --- tests/generation/test_utils.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 9b63e42946a0..23190ebe8515 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -113,6 +113,10 @@ from transformers.utils import is_sklearn_available +# TODO: raushan remove this when VLMs start accepting input embeds +VLM_CLASS_NAMES = ["llava", "idefics2", "idefics3", "mllama", "paligemma", "emu3", "gotocr2", "qwen2vl", "qwen2_5_vl"] + + class GenerationTesterMixin: input_name = "input_ids" model_tester = None @@ -1258,6 +1262,7 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): "blip2", # overridden `generate()` "instructblip", "instructblipvideo", + *VLM_CLASS_NAMES, # shouldn't suggest image tokens ] ): self.skipTest(reason="May fix in the future: need model-specific fixes") @@ -1411,7 +1416,8 @@ def test_assisted_decoding_sample(self): "return_dict_in_generate": True, "use_cache": True, } - output_assisted = model.generate(**generation_kwargs, **inputs_dict) + logits_processor_kwargs = self._get_logits_processor_kwargs(config=model.config) + output_assisted 
= model.generate(**generation_kwargs, **inputs_dict, **logits_processor_kwargs) self._check_generate_outputs(output_assisted, config, use_cache=True) @@ -1690,8 +1696,7 @@ def test_generate_from_inputs_embeds(self, _, num_beams): # exception above (complex `inputs_embeds` computation). Popping `pixel_values` allow us to run the # checks without adding test complexity. Ditto for `pixel_values_videos` and `pixel_values_images` pixel_values_is_mutually_exclusive = any( - model_name in model_class.__name__.lower() - for model_name in ["llava", "idefics2", "idefics3", "mllama", "paligemma", "emu3", "gotocr2"] + model_name in model_class.__name__.lower() for model_name in VLM_CLASS_NAMES ) if pixel_values_is_mutually_exclusive: inputs_dict.pop("pixel_values", None) @@ -1699,7 +1704,7 @@ def test_generate_from_inputs_embeds(self, _, num_beams): inputs_dict.pop("pixel_values_images", None) # 2.C - No easy fix, let's skip the check that compares the outputs from `input_ids` and `inputs_embeds` has_complex_embeds_computation = any( - model_name in model_class.__name__.lower() for model_name in ["moshi", "qwen2vl", "qwen2_5_vl"] + model_name in model_class.__name__.lower() for model_name in ["moshi"] ) # 3 - `inputs_dict` doesn't contain `attention_mask`. When `attention_mask` is not passed to generate, # we infer it from `input_ids`. The last test case will fail if there is a pad token in the original input. @@ -1769,8 +1774,7 @@ def test_generate_from_inputs_embeds_with_static_cache(self): # exception above (complex `inputs_embeds` computation). Popping `pixel_values` allow us to run the # checks without adding test complexity. Ditto for `pixel_values_videos` and `pixel_values_images` pixel_values_is_mutually_exclusive = any( - model_name in model_class.__name__.lower() - for model_name in ["llava", "idefics2", "idefics3", "mllama", "paligemma", "emu3"] + model_name in model_class.__name__.lower() for model_name in VLM_CLASS_NAMES ) if pixel_values_is_mutually_exclusive: inputs_dict.pop("pixel_values", None) @@ -1929,8 +1933,7 @@ def test_generate_continue_from_inputs_embeds(self): self.skipTest(reason="This model doesn't return `past_key_values`") pixel_values_is_mutually_exclusive = any( - model_name in model_class.__name__.lower() - for model_name in ["llava", "idefics2", "idefics3", "mllama", "paligemma", "emu3"] + model_name in model_class.__name__.lower() for model_name in VLM_CLASS_NAMES ) if pixel_values_is_mutually_exclusive: inputs_dict.pop("pixel_values", None) @@ -2311,11 +2314,14 @@ def test_assisted_decoding_with_logits_to_keep(self): "return_dict_in_generate": True, "output_scores": True, } + logits_processor_kwargs = self._get_logits_processor_kwargs(config=model.config) # Setting logits_to_keep at 0 keeps all logits (old behavior) - with_all_logits = model.generate(**generation_kwargs, **inputs_dict, logits_to_keep=0) + with_all_logits = model.generate( + **generation_kwargs, **inputs_dict, **logits_processor_kwargs, logits_to_keep=0 + ) # By default, logits_to_keep is automatically set to 1 if not provided (new behavior) - without_all_logits = model.generate(**inputs_dict, **generation_kwargs) + without_all_logits = model.generate(**inputs_dict, **generation_kwargs, **logits_processor_kwargs) self._check_similar_generate_outputs(with_all_logits, without_all_logits)
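The core of patch 13/14 above is that torchao layout objects are plain dataclasses, so `dataclasses.asdict` is enough to turn them into JSON-friendly dicts before `json.dumps` runs. Below is a minimal standalone sketch of that idea; `FakeTensorCoreTiledLayout` is a stand-in dataclass (not the real torchao class, and its field value is illustrative) so the snippet runs without torchao installed. The `inner_k_tiles` field name mirrors the key the new test asserts on.

```python
import dataclasses
import json


@dataclasses.dataclass
class FakeTensorCoreTiledLayout:
    # Stand-in for torchao's TensorCoreTiledLayout; the real class is also a dataclass.
    inner_k_tiles: int = 8


config_dict = {
    "quant_type": "int4_weight_only",
    "quant_type_kwargs": {"group_size": 32, "layout": FakeTensorCoreTiledLayout()},
}

# json.dumps(config_dict) would raise TypeError here: the dataclass instance
# inside "quant_type_kwargs" is not JSON serializable.
layout = config_dict["quant_type_kwargs"]["layout"]
config_dict["quant_type_kwargs"]["layout"] = dataclasses.asdict(layout)

print(json.dumps(config_dict, indent=2, sort_keys=True))
```

The `TorchAoConfig.to_dict` override added in that patch applies the same conversion to `quant_type_kwargs["layout"]`, which is what lets `to_json_string()` and `save_pretrained(...)` succeed.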