From 2de5cb12be7b7b3d5cf1f51880897a40879cb805 Mon Sep 17 00:00:00 2001 From: Matt Date: Fri, 26 Apr 2024 16:14:53 +0100 Subject: [PATCH 01/18] Use the Keras set_random_seed in tests (#30504) Use the Keras set_random_seed to ensure reproducible weight initialization --- tests/pipelines/test_pipelines_common.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 7b7301d6d8cd0c..c680b4c634de40 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -541,11 +541,10 @@ def test_load_default_pipelines_pt(self): @slow @require_tf def test_load_default_pipelines_tf(self): - import tensorflow as tf - + from transformers.modeling_tf_utils import keras from transformers.pipelines import SUPPORTED_TASKS - set_seed_fn = lambda: tf.random.set_seed(0) # noqa: E731 + set_seed_fn = lambda: keras.utils.set_random_seed(0) # noqa: E731 for task in SUPPORTED_TASKS.keys(): if task == "table-question-answering": # test table in seperate test due to more dependencies @@ -553,7 +552,7 @@ def test_load_default_pipelines_tf(self): self.check_default_pipeline(task, "tf", set_seed_fn, self.check_models_equal_tf) - # clean-up as much as possible GPU memory occupied by PyTorch + # clean-up as much as possible GPU memory occupied by TF gc.collect() @slow From dfa7b580e9863c38c2f0e0dedf0958c2eab9cb48 Mon Sep 17 00:00:00 2001 From: "JB (Don)" <1557853+hackyon@users.noreply.github.com> Date: Fri, 26 Apr 2024 23:23:44 +0800 Subject: [PATCH 02/18] [`BERT`] Add support for sdpa (#28802) * Adding SDPA support for BERT * Using the proper input name for testing model input in inference() * Adding documentation for SDPA in BERT model page * Use the stable link for the documentation * Adding a gate to only call .contiguous() for torch < 2.2.0 * Additions and fixes to the documentation * Minor updates to documentation * Adding extra requirements needed for the contiguous() bug * Adding "Adapted from" in plcae of the "Copied from" * Add benchmark speedup tables to the documentation * Minor fixes to the documentation * Use ClapText as a replacemenet for Bert in the Copied-From * Some more fixes for the fix-copies references * Overriding the test_eager_matches_sdpa_generate in bert tests to not load with low_cpu_mem_usage [test all] * Undo changes to separate test * Refactored SDPA self attention code for KV projections * Change use_sdpa to attn_implementation * Fix test_sdpa_can_dispatch_on_flash by preparing input (required for MultipleChoice models) --- docs/source/en/model_doc/bert.md | 47 +++++ docs/source/en/perf_infer_gpu_one.md | 12 +- src/transformers/modeling_attn_mask_utils.py | 6 +- .../models/align/modeling_align.py | 11 +- .../models/altclip/modeling_altclip.py | 15 +- src/transformers/models/bert/modeling_bert.py | 177 ++++++++++++++++-- .../modeling_bert_generation.py | 11 +- .../bridgetower/modeling_bridgetower.py | 11 +- .../models/camembert/modeling_camembert.py | 15 +- .../chinese_clip/modeling_chinese_clip.py | 11 +- src/transformers/models/clap/modeling_clap.py | 13 +- .../models/data2vec/modeling_data2vec_text.py | 13 +- .../models/electra/modeling_electra.py | 11 +- .../models/ernie/modeling_ernie.py | 13 +- src/transformers/models/git/modeling_git.py | 11 +- .../models/layoutlm/modeling_layoutlm.py | 11 +- .../models/markuplm/modeling_markuplm.py | 13 +- .../models/realm/modeling_realm.py | 11 +- .../models/roberta/modeling_roberta.py | 15 +- 
.../models/roc_bert/modeling_roc_bert.py | 14 +- .../models/splinter/modeling_splinter.py | 11 +- .../xlm_roberta/modeling_xlm_roberta.py | 15 +- .../xlm_roberta_xl/modeling_xlm_roberta_xl.py | 4 +- src/transformers/models/xmod/modeling_xmod.py | 2 +- tests/models/bert/test_modeling_bert.py | 82 +++++++- tests/test_modeling_common.py | 26 ++- 26 files changed, 495 insertions(+), 86 deletions(-) diff --git a/docs/source/en/model_doc/bert.md b/docs/source/en/model_doc/bert.md index c77a1d85252549..b6e99d1031e8f2 100644 --- a/docs/source/en/model_doc/bert.md +++ b/docs/source/en/model_doc/bert.md @@ -61,6 +61,53 @@ This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The o - The model must predict the original sentence, but has a second objective: inputs are two sentences A and B (with a separation token in between). With probability 50%, the sentences are consecutive in the corpus, in the remaining 50% they are not related. The model has to predict if the sentences are consecutive or not. +### Using Scaled Dot Product Attention (SDPA) + +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. + +``` +from transformers import BertModel + +model = BertModel.from_pretrained("bert-base-uncased", torch_dtype=torch.float16, attn_implementation="sdpa") +... +``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). + +On a local benchmark (A100-80GB, CPUx12, RAM 96.6GB, PyTorch 2.2.0, OS Ubuntu 22.04) with `float16`, we saw the +following speedups during training and inference. 
+ +#### Training + +|batch_size|seq_len|Time per batch (eager - s)|Time per batch (sdpa - s)|Speedup (%)|Eager peak mem (MB)|sdpa peak mem (MB)|Mem saving (%)| +|----------|-------|--------------------------|-------------------------|-----------|-------------------|------------------|--------------| +|4 |256 |0.023 |0.017 |35.472 |939.213 |764.834 |22.800 | +|4 |512 |0.023 |0.018 |23.687 |1970.447 |1227.162 |60.569 | +|8 |256 |0.023 |0.018 |23.491 |1594.295 |1226.114 |30.028 | +|8 |512 |0.035 |0.025 |43.058 |3629.401 |2134.262 |70.054 | +|16 |256 |0.030 |0.024 |25.583 |2874.426 |2134.262 |34.680 | +|16 |512 |0.064 |0.044 |46.223 |6964.659 |3961.013 |75.830 | + +#### Inference + +|batch_size|seq_len|Per token latency eager (ms)|Per token latency SDPA (ms)|Speedup (%)|Mem eager (MB)|Mem BT (MB)|Mem saved (%)| +|----------|-------|----------------------------|---------------------------|-----------|--------------|-----------|-------------| +|1 |128 |5.736 |4.987 |15.022 |282.661 |282.924 |-0.093 | +|1 |256 |5.689 |4.945 |15.055 |298.686 |298.948 |-0.088 | +|2 |128 |6.154 |4.982 |23.521 |314.523 |314.785 |-0.083 | +|2 |256 |6.201 |4.949 |25.303 |347.546 |347.033 |0.148 | +|4 |128 |6.049 |4.987 |21.305 |378.895 |379.301 |-0.107 | +|4 |256 |6.285 |5.364 |17.166 |443.209 |444.382 |-0.264 | + + + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BERT. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 494ba660fa763d..64583e4badf044 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -187,10 +187,11 @@ FlashAttention is more memory efficient, meaning you can train on much larger se ## PyTorch scaled dot product attention -PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA) can also call FlashAttention and memory-efficient attention kernels under the hood. SDPA support is currently being added natively in Transformers and is used by default for `torch>=2.1.1` when an implementation is available. +PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA) can also call FlashAttention and memory-efficient attention kernels under the hood. SDPA support is currently being added natively in Transformers and is used by default for `torch>=2.1.1` when an implementation is available. You may also set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. 
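As a quick illustration of the opt-in usage described above, here is a minimal sketch based on the snippet added to the BERT model page earlier in this patch (it assumes a CUDA device is available and that `bert-base-uncased` is reachable; the input text is purely illustrative):

```python
import torch
from transformers import AutoTokenizer, BertModel

# Request the PyTorch SDPA attention implementation explicitly rather than relying
# on the default dispatch for torch>=2.1.1.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained(
    "bert-base-uncased", torch_dtype=torch.float16, attn_implementation="sdpa"
).to("cuda")

inputs = tokenizer("Hello world", return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (batch_size, seq_len, hidden_size)
```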
For now, Transformers supports SDPA inference and training for the following architectures: * [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel) +* [Bert](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel) * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel) @@ -224,6 +225,13 @@ FlashAttention can only be used for models with the `fp16` or `bf16` torch type, + + +SDPA does not support certain sets of attention parameters, such as `head_mask` and `output_attentions=True`. +In that case, you should see a warning message and we will fall back to the (slower) eager implementation. + + + By default, SDPA selects the most performant kernel available but you can check whether a backend is available in a given setting (hardware, problem size) with [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager: ```diff @@ -232,8 +240,6 @@ from transformers import AutoModelForCausalLM, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16).to("cuda") -# convert the model to BetterTransformer -model.to_bettertransformer() input_text = "Hello my dog is cute and" inputs = tokenizer(input_text, return_tensors="pt").to("cuda") diff --git a/src/transformers/modeling_attn_mask_utils.py b/src/transformers/modeling_attn_mask_utils.py index c69d9555b2afc8..44ea1795669f58 100755 --- a/src/transformers/modeling_attn_mask_utils.py +++ b/src/transformers/modeling_attn_mask_utils.py @@ -445,10 +445,8 @@ def _prepare_4d_attention_mask_for_sdpa(mask: torch.Tensor, dtype: torch.dtype, or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling()) ) - if torch.all(mask == 1): - if is_tracing: - pass - elif tgt_len == 1: + if not is_tracing and torch.all(mask == 1): + if tgt_len == 1: # For query_length == 1, causal attention and bi-directional attention are the same. 
return None elif key_value_length == tgt_len: diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index 3dce9d383da151..0f8246e8f98c90 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -883,11 +883,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->AlignText +ALIGN_TEXT_SELF_ATTENTION_CLASSES = { + "eager": AlignTextSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->AlignText,BERT->ALIGN_TEXT class AlignTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = AlignTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = ALIGN_TEXT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = AlignTextSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 0d27d87de7f4f1..ba8abb311a8d2a 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -434,11 +434,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->AltRoberta +ALT_ROBERTA_SELF_ATTENTION_CLASSES = { + "eager": AltRobertaSelfAttention, +} + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->AltRoberta,ROBERTA->ALT_ROBERTA class AltRobertaAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = AltRobertaSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = ALT_ROBERTA_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = AltRobertaSelfOutput(config) self.pruned_heads = set() @@ -1205,7 +1212,7 @@ class AltRobertaModel(AltCLIPPreTrainedModel): config_class = AltCLIPTextConfig - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->AltRoberta + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->AltRoberta def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -1232,7 +1239,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - # Copied from transformers.models.bert.modeling_bert.BertModel.forward + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward def forward( self, input_ids: Optional[torch.Tensor] = None, diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 262fc79f0d4039..f7af0f1ef5a48c 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -23,10 +23,15 @@ import torch import torch.utils.checkpoint +from packaging import version from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN +from ...modeling_attn_mask_utils import ( + 
_prepare_4d_attention_mask_for_sdpa, + _prepare_4d_causal_attention_mask_for_sdpa, +) from ...modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions, @@ -45,6 +50,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + get_torch_version, logging, replace_return_docstrings, ) @@ -350,6 +356,103 @@ def forward( return outputs +class BertSdpaSelfAttention(BertSelfAttention): + def __init__(self, config, position_embedding_type=None): + super().__init__(config, position_embedding_type=position_embedding_type) + self.dropout_prob = config.attention_probs_dropout_prob + self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0") + + # Adapted from BertSelfAttention + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None: + # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented. + logger.warning_once( + "BertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support " + "non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to " + "the manual attention implementation, but specifying the manual implementation will be required from " + "Transformers version v5.0.0 onwards. This warning can be removed using the argument " + '`attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + bsz, tgt_len, _ = hidden_states.size() + + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention + # mask needs to be such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + attention_mask = encoder_attention_mask if is_cross_attention else attention_mask + + # Check `seq_length` of `past_key_value` == `len(current_states)` to support prefix tuning + if is_cross_attention and past_key_value and past_key_value[0].shape[2] == current_states.shape[1]: + key_layer, value_layer = past_key_value + else: + key_layer = self.transpose_for_scores(self.key(current_states)) + value_layer = self.transpose_for_scores(self.value(current_states)) + if past_key_value is not None and not is_cross_attention: + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. 
+ # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom + # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0. + # Reference: https://github.com/pytorch/pytorch/issues/112577 + if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None: + query_layer = query_layer.contiguous() + key_layer = key_layer.contiguous() + value_layer = value_layer.contiguous() + + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal + # mask in case tgt_len == 1. + is_causal = self.is_decoder and attention_mask is None and tgt_len > 1 + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attn_mask=attention_mask, + dropout_p=self.dropout_prob if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size) + + outputs = (attn_output,) + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + class BertSelfOutput(nn.Module): def __init__(self, config): super().__init__() @@ -364,10 +467,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states +BERT_SELF_ATTENTION_CLASSES = { + "eager": BertSelfAttention, + "sdpa": BertSdpaSelfAttention, +} + + class BertAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = BertSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = BERT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = BertSelfOutput(config) self.pruned_heads = set() @@ -715,6 +826,7 @@ class BertPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_bert base_model_prefix = "bert" supports_gradient_checkpointing = True + _supports_sdpa = True def _init_weights(self, module): """Initialize the weights""" @@ -859,6 +971,9 @@ def __init__(self, config, add_pooling_layer=True): self.pooler = BertPooler(config) if add_pooling_layer else None + self.attn_implementation = config._attn_implementation + self.position_embedding_type = config.position_embedding_type + # Initialize weights and apply final processing self.post_init() @@ -945,9 +1060,6 @@ def forward( # past_key_values_length past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - if attention_mask is None: - attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) - if token_type_ids is None: if hasattr(self.embeddings, "token_type_ids"): buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] @@ -956,9 +1068,43 @@ def forward( else: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - # We can provide a self-attention mask 
of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is None: + attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device) + + use_sdpa_attention_masks = ( + self.attn_implementation == "sdpa" + and self.position_embedding_type == "absolute" + and head_mask is None + and not output_attentions + ) + + # Expand the attention mask + if use_sdpa_attention_masks: + # Expand the attention mask for SDPA. + # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] + if self.config.is_decoder: + extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + input_shape, + embedding_output, + past_key_values_length, + ) + else: + extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( + attention_mask, embedding_output.dtype, tgt_len=seq_length + ) + else: + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] @@ -967,7 +1113,15 @@ def forward( encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + + if use_sdpa_attention_masks: + # Expand the attention mask for SDPA. 
+ # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] + encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length + ) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None @@ -978,13 +1132,6 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - embedding_output = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, - ) encoder_outputs = self.encoder( embedding_output, attention_mask=extended_attention_mask, diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index b7250f6f7b926f..73c4d1d1e5da91 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -192,11 +192,18 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->BertGeneration +BERT_GENERATION_SELF_ATTENTION_CLASSES = { + "eager": BertGenerationSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->BertGeneration,BERT->BERT_GENERATION class BertGenerationAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = BertGenerationSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = BERT_GENERATION_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = BertGenerationSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index bcace39b299bcf..3fc9f755aab9e2 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -562,11 +562,18 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->BridgeTower +BRIDGE_TOWER_SELF_ATTENTION_CLASSES = { + "eager": BridgeTowerSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->BridgeTower,BERT->BRIDGE_TOWER class BridgeTowerAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = BridgeTowerSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = BRIDGE_TOWER_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = BridgeTowerSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py index 26250896b23d8a..f399fb3f5cfb9b 100644 --- a/src/transformers/models/camembert/modeling_camembert.py +++ b/src/transformers/models/camembert/modeling_camembert.py @@ -312,11 +312,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with 
Roberta->Camembert +CAMEMBERT_SELF_ATTENTION_CLASSES = { + "eager": CamembertSelfAttention, +} + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->Camembert,ROBERTA->CAMEMBERT class CamembertAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = CamembertSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = CAMEMBERT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = CamembertSelfOutput(config) self.pruned_heads = set() @@ -745,7 +752,7 @@ class CamembertModel(CamembertPreTrainedModel): _no_split_modules = [] - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Camembert + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->Camembert def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -778,7 +785,7 @@ class PreTrainedModel output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC, ) - # Copied from transformers.models.bert.modeling_bert.BertModel.forward + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward def forward( self, input_ids: Optional[torch.Tensor] = None, diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index d8e97c20b24cd0..87a1baa217baf1 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -354,11 +354,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->ChineseCLIPText +CHINESE_CLIP_TEXT_SELF_ATTENTION_CLASSES = { + "eager": ChineseCLIPTextSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->ChineseCLIPText,BERT->CHINESE_CLIP_TEXT class ChineseCLIPTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = ChineseCLIPTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = CHINESE_CLIP_TEXT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = ChineseCLIPTextSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 7b20b30137d2cb..c21e173133a17f 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1376,11 +1376,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->ClapText +CLAP_TEXT_SELF_ATTENTION_CLASSES = { + "eager": ClapTextSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->ClapText,BERT->CLAP_TEXT class ClapTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = ClapTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = CLAP_TEXT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) 
self.output = ClapTextSelfOutput(config) self.pruned_heads = set() @@ -1763,7 +1770,6 @@ class ClapTextModel(ClapPreTrainedModel): config_class = ClapTextConfig - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->ClapText def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -1782,7 +1788,6 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value - # Copied from transformers.models.bert.modeling_bert.BertModel.forward def forward( self, input_ids: Optional[torch.Tensor] = None, diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 7dcc53e2cc15c8..20e1e1eca5ffab 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -298,11 +298,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Data2VecText +DATA2VEC_TEXT_SELF_ATTENTION_CLASSES = { + "eager": Data2VecTextSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Data2VecText,BERT->DATA2VEC_TEXT class Data2VecTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = Data2VecTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = DATA2VEC_TEXT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = Data2VecTextSelfOutput(config) self.pruned_heads = set() @@ -727,7 +734,7 @@ class PreTrainedModel output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC, ) - # Copied from transformers.models.bert.modeling_bert.BertModel.forward + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward def forward( self, input_ids: Optional[torch.Tensor] = None, diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index 2138aa97c6dca9..6fbdda2579c1a4 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -355,11 +355,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Electra +ELECTRA_SELF_ATTENTION_CLASSES = { + "eager": ElectraSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Electra,BERT->ELECTRA class ElectraAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = ElectraSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = ELECTRA_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = ElectraSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index a65f453205d5c5..3db6501985604d 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -285,11 +285,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to 
return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Ernie +ERNIE_SELF_ATTENTION_CLASSES = { + "eager": ErnieSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Ernie,BERT->ERNIE class ErnieAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = ErnieSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = ERNIE_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = ErnieSelfOutput(config) self.pruned_heads = set() @@ -787,7 +794,7 @@ class ErnieModel(ErniePreTrainedModel): `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. """ - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Ernie + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->Ernie def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index c8953d498428ea..12821609f037bf 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -267,11 +267,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states +GIT_SELF_ATTENTION_CLASSES = { + "eager": GitSelfAttention, +} + + class GitAttention(nn.Module): - # Copied from transformers.models.bert.modeling_bert.BertAttention.__init__ with Bert->Git + # Copied from transformers.models.bert.modeling_bert.BertAttention.__init__ with Bert->Git,BERT->GIT def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = GitSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = GIT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = GitSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index c570fdb124adc1..6914f5ee3efb62 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -276,11 +276,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->LayoutLM +LAYOUTLM_SELF_ATTENTION_CLASSES = { + "eager": LayoutLMSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->LayoutLM,BERT->LAYOUTLM class LayoutLMAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = LayoutLMSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = LAYOUTLM_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = LayoutLMSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index 2058ce27951676..318110daf5d8d1 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -468,11 
+468,18 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->MarkupLM +MARKUPLM_SELF_ATTENTION_CLASSES = { + "eager": MarkupLMSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->MarkupLM,BERT->MARKUPLM class MarkupLMAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = MarkupLMSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = MARKUPLM_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = MarkupLMSelfOutput(config) self.pruned_heads = set() @@ -797,7 +804,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P MARKUPLM_START_DOCSTRING, ) class MarkupLMModel(MarkupLMPreTrainedModel): - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->MarkupLM + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->MarkupLM def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 86f28942893399..adec5647a28134 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -368,11 +368,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Realm +REALM_SELF_ATTENTION_CLASSES = { + "eager": RealmSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Realm,BERT->REALM class RealmAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = RealmSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = REALM_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = RealmSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index e1f15722e43bdf..640139212081ca 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -294,11 +294,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Roberta +ROBERTA_SELF_ATTENTION_CLASSES = { + "eager": RobertaSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Roberta,BERT->ROBERTA class RobertaAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = RobertaSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = ROBERTA_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = RobertaSelfOutput(config) self.pruned_heads = set() @@ -688,7 +695,7 @@ class RobertaModel(RobertaPreTrainedModel): """ - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with 
ClapText->Roberta def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -721,7 +728,7 @@ class PreTrainedModel output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC, ) - # Copied from transformers.models.bert.modeling_bert.BertModel.forward + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward def forward( self, input_ids: Optional[torch.Tensor] = None, diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index 51850c9af1d5c0..739e60b550baf3 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -431,11 +431,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->RoCBert +ROC_BERT_SELF_ATTENTION_CLASSES = { + "eager": RoCBertSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->RoCBert,BERT->ROC_BERT class RoCBertAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = RoCBertSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = ROC_BERT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = RoCBertSelfOutput(config) self.pruned_heads = set() @@ -759,7 +766,6 @@ def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: return prediction_scores -# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel with Bert->RoCBert,bert->roc_bert class RoCBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -880,7 +886,7 @@ class RoCBertModel(RoCBertPreTrainedModel): `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. 
""" - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->RoCBert + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->RoCBert def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index b643601d0ebd49..fa546e1201346a 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -245,11 +245,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Splinter +SPLINTER_SELF_ATTENTION_CLASSES = { + "eager": SplinterSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Splinter,BERT->SPLINTER class SplinterAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = SplinterSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = SPLINTER_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = SplinterSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index 0d829aaee63582..48c6898811d1e0 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -295,11 +295,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->XLMRoberta +XLM_ROBERTA_SELF_ATTENTION_CLASSES = { + "eager": XLMRobertaSelfAttention, +} + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->XLMRoberta,ROBERTA->XLM_ROBERTA class XLMRobertaAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = XLMRobertaSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = XLM_ROBERTA_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = XLMRobertaSelfOutput(config) self.pruned_heads = set() @@ -690,7 +697,7 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel): """ - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->XLMRoberta + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->XLMRoberta def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -723,7 +730,7 @@ class PreTrainedModel output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC, ) - # Copied from transformers.models.bert.modeling_bert.BertModel.forward + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward def forward( self, input_ids: Optional[torch.Tensor] = None, diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index 1c17652dfa0cb4..d8994e335b1242 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ 
b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -664,7 +664,7 @@ class XLMRobertaXLModel(XLMRobertaXLPreTrainedModel): an input to the forward pass. .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762 """ - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->XLMRobertaXL + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->XLMRobertaXL def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -697,7 +697,7 @@ class PreTrainedModel output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC, ) - # Copied from transformers.models.bert.modeling_bert.BertModel.forward + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward def forward( self, input_ids: Optional[torch.Tensor] = None, diff --git a/src/transformers/models/xmod/modeling_xmod.py b/src/transformers/models/xmod/modeling_xmod.py index 2bf76a40d46974..32e34ef6683817 100644 --- a/src/transformers/models/xmod/modeling_xmod.py +++ b/src/transformers/models/xmod/modeling_xmod.py @@ -783,7 +783,7 @@ class XmodModel(XmodPreTrainedModel): """ - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Xmod + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->Xmod def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index bdc812ff27657b..ff9a62802048e9 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -18,7 +18,14 @@ from transformers import BertConfig, is_torch_available from transformers.models.auto import get_values -from transformers.testing_utils import CaptureLogger, require_torch, require_torch_accelerator, slow, torch_device +from transformers.testing_utils import ( + CaptureLogger, + require_torch, + require_torch_accelerator, + require_torch_sdpa, + slow, + torch_device, +) from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -621,6 +628,79 @@ def test_torchscript_device_change(self): loaded = torch.jit.load(os.path.join(tmp, "bert.pt"), map_location=torch_device) loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device)) + # This test was copied from the common test_eager_matches_sdpa_generate(), but without low_cpu_mem_usage=True. + # TODO: Remove this and use the parent method (in common tests) once BERT supports low_cpu_mem_usage=True. 
+ @require_torch_sdpa + @slow + def test_eager_matches_sdpa_generate(self): + max_new_tokens = 30 + + if len(self.all_generative_model_classes) == 0: + self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test") + + for model_class in self.all_generative_model_classes: + if not model_class._supports_sdpa: + self.skipTest(f"{model_class.__name__} does not support SDPA") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + dummy_input = inputs_dict[model_class.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + + model_sdpa = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + # low_cpu_mem_usage=True, + ).to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + # low_cpu_mem_usage=True, + attn_implementation="eager", + ).to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa: + raise ValueError("The SDPA model should have SDPA attention layers") + + # Just test that a large cache works as expected + res_eager = model_eager.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False + ) + + res_sdpa = model_sdpa.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False + ) + + self.assertTrue(torch.allclose(res_eager, res_sdpa)) + @require_torch class BertModelIntegrationTest(unittest.TestCase): diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 1c099a4035b440..061c0000ceedfe 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3603,12 +3603,14 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): self.assertTrue(model_eager.config._attn_implementation == "eager") for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: raise ValueError("The eager model should not have SDPA attention layers") has_sdpa = False for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: has_sdpa = True break if not has_sdpa and model_sdpa.config.model_type != "falcon": @@ -3691,19 +3693,21 @@ def get_mean_reldiff(failcase, x, ref, 
atol, rtol): decoder_input_ids = decoder_input_ids.to(torch_device) # TODO: never an `attention_mask` arg here? - other_inputs = { + processed_inputs = { + model.main_input_name: dummy_input, "decoder_input_ids": decoder_input_ids, "decoder_attention_mask": dummy_attention_mask, "output_hidden_states": True, } else: - other_inputs = { + processed_inputs = { + model.main_input_name: dummy_input, "output_hidden_states": True, } # Otherwise fails for e.g. WhisperEncoderModel if "attention_mask" in inspect.signature(model_eager.forward).parameters: - other_inputs["attention_mask"] = dummy_attention_mask + processed_inputs["attention_mask"] = dummy_attention_mask # TODO: test gradients as well (& for FA2 as well!) with torch.no_grad(): @@ -3712,8 +3716,9 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): enable_math=True, enable_mem_efficient=enable_kernels, ): - outputs_eager = model_eager(dummy_input, **other_inputs) - outputs_sdpa = model_sdpa(dummy_input, **other_inputs) + prepared_inputs = self._prepare_for_class(processed_inputs, model_class) + outputs_eager = model_eager(**prepared_inputs) + outputs_sdpa = model_sdpa(**prepared_inputs) logits_eager = ( outputs_eager.hidden_states[-1] @@ -3799,6 +3804,7 @@ def test_sdpa_can_dispatch_on_flash(self): self.skipTest(f"{model_class.__name__} does not support SDPA") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + inputs_dict = self._prepare_for_class(inputs_dict, model_class) if config.model_type in ["llava", "llava_next", "vipllava"]: self.skipTest("Llava-like models currently (transformers==4.39.1) requires an attention_mask input") if config.model_type in ["idefics"]: @@ -3867,12 +3873,14 @@ def test_eager_matches_sdpa_generate(self): self.assertTrue(model_eager.config._attn_implementation == "eager") for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: raise ValueError("The eager model should not have SDPA attention layers") has_sdpa = False for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: has_sdpa = True break if not has_sdpa: From 77ff304d290d17c9cca4aa2f03330489812e423d Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Fri, 26 Apr 2024 11:52:09 -0400 Subject: [PATCH 03/18] Remove skipping logic now that set_epoch exists (#30501) * Remove skipping logic now that set_epoch exists * Working version, clean --- src/transformers/trainer.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 26ab0877d0c171..1d3c164984ea1c 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -96,7 +96,6 @@ distributed_broadcast_scalars, distributed_concat, find_batch_size, - get_dataloader_sampler, get_model_param_count, get_module_class_from_name, get_parameter_names, @@ -2137,24 +2136,6 @@ def _inner_training_loop( self.control = self.callback_handler.on_train_begin(args, self.state, self.control) - # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. 
- if not args.ignore_data_skip: - for epoch in range(epochs_trained): - sampler = get_dataloader_sampler(train_dataloader) - sampler_kinds = [RandomSampler] - if version.parse(accelerate_version) > version.parse("0.23.0"): - sampler_kinds.append(SeedableRandomSampler) - is_random_sampler = isinstance(sampler, tuple(sampler_kinds)) - if not is_random_sampler: - # We just need to begin an iteration to create the randomization of the sampler. - for _ in train_dataloader: - break - else: - # Otherwise we need to call the whooooole sampler cause there is some random operation added - # AT THE VERY END! - sampler = sampler if sampler is not None else [] - _ = list(sampler) - total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): epoch_iterator = train_dataloader From aafa7ce72b65c730788c122a72a974e464409e9a Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:55:24 +0100 Subject: [PATCH 04/18] [`DETR`] Remove timm hardcoded logic in modeling files (#29038) * Enable instantiating model with pretrained backbone weights * Clarify pretrained import * Use load_backbone instead * Add backbone_kwargs to config * Fix up * Add tests * Tidy up * Enable instantiating model with pretrained backbone weights * Update tests so backbone checkpoint isn't passed in * Clarify pretrained import * Update configs - docs and validation check * Update src/transformers/utils/backbone_utils.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Clarify exception message * Update config init in tests * Add test for when use_timm_backbone=True * Use load_backbone instead * Add use_timm_backbone to the model configs * Add backbone_kwargs to config * Pass kwargs to constructors * Draft * Fix tests * Add back timm - weight naming * More tidying up * Whoops * Tidy up * Handle when kwargs are none * Update tests * Revert test changes * Deformable detr test - don't use default * Don't mutate; correct model attributes * Add some clarifying comments * nit - grammar is hard --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- .../configuration_conditional_detr.py | 14 ++++-- .../modeling_conditional_detr.py | 18 ++++--- .../configuration_deformable_detr.py | 12 ++++- .../modeling_deformable_detr.py | 49 ++++++++++++------- .../models/detr/configuration_detr.py | 14 +++++- src/transformers/models/detr/modeling_detr.py | 16 ++++-- src/transformers/models/dpt/modeling_dpt.py | 6 +-- .../configuration_table_transformer.py | 14 +++++- .../modeling_table_transformer.py | 14 ++++-- .../timm_backbone/modeling_timm_backbone.py | 7 ++- .../test_modeling_conditional_detr.py | 10 ++++ .../test_modeling_deformable_detr.py | 11 ++++- tests/models/detr/test_modeling_detr.py | 11 +++++ .../test_modeling_table_transformer.py | 8 +++ 14 files changed, 156 insertions(+), 48 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 945e5edb32ad30..4f95de3582f082 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -192,10 +192,16 @@ def __init__( if backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") - if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: - 
raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - - if not use_timm_backbone: + # We default to values which were previously hard-coded in the model. This enables configurability of the config + # while keeping the default behavior the same. + if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [1, 2, 3, 4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): if backbone_config is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index d8ff371fad77d1..d723d3866ea416 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -338,12 +338,12 @@ def replace_batch_norm(model): replace_batch_norm(module) -# Copied from transformers.models.detr.modeling_detr.DetrConvEncoder +# Copied from transformers.models.detr.modeling_detr.DetrConvEncoder with Detr->ConditionalDetr class ConditionalDetrConvEncoder(nn.Module): """ Convolutional backbone, using either the AutoBackbone API or one from the timm library. - nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above. + nn.BatchNorm2d layers are replaced by ConditionalDetrFrozenBatchNorm2d as defined above. """ @@ -352,17 +352,23 @@ def __init__(self, config): self.config = config + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. requires_backends(self, ["timm"]) - kwargs = {} + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) + num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: - kwargs["output_stride"] = 16 + kwargs["output_stride"] = kwargs.get("output_stride", 16) backbone = create_model( config.backbone, pretrained=config.use_pretrained_backbone, features_only=True, - out_indices=(1, 2, 3, 4), - in_chans=config.num_channels, + out_indices=out_indices, + in_chans=num_channels, **kwargs, ) else: diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index 6d32f6220df586..3f3ffff69ff2e9 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -212,7 +212,16 @@ def __init__( if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - if not use_timm_backbone: + # We default to values which were previously hard-coded in the model. This enables configurability of the config + # while keeping the default behavior the same. 
+ if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [2, 3, 4] if num_feature_levels > 1 else [4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): if backbone_config is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) @@ -220,6 +229,7 @@ def __init__( backbone_model_type = backbone_config.get("model_type") config_class = CONFIG_MAPPING[backbone_model_type] backbone_config = config_class.from_dict(backbone_config) + self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config self.num_channels = num_channels diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index c0ac7cffc7ab44..7b2bbb9b1242c9 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -88,11 +88,31 @@ def load_cuda_kernels(): if is_vision_available(): from transformers.image_transforms import center_to_corners_format + if is_accelerate_available(): from accelerate import PartialState from accelerate.utils import reduce +if is_timm_available(): + from timm import create_model + + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "DeformableDetrConfig" +_CHECKPOINT_FOR_DOC = "sensetime/deformable-detr" + +DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "sensetime/deformable-detr", + # See all Deformable DETR models at https://huggingface.co/models?filter=deformable-detr +] + + class MultiScaleDeformableAttentionFunction(Function): @staticmethod def forward( @@ -141,21 +161,6 @@ def backward(context, grad_output): return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None -if is_scipy_available(): - from scipy.optimize import linear_sum_assignment - -if is_timm_available(): - from timm import create_model - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "DeformableDetrConfig" -_CHECKPOINT_FOR_DOC = "sensetime/deformable-detr" - - -from ..deprecated._archive_maps import DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST # noqa: F401, E402 - - @dataclass class DeformableDetrDecoderOutput(ModelOutput): """ @@ -420,17 +425,23 @@ def __init__(self, config): self.config = config + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. 
requires_backends(self, ["timm"]) - kwargs = {} + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (2, 3, 4) if config.num_feature_levels > 1 else (4,)) + num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: - kwargs["output_stride"] = 16 + kwargs["output_stride"] = kwargs.get("output_stride", 16) backbone = create_model( config.backbone, pretrained=config.use_pretrained_backbone, features_only=True, - out_indices=(2, 3, 4) if config.num_feature_levels > 1 else (4,), - in_chans=config.num_channels, + out_indices=out_indices, + in_chans=num_channels, **kwargs, ) else: diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index 9b9b5afacd0b7f..db180ef1d41fed 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -193,7 +193,16 @@ def __init__( if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - if not use_timm_backbone: + # We default to values which were previously hard-coded in the model. This enables configurability of the config + # while keeping the default behavior the same. + if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [1, 2, 3, 4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): if backbone_config is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) @@ -201,8 +210,9 @@ def __init__( backbone_model_type = backbone_config.get("model_type") config_class = CONFIG_MAPPING[backbone_model_type] backbone_config = config_class.from_dict(backbone_config) + backbone = None # set timm attributes to None - dilation, backbone, use_pretrained_backbone = None, None, None + dilation = None self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index d7fcdfc5bc7e83..0da702db8b67e2 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -49,9 +49,11 @@ if is_scipy_available(): from scipy.optimize import linear_sum_assignment + if is_timm_available(): from timm import create_model + if is_vision_available(): from transformers.image_transforms import center_to_corners_format @@ -345,17 +347,23 @@ def __init__(self, config): self.config = config + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. 
requires_backends(self, ["timm"]) - kwargs = {} + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) + num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: - kwargs["output_stride"] = 16 + kwargs["output_stride"] = kwargs.get("output_stride", 16) backbone = create_model( config.backbone, pretrained=config.use_pretrained_backbone, features_only=True, - out_indices=(1, 2, 3, 4), - in_chans=config.num_channels, + out_indices=out_indices, + in_chans=num_channels, **kwargs, ) else: diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index aad3330279f051..ef6c8bb853abda 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -1075,10 +1075,10 @@ def __init__(self, config): super().__init__(config) self.backbone = None - if config.backbone_config is not None and config.is_hybrid is False: - self.backbone = load_backbone(config) - else: + if config.is_hybrid or config.backbone_config is None: self.dpt = DPTModel(config, add_pooling_layer=False) + else: + self.backbone = load_backbone(config) # Neck self.neck = DPTNeck(config) diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index 9a2ff6bbab3b24..4963396024a57e 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -193,7 +193,16 @@ def __init__( if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - if not use_timm_backbone: + # We default to values which were previously hard-coded in the model. This enables configurability of the config + # while keeping the default behavior the same. + if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [1, 2, 3, 4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): if backbone_config is None: logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) @@ -201,8 +210,9 @@ def __init__( backbone_model_type = backbone_config.get("model_type") config_class = CONFIG_MAPPING[backbone_model_type] backbone_config = config_class.from_dict(backbone_config) + backbone = None # set timm attributes to None - dilation, backbone, use_pretrained_backbone = None, None, None + dilation = None self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 8e577a65a5fe00..9a684ee121ddca 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -279,17 +279,23 @@ def __init__(self, config): self.config = config + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. requires_backends(self, ["timm"]) - kwargs = {} + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) + num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: - kwargs["output_stride"] = 16 + kwargs["output_stride"] = kwargs.get("output_stride", 16) backbone = create_model( config.backbone, pretrained=config.use_pretrained_backbone, features_only=True, - out_indices=(1, 2, 3, 4), - in_chans=config.num_channels, + out_indices=out_indices, + in_chans=num_channels, **kwargs, ) else: diff --git a/src/transformers/models/timm_backbone/modeling_timm_backbone.py b/src/transformers/models/timm_backbone/modeling_timm_backbone.py index 0c6fe67b75731f..e8e0b28e042d6f 100644 --- a/src/transformers/models/timm_backbone/modeling_timm_backbone.py +++ b/src/transformers/models/timm_backbone/modeling_timm_backbone.py @@ -63,12 +63,13 @@ def __init__(self, config, **kwargs): # We just take the final layer by default. This matches the default for the transformers models. out_indices = config.out_indices if getattr(config, "out_indices", None) is not None else (-1,) + in_chans = kwargs.pop("in_chans", config.num_channels) self._backbone = timm.create_model( config.backbone, pretrained=pretrained, # This is currently not possible for transformer architectures. features_only=config.features_only, - in_chans=config.num_channels, + in_chans=in_chans, out_indices=out_indices, **kwargs, ) @@ -79,7 +80,9 @@ def __init__(self, config, **kwargs): # These are used to control the output of the model when called. If output_hidden_states is True, then # return_layers is modified to include all layers. 
- self._return_layers = self._backbone.return_layers + self._return_layers = { + layer["module"]: str(layer["index"]) for layer in self._backbone.feature_info.get_dicts() + } self._all_layers = {layer["module"]: str(i) for i, layer in enumerate(self._backbone.feature_info.info)} super()._init_backbone(config) diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index d1152ed8622b9c..c3f77614b4dd31 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -444,7 +444,9 @@ def test_different_timm_backbone(self): # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" + config.backbone_config = None config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} for model_class in self.all_model_classes: model = model_class(config) @@ -460,6 +462,14 @@ def test_different_timm_backbone(self): self.model_tester.num_labels, ) self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + elif model_class.__name__ == "ConditionalDetrForSegmentation": + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index 7a83c4f1ed80a8..36be099790a45b 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -521,8 +521,9 @@ def test_different_timm_backbone(self): # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" - config.use_timm_backbone = True config.backbone_config = None + config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [1, 2, 3, 4]} for model_class in self.all_model_classes: model = model_class(config) @@ -538,6 +539,14 @@ def test_different_timm_backbone(self): self.model_tester.num_labels, ) self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 4) + elif model_class.__name__ == "ConditionalDetrForSegmentation": + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.deformable_detr.model.backbone.conv_encoder.intermediate_channel_sizes), 4) + else: + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 4) self.assertTrue(outputs) diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index 59b071e031aa8a..27092c626dd46d 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -444,6 +444,9 @@ def test_different_timm_backbone(self): # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" + config.backbone_config = None + config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} for model_class in self.all_model_classes: model = 
model_class(config) @@ -459,6 +462,14 @@ def test_different_timm_backbone(self): self.model_tester.num_labels + 1, ) self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + elif model_class.__name__ == "DetrForSegmentation": + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index 79da1d191063ab..d323083eb7f1d4 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -456,6 +456,9 @@ def test_different_timm_backbone(self): # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" + config.backbone_config = None + config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} for model_class in self.all_model_classes: model = model_class(config) @@ -471,6 +474,11 @@ def test_different_timm_backbone(self): self.model_tester.num_labels + 1, ) self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) From 38b53da38af231b0af967d15ca29c52470e402d5 Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Fri, 26 Apr 2024 17:06:03 +0100 Subject: [PATCH 05/18] [examples] update whisper fine-tuning (#29938) * [examples] update whisper fine-tuning * deprecate forced/suppress tokens * item assignment * update readme * final fix --- examples/pytorch/speech-recognition/README.md | 9 ++-- .../run_speech_recognition_seq2seq.py | 45 ++++++++++++++----- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/examples/pytorch/speech-recognition/README.md b/examples/pytorch/speech-recognition/README.md index b9cab9513bd446..4990219f42a143 100644 --- a/examples/pytorch/speech-recognition/README.md +++ b/examples/pytorch/speech-recognition/README.md @@ -368,6 +368,7 @@ python run_speech_recognition_seq2seq.py \ --dataset_name="mozilla-foundation/common_voice_11_0" \ --dataset_config_name="hi" \ --language="hindi" \ + --task="transcribe" \ --train_split_name="train+validation" \ --eval_split_name="test" \ --max_steps="5000" \ @@ -384,12 +385,10 @@ python run_speech_recognition_seq2seq.py \ --save_steps="1000" \ --generation_max_length="225" \ --preprocessing_num_workers="16" \ - --length_column_name="input_length" \ --max_duration_in_seconds="30" \ --text_column_name="sentence" \ --freeze_feature_encoder="False" \ --gradient_checkpointing \ - --group_by_length \ --fp16 \ --overwrite_output_dir \ --do_train \ @@ -399,7 +398,8 @@ python run_speech_recognition_seq2seq.py \ ``` On a single V100, training should take approximately 8 hours, with a final cross-entropy loss of **1e-4** and word error rate of **32.6%**. 
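The command above relies on the reworked `language`/`task` handling rather than the deprecated `forced_decoder_ids`/`suppress_tokens` model arguments. A minimal sketch of what the updated script now does internally for a multilingual checkpoint (the checkpoint name is only an example, not part of this patch):

```python
from transformers import WhisperForConditionalGeneration, WhisperTokenizer

# Any multilingual Whisper checkpoint works here; "openai/whisper-small" is illustrative.
model_id = "openai/whisper-small"
tokenizer = WhisperTokenizer.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id)

# Instead of writing forced_decoder_ids / suppress_tokens into the model config,
# the language and task are set on the tokenizer prefix tokens and on the
# generation config, mirroring run_speech_recognition_seq2seq.py.
tokenizer.set_prefix_tokens(language="hindi", task="transcribe")
model.generation_config.update(language="hindi", task="transcribe")
```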
-If training on a different language, you should be sure to change the `language` argument. The `language` argument should be omitted for English speech recognition. +If training on a different language, you should be sure to change the `language` argument. The `language` and `task` +arguments should be omitted for English speech recognition. #### Multi GPU Whisper Training The following example shows how to fine-tune the [Whisper small](https://huggingface.co/openai/whisper-small) checkpoint on the Hindi subset of [Common Voice 11](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0) using 2 GPU devices in half-precision: @@ -410,6 +410,7 @@ torchrun \ --dataset_name="mozilla-foundation/common_voice_11_0" \ --dataset_config_name="hi" \ --language="hindi" \ + --task="transcribe" \ --train_split_name="train+validation" \ --eval_split_name="test" \ --max_steps="5000" \ @@ -425,12 +426,10 @@ torchrun \ --save_steps="1000" \ --generation_max_length="225" \ --preprocessing_num_workers="16" \ - --length_column_name="input_length" \ --max_duration_in_seconds="30" \ --text_column_name="sentence" \ --freeze_feature_encoder="False" \ --gradient_checkpointing \ - --group_by_length \ --fp16 \ --overwrite_output_dir \ --do_train \ diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 3a596e2cb7bddd..f352954d80aefa 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -119,17 +119,16 @@ class ModelArguments: ) forced_decoder_ids: List[List[int]] = field( default=None, - metadata={ + metadata={"help": "Deprecated. Please use the `language` and `task` arguments instead."}, + ) + suppress_tokens: List[int] = field( + default=None, metadata={ "help": ( - "A list of pairs of integers which indicates a mapping from generation indices to token indices " - "that will be forced before sampling. For example, [[0, 123]] means the first generated token " - "will always be a token of index 123." + "Deprecated. The use of `suppress_tokens` should not be required for the majority of fine-tuning examples." + "Should you need to use `suppress_tokens`, please manually update them in the fine-tuning script directly." ) }, ) - suppress_tokens: List[int] = field( - default=None, metadata={"help": "A list of tokens that will be suppressed at generation."} - ) apply_spec_augment: bool = field( default=False, metadata={ @@ -400,8 +399,6 @@ def main(): trust_remote_code=model_args.trust_remote_code, ) - config.update({"forced_decoder_ids": model_args.forced_decoder_ids, "suppress_tokens": model_args.suppress_tokens}) - # SpecAugment for whisper models if getattr(config, "model_type", None) == "whisper": config.update({"apply_spec_augment": model_args.apply_spec_augment}) @@ -440,9 +437,35 @@ def main(): model.freeze_encoder() model.model.encoder.gradient_checkpointing = False - if data_args.language is not None: - # We only need to set the task id when the language is specified (i.e. 
in a multilingual setting) + if hasattr(model.generation_config, "is_multilingual") and model.generation_config.is_multilingual: + # We only need to set the language and task ids in a multilingual setting tokenizer.set_prefix_tokens(language=data_args.language, task=data_args.task) + model.generation_config.update( + **{ + "language": data_args.language, + "task": data_args.task, + } + ) + elif data_args.language is not None: + raise ValueError( + "Setting language token for an English-only checkpoint is not permitted. The language argument should " + "only be set for multilingual checkpoints." + ) + + # TODO (Sanchit): deprecate these arguments in v4.41 + if model_args.forced_decoder_ids is not None: + logger.warning( + "The use of `forced_decoder_ids` is deprecated and will be removed in v4.41." + "Please use the `language` and `task` arguments instead" + ) + model.generation_config.forced_decoder_ids = model_args.forced_decoder_ids + + if model_args.suppress_tokens is not None: + logger.warning( + "The use of `suppress_tokens` is deprecated and will be removed in v4.41." + "Should you need `suppress_tokens`, please manually set them in the fine-tuning script." + ) + model.generation_config.suppress_tokens = model_args.suppress_tokens # 6. Resample speech dataset if necessary dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate From e7d52a10d721f4475c810d403b1e71689d4b94b9 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 26 Apr 2024 18:04:41 +0100 Subject: [PATCH 06/18] Fix GroundingDINO, DPR after BERT SDPA update (#30506) Fix GroundingDINO, DPR after BET SDPA update --- docs/source/en/perf_infer_gpu_one.md | 1 + src/transformers/models/dpr/modeling_dpr.py | 2 ++ .../models/grounding_dino/modeling_grounding_dino.py | 4 +++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 64583e4badf044..de49d4427b5687 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -194,6 +194,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [Bert](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel) * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) +* [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader) * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel) * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) * [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel) diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py index 0a45ec75207c29..928f2b93118ac3 100644 --- a/src/transformers/models/dpr/modeling_dpr.py +++ b/src/transformers/models/dpr/modeling_dpr.py @@ -142,6 +142,8 @@ class DPRReaderOutput(ModelOutput): class DPRPreTrainedModel(PreTrainedModel): + _supports_sdpa = True + def _init_weights(self, module): """Initialize the weights""" if isinstance(module, nn.Linear): diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 83009c92504211..da8dd29a5cb54d 100644 --- 
a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2113,7 +2113,9 @@ def __init__(self, config: GroundingDinoConfig): ) # Create text backbone - self.text_backbone = AutoModel.from_config(config.text_config, add_pooling_layer=False) + self.text_backbone = AutoModel.from_config( + config.text_config, add_pooling_layer=False, attn_implementation=config._attn_implementation + ) self.text_projection = nn.Linear(config.text_config.hidden_size, config.d_model) if config.embedding_init_target or not config.two_stage: From c793b26f2ec9d72970457a1d78372d13bd7b8207 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 26 Apr 2024 18:21:47 +0100 Subject: [PATCH 07/18] load_image - decode b64encode and encodebytes strings (#30192) * Decode b64encode and encodebytes strings * Remove conditional encode -- image is always a string --- src/transformers/image_utils.py | 2 +- tests/utils/test_image_utils.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index e4a55b3455a344..7d71fc982b4390 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -320,7 +320,7 @@ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = # Try to load as base64 try: - b64 = base64.b64decode(image, validate=True) + b64 = base64.decodebytes(image.encode()) image = PIL.Image.open(BytesIO(b64)) except Exception as e: raise ValueError( diff --git a/tests/utils/test_image_utils.py b/tests/utils/test_image_utils.py index d6bc9a37585899..f360c4bb82532d 100644 --- a/tests/utils/test_image_utils.py +++ b/tests/utils/test_image_utils.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
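To see what the switch from `b64decode` to `decodebytes` buys in practice, a small illustrative sketch (the in-memory PNG below is a dummy created only for the demo; `load_image` lives in `transformers.image_utils`):

```python
import base64
from io import BytesIO

from PIL import Image

from transformers.image_utils import load_image

# Build a tiny dummy PNG in memory and base64-encode it two ways.
buffer = BytesIO()
Image.new("RGB", (2, 2), color="red").save(buffer, format="PNG")
raw = buffer.getvalue()

compact = base64.b64encode(raw).decode()    # single-line output, accepted as before
wrapped = base64.encodebytes(raw).decode()  # newline-wrapped output, now also accepted

assert load_image(compact).size == (2, 2)
assert load_image(wrapped).size == (2, 2)
```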
+import codecs import os import tempfile import unittest @@ -544,6 +545,23 @@ def test_load_img_base64(self): self.assertEqual(img_arr.shape, (64, 32, 3)) + def test_load_img_base64_encoded_bytes(self): + try: + tmp_file = tempfile.mktemp() + with open(tmp_file, "wb") as f: + http_get( + "https://huggingface.co/datasets/hf-internal-testing/dummy-base64-images/raw/main/image_2.txt", f + ) + + with codecs.open(tmp_file, encoding="unicode_escape") as b64: + img = load_image(b64.read()) + img_arr = np.array(img) + + finally: + os.remove(tmp_file) + + self.assertEqual(img_arr.shape, (256, 256, 3)) + def test_load_img_rgba(self): # we use revision="refs/pr/1" until the PR is merged # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1 From 6d4cabda2614d86357092585b416c4d08be73382 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Fri, 26 Apr 2024 20:40:12 +0200 Subject: [PATCH 08/18] [SegGPT] Fix seggpt image processor (#29550) * Fixed SegGptImageProcessor to handle 2D and 3D prompt mask inputs * Added new test to check prompt mask equivalence * New proposal * Better proposal * Removed unnecessary method * Updated seggpt docs * Introduced do_convert_rgb * nits --- docs/source/en/model_doc/seggpt.md | 5 +- .../models/seggpt/image_processing_seggpt.py | 103 ++++++++---------- .../seggpt/test_image_processing_seggpt.py | 83 +++++++++++++- tests/models/seggpt/test_modeling_seggpt.py | 22 +++- 4 files changed, 148 insertions(+), 65 deletions(-) diff --git a/docs/source/en/model_doc/seggpt.md b/docs/source/en/model_doc/seggpt.md index f821fc14a08c54..5a68d38fc98b6c 100644 --- a/docs/source/en/model_doc/seggpt.md +++ b/docs/source/en/model_doc/seggpt.md @@ -26,7 +26,8 @@ The abstract from the paper is the following: Tips: - One can use [`SegGptImageProcessor`] to prepare image input, prompt and mask to the model. -- It's highly advisable to pass `num_labels` (not considering background) during preprocessing and postprocessing with [`SegGptImageProcessor`] for your use case. +- One can either use segmentation maps or RGB images as prompt masks. If using the latter make sure to set `do_convert_rgb=False` in the `preprocess` method. +- It's highly advisable to pass `num_labels` when using `segmetantion_maps` (not considering background) during preprocessing and postprocessing with [`SegGptImageProcessor`] for your use case. - When doing inference with [`SegGptForImageSegmentation`] if your `batch_size` is greater than 1 you can use feature ensemble across your images by passing `feature_ensemble=True` in the forward method. 
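To make the two accepted prompt-mask formats from the tips above concrete, a short illustrative sketch (dummy arrays stand in for real masks; shapes follow the processor's default 448x448 size):

```python
import numpy as np

from transformers import SegGptImageProcessor

processor = SegGptImageProcessor.from_pretrained("BAAI/seggpt-vit-large")

# 1) A plain segmentation map (height, width) of class indices: pass num_labels so the
#    processor builds a palette and expands the map to 3 channels.
seg_map = np.zeros((448, 448), dtype=np.int64)
inputs_a = processor(images=None, prompt_masks=seg_map, num_labels=10, return_tensors="pt")

# 2) A prompt mask that is already an RGB image (3, height, width): skip the conversion.
rgb_mask = np.zeros((3, 448, 448), dtype=np.float32)
inputs_b = processor(images=None, prompt_masks=rgb_mask, do_convert_rgb=False, return_tensors="pt")

print(inputs_a["prompt_masks"].shape)  # torch.Size([1, 3, 448, 448])
print(inputs_b["prompt_masks"].shape)  # torch.Size([1, 3, 448, 448])
```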
Here's how to use the model for one-shot semantic segmentation: @@ -53,7 +54,7 @@ mask_prompt = ds[29]["label"] inputs = image_processor( images=image_input, prompt_images=image_prompt, - prompt_masks=mask_prompt, + segmentation_maps=mask_prompt, num_labels=num_labels, return_tensors="pt" ) diff --git a/src/transformers/models/seggpt/image_processing_seggpt.py b/src/transformers/models/seggpt/image_processing_seggpt.py index 80fb94cdc7aaf4..1e4a5e23d093e8 100644 --- a/src/transformers/models/seggpt/image_processing_seggpt.py +++ b/src/transformers/models/seggpt/image_processing_seggpt.py @@ -26,19 +26,21 @@ ChannelDimension, ImageInput, PILImageResampling, - get_channel_dimension_axis, infer_channel_dimension_format, is_scaled_image, make_list_of_images, to_numpy_array, valid_images, ) -from ...utils import TensorType, is_torch_available, logging, requires_backends +from ...utils import TensorType, is_torch_available, is_vision_available, logging, requires_backends if is_torch_available(): import torch +if is_vision_available(): + pass + logger = logging.get_logger(__name__) @@ -65,29 +67,10 @@ def build_palette(num_labels: int) -> List[Tuple[int, int]]: return color_list -def get_num_channels(image: np.ndarray, input_data_format: ChannelDimension) -> int: - if image.ndim == 2: - return 0 - - channel_idx = get_channel_dimension_axis(image, input_data_format) - return image.shape[channel_idx] - - def mask_to_rgb( - mask: np.ndarray, - palette: Optional[List[Tuple[int, int]]] = None, - input_data_format: Optional[ChannelDimension] = None, - data_format: Optional[ChannelDimension] = None, + mask: np.ndarray, palette: Optional[List[Tuple[int, int]]] = None, data_format: Optional[ChannelDimension] = None ) -> np.ndarray: - if input_data_format is None and mask.ndim > 2: - input_data_format = infer_channel_dimension_format(mask) - - data_format = data_format if data_format is not None else input_data_format - - num_channels = get_num_channels(mask, input_data_format) - - if num_channels == 3: - return to_channel_dimension_format(mask, data_format, input_data_format) if data_format is not None else mask + data_format = data_format if data_format is not None else ChannelDimension.FIRST if palette is not None: height, width = mask.shape @@ -109,9 +92,7 @@ def mask_to_rgb( else: rgb_mask = np.repeat(mask[None, ...], 3, axis=0) - return ( - to_channel_dimension_format(rgb_mask, data_format, input_data_format) if data_format is not None else rgb_mask - ) + return to_channel_dimension_format(rgb_mask, data_format) class SegGptImageProcessor(BaseImageProcessor): @@ -143,6 +124,9 @@ class SegGptImageProcessor(BaseImageProcessor): image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): Standard deviation to use if normalizing the image. This is a float or list of floats the length of the number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the prompt mask to RGB format. Can be overridden by the `do_convert_rgb` parameter in the + `preprocess` method. 
""" model_input_names = ["pixel_values"] @@ -157,6 +141,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, **kwargs, ) -> None: super().__init__(**kwargs) @@ -170,6 +155,7 @@ def __init__( self.rescale_factor = rescale_factor self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_convert_rgb = do_convert_rgb def get_palette(self, num_labels: int) -> List[Tuple[int, int]]: """Build a palette to map the prompt mask from a single channel to a 3 channel RGB. @@ -188,13 +174,12 @@ def mask_to_rgb( image: np.ndarray, palette: Optional[List[Tuple[int, int]]] = None, data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: - """Convert a mask to RGB format. + """Converts a segmentation map to RGB format. Args: image (`np.ndarray`): - Mask to convert to RGB format. If the mask is already in RGB format, it will be passed through. + Segmentation map with dimensions (height, width) where pixel values represent the class index. palette (`List[Tuple[int, int]]`, *optional*, defaults to `None`): Palette to use to convert the mask to RGB format. If unset, the mask is duplicated across the channel dimension. @@ -203,21 +188,11 @@ def mask_to_rgb( image is used. Can be one of: - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. Returns: `np.ndarray`: The mask in RGB format. """ - return mask_to_rgb( - image, - palette=palette, - data_format=data_format, - input_data_format=input_data_format, - ) + return mask_to_rgb(image, palette=palette, data_format=data_format) # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC def resize( @@ -271,7 +246,6 @@ def resize( def _preprocess_step( self, images: ImageInput, - is_mask: bool = False, do_resize: Optional[bool] = None, size: Dict[str, int] = None, resample: PILImageResampling = None, @@ -282,6 +256,7 @@ def _preprocess_step( image_std: Optional[Union[float, List[float]]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + do_convert_rgb: Optional[bool] = None, num_labels: Optional[int] = None, **kwargs, ): @@ -292,9 +267,6 @@ def _preprocess_step( images (`ImageInput`): Image to _preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. - is_mask (`bool`, *optional*, defaults to `False`): - Whether the image is a mask. If True, the image is converted to RGB using the palette if - `self.num_labels` is specified otherwise RGB is achieved by duplicating the channel. 
do_resize (`bool`, *optional*, defaults to `self.do_resize`): Whether to resize the image. size (`Dict[str, int]`, *optional*, defaults to `self.size`): @@ -331,6 +303,10 @@ def _preprocess_step( - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the prompt mask to RGB format. If `num_labels` is specified, a palette will be built + to map the prompt mask from a single channel to a 3 channel RGB. If unset, the prompt mask is duplicated + across the channel dimension. Must be set to `False` if the prompt mask is already in RGB format. num_labels: (`int`, *optional*): Number of classes in the segmentation task (excluding the background). If specified, a palette will be built, assuming that class_idx 0 is the background, to map the prompt mask from a single class_idx @@ -340,6 +316,7 @@ def _preprocess_step( do_resize = do_resize if do_resize is not None else self.do_resize do_rescale = do_rescale if do_rescale is not None else self.do_rescale do_normalize = do_normalize if do_normalize is not None else self.do_normalize + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb resample = resample if resample is not None else self.resample rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor image_mean = image_mean if image_mean is not None else self.image_mean @@ -348,7 +325,8 @@ def _preprocess_step( size = size if size is not None else self.size size_dict = get_size_dict(size) - images = make_list_of_images(images) + # If segmentation map is passed we expect 2D images + images = make_list_of_images(images, expected_ndims=2 if do_convert_rgb else 3) if not valid_images(images): raise ValueError( @@ -374,11 +352,11 @@ def _preprocess_step( " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." ) - if input_data_format is None and not is_mask: + if input_data_format is None and not do_convert_rgb: # We assume that all images have the same channel dimension format. input_data_format = infer_channel_dimension_format(images[0]) - if is_mask: + if do_convert_rgb: palette = self.get_palette(num_labels) if num_labels is not None else None # Since this is the input for the next transformations its format should be the same as the input_data_format images = [ @@ -423,6 +401,7 @@ def preprocess( do_normalize: Optional[bool] = None, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: Optional[bool] = None, num_labels: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, @@ -440,9 +419,12 @@ def preprocess( Prompt image to _preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. prompt_masks (`ImageInput`): - Prompt mask from prompt image to _preprocess. Expects a single or batch of masks. If the mask masks are - a single channel then it will be converted to RGB using the palette if `self.num_labels` is specified - or by just repeating the channel if not. 
If the mask is already in RGB format, it will be passed through. + Prompt mask from prompt image to _preprocess that specify prompt_masks value in the preprocessed output. + Can either be in the format of segmentation maps (no channels) or RGB images. If in the format of + RGB images, `do_convert_rgb` should be set to `False`. If in the format of segmentation maps, `num_labels` + specifying `num_labels` is recommended to build a palette to map the prompt mask from a single channel to + a 3 channel RGB. If `num_labels` is not specified, the prompt mask will be duplicated across the channel + dimension. do_resize (`bool`, *optional*, defaults to `self.do_resize`): Whether to resize the image. size (`Dict[str, int]`, *optional*, defaults to `self.size`): @@ -461,6 +443,16 @@ def preprocess( Image mean to use if `do_normalize` is set to `True`. image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): Image standard deviation to use if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the prompt mask to RGB format. If `num_labels` is specified, a palette will be built + to map the prompt mask from a single channel to a 3 channel RGB. If unset, the prompt mask is duplicated + across the channel dimension. Must be set to `False` if the prompt mask is already in RGB format. + num_labels: (`int`, *optional*): + Number of classes in the segmentation task (excluding the background). If specified, a palette will be + built, assuming that class_idx 0 is the background, to map the prompt mask from a plain segmentation map + with no channels to a 3 channel RGB. Not specifying this will result in the prompt mask either being passed + through as is if it is already in RGB format (if `do_convert_rgb` is false) or being duplicated + across the channel dimension. return_tensors (`str` or `TensorType`, *optional*): The type of tensors to return. Can be one of: - Unset: Return a list of `np.ndarray`. @@ -479,11 +471,6 @@ def preprocess( - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - num_labels: (`int`, *optional*): - Number of classes in the segmentation task (excluding the background). If specified, a palette will be - built, assuming that class_idx 0 is the background, to map the prompt mask from a single class_idx - channel to a 3 channel RGB. Not specifying this will result in the prompt mask either being passed - through as is if it is already in RGB format or being duplicated across the channel dimension. 
""" if all(v is None for v in [images, prompt_images, prompt_masks]): raise ValueError("At least one of images, prompt_images, prompt_masks must be specified.") @@ -502,6 +489,7 @@ def preprocess( do_normalize=do_normalize, image_mean=image_mean, image_std=image_std, + do_convert_rgb=False, data_format=data_format, input_data_format=input_data_format, **kwargs, @@ -521,6 +509,7 @@ def preprocess( do_normalize=do_normalize, image_mean=image_mean, image_std=image_std, + do_convert_rgb=False, data_format=data_format, input_data_format=input_data_format, **kwargs, @@ -531,7 +520,6 @@ def preprocess( if prompt_masks is not None: prompt_masks = self._preprocess_step( prompt_masks, - is_mask=True, do_resize=do_resize, size=size, resample=PILImageResampling.NEAREST, @@ -540,9 +528,10 @@ def preprocess( do_normalize=do_normalize, image_mean=image_mean, image_std=image_std, + do_convert_rgb=do_convert_rgb, + num_labels=num_labels, data_format=data_format, input_data_format=input_data_format, - num_labels=num_labels, **kwargs, ) diff --git a/tests/models/seggpt/test_image_processing_seggpt.py b/tests/models/seggpt/test_image_processing_seggpt.py index 46694d6636ea05..04cefb70d0efb4 100644 --- a/tests/models/seggpt/test_image_processing_seggpt.py +++ b/tests/models/seggpt/test_image_processing_seggpt.py @@ -30,6 +30,8 @@ from transformers.models.seggpt.modeling_seggpt import SegGptImageSegmentationOutput if is_vision_available(): + from PIL import Image + from transformers import SegGptImageProcessor @@ -147,7 +149,7 @@ def test_mask_equivalence(self): mask_rgb = mask_binary.convert("RGB") inputs_binary = image_processor(images=None, prompt_masks=mask_binary, return_tensors="pt") - inputs_rgb = image_processor(images=None, prompt_masks=mask_rgb, return_tensors="pt") + inputs_rgb = image_processor(images=None, prompt_masks=mask_rgb, return_tensors="pt", do_convert_rgb=False) self.assertTrue((inputs_binary["prompt_masks"] == inputs_rgb["prompt_masks"]).all().item()) @@ -196,7 +198,11 @@ def test_pixel_values(self): image_processor = SegGptImageProcessor.from_pretrained("BAAI/seggpt-vit-large") inputs = image_processor( - images=input_image, prompt_images=prompt_image, prompt_masks=prompt_mask, return_tensors="pt" + images=input_image, + prompt_images=prompt_image, + prompt_masks=prompt_mask, + return_tensors="pt", + do_convert_rgb=False, ) # Verify pixel values @@ -229,3 +235,76 @@ def test_pixel_values(self): torch.allclose(inputs.prompt_pixel_values[0, :, :3, :3], expected_prompt_pixel_values, atol=1e-4) ) self.assertTrue(torch.allclose(inputs.prompt_masks[0, :, :3, :3], expected_prompt_masks, atol=1e-4)) + + def test_prompt_mask_equivalence(self): + image_processor = self.image_processing_class(**self.image_processor_dict) + image_size = self.image_processor_tester.image_size + + # Single Mask Examples + expected_single_shape = [1, 3, image_size, image_size] + + # Single Semantic Map (2D) + image_np_2d = np.ones((image_size, image_size)) + image_pt_2d = torch.ones((image_size, image_size)) + image_pil_2d = Image.fromarray(image_np_2d) + + inputs_np_2d = image_processor(images=None, prompt_masks=image_np_2d, return_tensors="pt") + inputs_pt_2d = image_processor(images=None, prompt_masks=image_pt_2d, return_tensors="pt") + inputs_pil_2d = image_processor(images=None, prompt_masks=image_pil_2d, return_tensors="pt") + + self.assertTrue((inputs_np_2d["prompt_masks"] == inputs_pt_2d["prompt_masks"]).all().item()) + self.assertTrue((inputs_np_2d["prompt_masks"] == 
inputs_pil_2d["prompt_masks"]).all().item()) + self.assertEqual(list(inputs_np_2d["prompt_masks"].shape), expected_single_shape) + + # Single RGB Images (3D) + image_np_3d = np.ones((3, image_size, image_size)) + image_pt_3d = torch.ones((3, image_size, image_size)) + image_pil_3d = Image.fromarray(image_np_3d.transpose(1, 2, 0).astype(np.uint8)) + + inputs_np_3d = image_processor( + images=None, prompt_masks=image_np_3d, return_tensors="pt", do_convert_rgb=False + ) + inputs_pt_3d = image_processor( + images=None, prompt_masks=image_pt_3d, return_tensors="pt", do_convert_rgb=False + ) + inputs_pil_3d = image_processor( + images=None, prompt_masks=image_pil_3d, return_tensors="pt", do_convert_rgb=False + ) + + self.assertTrue((inputs_np_3d["prompt_masks"] == inputs_pt_3d["prompt_masks"]).all().item()) + self.assertTrue((inputs_np_3d["prompt_masks"] == inputs_pil_3d["prompt_masks"]).all().item()) + self.assertEqual(list(inputs_np_3d["prompt_masks"].shape), expected_single_shape) + + # Batched Examples + expected_batched_shape = [2, 3, image_size, image_size] + + # Batched Semantic Maps (3D) + image_np_2d_batched = np.ones((2, image_size, image_size)) + image_pt_2d_batched = torch.ones((2, image_size, image_size)) + + inputs_np_2d_batched = image_processor(images=None, prompt_masks=image_np_2d_batched, return_tensors="pt") + inputs_pt_2d_batched = image_processor(images=None, prompt_masks=image_pt_2d_batched, return_tensors="pt") + + self.assertTrue((inputs_np_2d_batched["prompt_masks"] == inputs_pt_2d_batched["prompt_masks"]).all().item()) + self.assertEqual(list(inputs_np_2d_batched["prompt_masks"].shape), expected_batched_shape) + + # Batched RGB images + image_np_4d = np.ones((2, 3, image_size, image_size)) + image_pt_4d = torch.ones((2, 3, image_size, image_size)) + + inputs_np_4d = image_processor( + images=None, prompt_masks=image_np_4d, return_tensors="pt", do_convert_rgb=False + ) + inputs_pt_4d = image_processor( + images=None, prompt_masks=image_pt_4d, return_tensors="pt", do_convert_rgb=False + ) + + self.assertTrue((inputs_np_4d["prompt_masks"] == inputs_pt_4d["prompt_masks"]).all().item()) + self.assertEqual(list(inputs_np_4d["prompt_masks"].shape), expected_batched_shape) + + # Comparing Single and Batched Examples + self.assertTrue((inputs_np_2d["prompt_masks"][0] == inputs_np_3d["prompt_masks"][0]).all().item()) + self.assertTrue((inputs_np_2d_batched["prompt_masks"][0] == inputs_np_2d["prompt_masks"][0]).all().item()) + self.assertTrue((inputs_np_2d_batched["prompt_masks"][0] == inputs_np_3d["prompt_masks"][0]).all().item()) + self.assertTrue((inputs_np_2d_batched["prompt_masks"][0] == inputs_np_4d["prompt_masks"][0]).all().item()) + self.assertTrue((inputs_np_2d_batched["prompt_masks"][0] == inputs_np_3d["prompt_masks"][0]).all().item()) diff --git a/tests/models/seggpt/test_modeling_seggpt.py b/tests/models/seggpt/test_modeling_seggpt.py index d43d4304532431..efa0231c1e817a 100644 --- a/tests/models/seggpt/test_modeling_seggpt.py +++ b/tests/models/seggpt/test_modeling_seggpt.py @@ -363,7 +363,11 @@ def test_one_shot_inference(self): prompt_mask = masks[0] inputs = image_processor( - images=input_image, prompt_images=prompt_image, prompt_masks=prompt_mask, return_tensors="pt" + images=input_image, + prompt_images=prompt_image, + prompt_masks=prompt_mask, + return_tensors="pt", + do_convert_rgb=False, ) inputs = inputs.to(torch_device) @@ -404,7 +408,11 @@ def test_few_shot_inference(self): prompt_masks = [masks[0], masks[2]] inputs = image_processor( - 
images=input_images, prompt_images=prompt_images, prompt_masks=prompt_masks, return_tensors="pt" + images=input_images, + prompt_images=prompt_images, + prompt_masks=prompt_masks, + return_tensors="pt", + do_convert_rgb=False, ) inputs = {k: v.to(torch_device) for k, v in inputs.items()} @@ -437,10 +445,16 @@ def test_one_shot_with_label(self): prompt_mask = masks[0] inputs = image_processor( - images=input_image, prompt_masks=prompt_mask, prompt_images=prompt_image, return_tensors="pt" + images=input_image, + prompt_masks=prompt_mask, + prompt_images=prompt_image, + return_tensors="pt", + do_convert_rgb=False, ).to(torch_device) - labels = image_processor(images=None, prompt_masks=label, return_tensors="pt")["prompt_masks"].to(torch_device) + labels = image_processor(images=None, prompt_masks=label, return_tensors="pt", do_convert_rgb=False)[ + "prompt_masks" + ].to(torch_device) bool_masked_pos = prepare_bool_masked_pos(model.config).to(torch_device) From 73014b561d5f88d728e46a57d346f516fefe3f2d Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Fri, 26 Apr 2024 15:52:24 -0400 Subject: [PATCH 09/18] Fix link in dbrx.md (#30509) --- docs/source/en/model_doc/dbrx.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/dbrx.md b/docs/source/en/model_doc/dbrx.md index 33435462b3e024..d60a4926eb1853 100644 --- a/docs/source/en/model_doc/dbrx.md +++ b/docs/source/en/model_doc/dbrx.md @@ -32,7 +32,7 @@ We used curriculum learning for pretraining, changing the data mix during traini More detailed information about DBRX Instruct and DBRX Base can be found in our [technical blog post](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm). -This model was contributed by [eitan-turok](https://huggingface.co/eitanturok) and [abhi-db](https://huggingface.co/abhi-db). The original code can be found [here](https://github.com/databricks/dbrx-instruct). +This model was contributed by [eitan-turok](https://huggingface.co/eitanturok) and [abhi-db](https://huggingface.co/abhi-db). The original code can be found [here](https://github.com/databricks/dbrx), though this may not be up to date. 
## Usage Examples From 80126f98d86bad81c56b30e31a959bd6e644e8be Mon Sep 17 00:00:00 2001 From: Howard Liberty Date: Mon, 29 Apr 2024 02:03:26 -0700 Subject: [PATCH 10/18] Allow boolean FSDP options in fsdp_config (#30439) * Allow boolean FSDP options in fsdp_config * Use lower() to be safe --- src/transformers/training_args.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 12ae77908ebfae..18bf004efa05b0 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -1840,12 +1840,12 @@ def __post_init__(self): ) prefetch_policy = self.fsdp_config.get("backward_prefetch", "NO_PREFETCH") os.environ[f"{prefix}BACKWARD_PREFETCH"] = prefetch_policy.upper() - os.environ[f"{prefix}FORWARD_PREFETCH"] = self.fsdp_config.get("forward_prefetch", "false") + os.environ[f"{prefix}FORWARD_PREFETCH"] = str(self.fsdp_config.get("forward_prefetch", "false")).lower() - sync_module_states = self.fsdp_config.get("sync_module_states", "true") - cpu_ram_efficient_loading = self.fsdp_config.get("cpu_ram_efficient_loading", "false") + sync_module_states = str(self.fsdp_config.get("sync_module_states", "true")).lower() + cpu_ram_efficient_loading = str(self.fsdp_config.get("cpu_ram_efficient_loading", "false")).lower() - if str(sync_module_states).lower() == "false" and str(cpu_ram_efficient_loading).lower() == "true": + if sync_module_states == "false" and cpu_ram_efficient_loading == "true": # In this case, all the processes except the main process would have random weights leading # to unexpected behaviour during training, thus throwing error here to prevent it. raise ValueError('`sync_module_states` must be `"True"` if `cpu_ram_efficient_loading` is `"True"`') @@ -1853,7 +1853,7 @@ def __post_init__(self): os.environ[f"{prefix}SYNC_MODULE_STATES"] = sync_module_states os.environ[f"{prefix}CPU_RAM_EFFICIENT_LOADING"] = cpu_ram_efficient_loading - os.environ[f"{prefix}USE_ORIG_PARAMS"] = self.fsdp_config.get("use_orig_params", "true") + os.environ[f"{prefix}USE_ORIG_PARAMS"] = str(self.fsdp_config.get("use_orig_params", "true")).lower() if is_accelerate_available(): if not isinstance(self.accelerator_config, (AcceleratorConfig)): From e8acb70015288f719ebb1a37cdb935d650fdb6a1 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 29 Apr 2024 10:22:33 +0100 Subject: [PATCH 11/18] Pass attn_implementation when using AutoXXX.from_config (#30507) * Pass attn_implementation when using AutoXXX.from_config * Fix --- .../models/blip_2/modeling_blip_2.py | 16 ++++++++++++---- .../depth_anything/modeling_depth_anything.py | 4 +++- .../encoder_decoder/modeling_encoder_decoder.py | 4 ++-- src/transformers/models/fuyu/modeling_fuyu.py | 4 +++- .../models/idefics2/modeling_idefics2.py | 2 +- .../models/instructblip/modeling_instructblip.py | 8 ++++++-- src/transformers/models/rag/modeling_rag.py | 8 ++++++-- .../modeling_speech_encoder_decoder.py | 4 ++-- .../modeling_vision_encoder_decoder.py | 4 ++-- .../modeling_vision_text_dual_encoder.py | 6 ++++-- 10 files changed, 41 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 935e041eb8360d..edd0d9a6d76133 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -1194,9 +1194,13 @@ def __init__(self, config: Blip2Config): 
self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) if config.use_decoder_only_language_model: - language_model = AutoModelForCausalLM.from_config(config.text_config) + language_model = AutoModelForCausalLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) else: - language_model = AutoModelForSeq2SeqLM.from_config(config.text_config) + language_model = AutoModelForSeq2SeqLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) # Update _tied_weights_keys using the base model used. if language_model._tied_weights_keys is not None: @@ -1549,9 +1553,13 @@ def __init__(self, config: Blip2Config): self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) if config.use_decoder_only_language_model: - language_model = AutoModelForCausalLM.from_config(config.text_config) + language_model = AutoModelForCausalLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) else: - language_model = AutoModelForSeq2SeqLM.from_config(config.text_config) + language_model = AutoModelForSeq2SeqLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) # Update _tied_weights_keys using the base model used. if language_model._tied_weights_keys is not None: diff --git a/src/transformers/models/depth_anything/modeling_depth_anything.py b/src/transformers/models/depth_anything/modeling_depth_anything.py index 788b0d911396f1..bed91ac2a482bc 100644 --- a/src/transformers/models/depth_anything/modeling_depth_anything.py +++ b/src/transformers/models/depth_anything/modeling_depth_anything.py @@ -367,7 +367,9 @@ class DepthAnythingForDepthEstimation(DepthAnythingPreTrainedModel): def __init__(self, config): super().__init__(config) - self.backbone = AutoBackbone.from_config(config.backbone_config) + self.backbone = AutoBackbone.from_config( + config.backbone_config, attn_implementation=config._attn_implementation + ) self.neck = DepthAnythingNeck(config) self.head = DepthAnythingDepthEstimationHead(config) diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index 16248fee64ce59..2b185cc14a03dd 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -209,12 +209,12 @@ def __init__( if encoder is None: from ..auto.modeling_auto import AutoModel - encoder = AutoModel.from_config(config.encoder) + encoder = AutoModel.from_config(config.encoder, attn_implementation=config._attn_implementation) if decoder is None: from ..auto.modeling_auto import AutoModelForCausalLM - decoder = AutoModelForCausalLM.from_config(config.decoder) + decoder = AutoModelForCausalLM.from_config(config.decoder, attn_implementation=config._attn_implementation) self.encoder = encoder self.decoder = decoder diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py index 8e9a41954aee9c..bdaec5f868505c 100644 --- a/src/transformers/models/fuyu/modeling_fuyu.py +++ b/src/transformers/models/fuyu/modeling_fuyu.py @@ -149,7 +149,9 @@ def __init__(self, config: FuyuConfig): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size - self.language_model = AutoModelForCausalLM.from_config(config.text_config) + self.language_model = 
AutoModelForCausalLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) self.vision_embed_tokens = nn.Linear( config.patch_size * config.patch_size * config.num_channels, config.hidden_size diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 28cd6155548ac7..7f61e95a9b8edf 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -1476,7 +1476,7 @@ def __init__(self, config: Idefics2Config): self.vision_model = Idefics2VisionTransformer(config.vision_config) self.connector = Idefics2Connector(config) - self.text_model = AutoModel.from_config(config.text_config) + self.text_model = AutoModel.from_config(config.text_config, attn_implementation=config._attn_implementation) self.image_seq_len = config.perceiver_config.resampler_n_latents self.image_token_id = self.config.image_token_id diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index b18d46723179e2..52f8fa610a948e 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -1251,9 +1251,13 @@ def __init__(self, config: InstructBlipConfig): self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) if config.use_decoder_only_language_model: - language_model = AutoModelForCausalLM.from_config(config.text_config) + language_model = AutoModelForCausalLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) else: - language_model = AutoModelForSeq2SeqLM.from_config(config.text_config) + language_model = AutoModelForSeq2SeqLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) if language_model._no_split_modules is not None: self._no_split_modules.extend(language_model._no_split_modules) diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index 80dec5bc3dba58..8a6f959a4921ed 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -506,12 +506,16 @@ def __init__( if question_encoder is None: from ..auto.modeling_auto import AutoModel - question_encoder = AutoModel.from_config(config.question_encoder) + question_encoder = AutoModel.from_config( + config.question_encoder, attn_implementation=config._attn_implementation + ) if generator is None: from ..auto.modeling_auto import AutoModelForSeq2SeqLM - generator = AutoModelForSeq2SeqLM.from_config(config.generator) + generator = AutoModelForSeq2SeqLM.from_config( + config.generator, attn_implementation=config._attn_implementation + ) self.retriever = retriever if self.retriever is not None: diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index 942dfb5f9c49fc..77b69afe8fd22f 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -212,10 +212,10 @@ def __init__( super().__init__(config) if encoder is None: - encoder = AutoModel.from_config(config.encoder) + encoder = AutoModel.from_config(config.encoder, attn_implementation=config._attn_implementation) if decoder is None: - 
decoder = AutoModelForCausalLM.from_config(config.decoder) + decoder = AutoModelForCausalLM.from_config(config.decoder, attn_implementation=config._attn_implementation) self.encoder = encoder self.decoder = decoder diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py index 0bdf76044153b1..fc72eb1cbdf831 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py @@ -190,10 +190,10 @@ def __init__( super().__init__(config) if encoder is None: - encoder = AutoModel.from_config(config.encoder) + encoder = AutoModel.from_config(config.encoder, attn_implementation=config._attn_implementation) if decoder is None: - decoder = AutoModelForCausalLM.from_config(config.decoder) + decoder = AutoModelForCausalLM.from_config(config.decoder, attn_implementation=config._attn_implementation) self.encoder = encoder self.decoder = decoder diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py index cd4d5bd7a1f197..0f82bdd0c3e4d8 100755 --- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py @@ -185,10 +185,12 @@ def __init__( if isinstance(config.vision_config, CLIPVisionConfig): vision_model = CLIPVisionModel(config.vision_config) else: - vision_model = AutoModel.from_config(config.vision_config) + vision_model = AutoModel.from_config( + config.vision_config, attn_implementation=config._attn_implementation + ) if text_model is None: - text_model = AutoModel.from_config(config.text_config) + text_model = AutoModel.from_config(config.text_config, attn_implementation=config._attn_implementation) self.vision_model = vision_model self.text_model = text_model From bdbe1662113b49e81c865f0ca9d3922b500c5414 Mon Sep 17 00:00:00 2001 From: clinty Date: Mon, 29 Apr 2024 05:57:51 -0400 Subject: [PATCH 12/18] Fix broken link to Transformers notebooks (#30512) Co-authored-by: Clint Adams --- docs/source/de/run_scripts.md | 2 +- docs/source/en/run_scripts.md | 2 +- docs/source/es/run_scripts.md | 2 +- docs/source/it/run_scripts.md | 2 +- docs/source/pt/run_scripts.md | 2 +- docs/source/zh/run_scripts.md | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/de/run_scripts.md b/docs/source/de/run_scripts.md index 61a0754ea92628..17b725827dd7ec 100644 --- a/docs/source/de/run_scripts.md +++ b/docs/source/de/run_scripts.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # Trainieren mit einem Skript -Neben den 🤗 Transformers [notebooks](./noteboks/README) gibt es auch Beispielskripte, die zeigen, wie man ein Modell für eine Aufgabe mit [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) oder [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax) trainiert. 
+Neben den 🤗 Transformers [notebooks](./notebooks) gibt es auch Beispielskripte, die zeigen, wie man ein Modell für eine Aufgabe mit [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) oder [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax) trainiert. Sie werden auch Skripte finden, die wir in unseren [Forschungsprojekten](https://github.com/huggingface/transformers/tree/main/examples/research_projects) und [Legacy-Beispielen](https://github.com/huggingface/transformers/tree/main/examples/legacy) verwendet haben und die größtenteils von der Community stammen. Diese Skripte werden nicht aktiv gepflegt und erfordern eine bestimmte Version von 🤗 Transformers, die höchstwahrscheinlich nicht mit der neuesten Version der Bibliothek kompatibel ist. diff --git a/docs/source/en/run_scripts.md b/docs/source/en/run_scripts.md index 845befc5638133..f602cde40933d0 100644 --- a/docs/source/en/run_scripts.md +++ b/docs/source/en/run_scripts.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # Train with a script -Along with the 🤗 Transformers [notebooks](./noteboks/README), there are also example scripts demonstrating how to train a model for a task with [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), or [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). +Along with the 🤗 Transformers [notebooks](./notebooks), there are also example scripts demonstrating how to train a model for a task with [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), or [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). You will also find scripts we've used in our [research projects](https://github.com/huggingface/transformers/tree/main/examples/research_projects) and [legacy examples](https://github.com/huggingface/transformers/tree/main/examples/legacy) which are mostly community contributed. These scripts are not actively maintained and require a specific version of 🤗 Transformers that will most likely be incompatible with the latest version of the library. diff --git a/docs/source/es/run_scripts.md b/docs/source/es/run_scripts.md index ff1afa340c9a1d..d9a2b142a8ab6c 100644 --- a/docs/source/es/run_scripts.md +++ b/docs/source/es/run_scripts.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # Entrenamiento con scripts -Junto con los [notebooks](./noteboks/README) de 🤗 Transformers, también hay scripts con ejemplos que muestran cómo entrenar un modelo para una tarea en [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). +Junto con los [notebooks](./notebooks) de 🤗 Transformers, también hay scripts con ejemplos que muestran cómo entrenar un modelo para una tarea en [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). 
También encontrarás scripts que hemos usado en nuestros [proyectos de investigación](https://github.com/huggingface/transformers/tree/main/examples/research_projects) y [ejemplos pasados](https://github.com/huggingface/transformers/tree/main/examples/legacy) que en su mayoría son aportados por la comunidad. Estos scripts no se mantienen activamente y requieren una versión específica de 🤗 Transformers que probablemente sea incompatible con la última versión de la biblioteca. diff --git a/docs/source/it/run_scripts.md b/docs/source/it/run_scripts.md index 7fc3fb6c6ac67a..b437efb9fb18f6 100644 --- a/docs/source/it/run_scripts.md +++ b/docs/source/it/run_scripts.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # Addestramento con script -Insieme ai [notebooks](./noteboks/README) 🤗 Transformers, ci sono anche esempi di script che dimostrano come addestrare un modello per un task con [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). +Insieme ai [notebooks](./notebooks) 🤗 Transformers, ci sono anche esempi di script che dimostrano come addestrare un modello per un task con [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). Troverai anche script che abbiamo usato nei nostri [progetti di ricerca](https://github.com/huggingface/transformers/tree/main/examples/research_projects) e [precedenti esempi](https://github.com/huggingface/transformers/tree/main/examples/legacy) a cui contribuisce per lo più la comunità. Questi script non sono attivamente mantenuti e richiedono una specifica versione di 🤗 Transformers che sarà molto probabilmente incompatibile con l'ultima versione della libreria. diff --git a/docs/source/pt/run_scripts.md b/docs/source/pt/run_scripts.md index a64ad72f1dbc61..d4cc3973608d2c 100644 --- a/docs/source/pt/run_scripts.md +++ b/docs/source/pt/run_scripts.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # Treinamento a partir de um script -Junto com os 🤗 Transformers [notebooks](./noteboks/README), também há scripts de exemplo demonstrando como treinar um modelo para uma tarefa com [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) ou [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). +Junto com os 🤗 Transformers [notebooks](./notebooks), também há scripts de exemplo demonstrando como treinar um modelo para uma tarefa com [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) ou [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). Você também encontrará scripts que usamos em nossos [projetos de pesquisa](https://github.com/huggingface/transformers/tree/main/examples/research_projects) e [exemplos legados](https://github.com/huggingface/transformers/tree/main/examples/legacy) que são principalmente contribuições da comunidade. 
Esses scripts não são mantidos ativamente e exigem uma versão específica de 🤗 Transformers que provavelmente será incompatível com a versão mais recente da biblioteca. diff --git a/docs/source/zh/run_scripts.md b/docs/source/zh/run_scripts.md index b6e9c8ea6a2d89..d058e97d1ad587 100644 --- a/docs/source/zh/run_scripts.md +++ b/docs/source/zh/run_scripts.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # 使用脚本进行训练 -除了 🤗 Transformers [notebooks](./noteboks/README),还有示例脚本演示了如何使用[PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch)、[TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow)或[JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax)训练模型以解决特定任务。 +除了 🤗 Transformers [notebooks](./notebooks),还有示例脚本演示了如何使用[PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch)、[TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow)或[JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax)训练模型以解决特定任务。 您还可以在这些示例中找到我们在[研究项目](https://github.com/huggingface/transformers/tree/main/examples/research_projects)和[遗留示例](https://github.com/huggingface/transformers/tree/main/examples/legacy)中使用过的脚本,这些脚本主要是由社区贡献的。这些脚本已不再被积极维护,需要使用特定版本的🤗 Transformers, 可能与库的最新版本不兼容。 From c02421883b2a59e075ec87de8d82f02a944fb5e8 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:07:41 +0200 Subject: [PATCH 13/18] Update runner tag for PR slow CI (#30535) fix Co-authored-by: ydshieh --- .github/workflows/self-new-model-pr-caller.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/self-new-model-pr-caller.yml b/.github/workflows/self-new-model-pr-caller.yml index 888b4f7a8ce5ac..44a54ac675279b 100644 --- a/.github/workflows/self-new-model-pr-caller.yml +++ b/.github/workflows/self-new-model-pr-caller.yml @@ -46,7 +46,7 @@ jobs: matrix: folders: ["${{ needs.check_for_new_model.outputs.new_model }}"] machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, ci] container: image: huggingface/transformers-all-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ From 87be06ca77166e6a6215eee5a990ab9f07238a18 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:32:43 +0200 Subject: [PATCH 14/18] Fix repo. 
fetch/checkout in PR slow CI job (#30537) fix Co-authored-by: ydshieh --- .github/workflows/self-new-model-pr-caller.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/self-new-model-pr-caller.yml b/.github/workflows/self-new-model-pr-caller.yml index 44a54ac675279b..3bd19b6f2c3ff5 100644 --- a/.github/workflows/self-new-model-pr-caller.yml +++ b/.github/workflows/self-new-model-pr-caller.yml @@ -69,7 +69,7 @@ jobs: - name: Update clone working-directory: /transformers - run: git fetch && git checkout ${{ github.event.pull_request.head.sha }} + run: git fetch && git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/merge && git checkout pull/${{ github.event.pull_request.number }}/merge - name: Reinstall transformers in edit mode (remove the one installed during docker image build) working-directory: /transformers From 9df8b301ceb6697a713fbe4034dc02395e5b28cd Mon Sep 17 00:00:00 2001 From: Benjamin Warner Date: Mon, 29 Apr 2024 11:45:43 -0500 Subject: [PATCH 15/18] Reenable SDPA's FA2 During Training with torch.compile (#30442) * Reenable SDPA's FA2 during training with torch.compile * fix Olmo's SDPA FA2 dispatching too * update formatting * improved SDPA comment * formatting and explanatory comment * is_causal if statement to one-liner --- src/transformers/modeling_attn_mask_utils.py | 7 ++++--- src/transformers/models/cohere/modeling_cohere.py | 13 +++++++++---- src/transformers/models/gemma/modeling_gemma.py | 13 +++++++++---- src/transformers/models/llama/modeling_llama.py | 13 +++++++++---- src/transformers/models/olmo/modeling_olmo.py | 11 +++++++++-- 5 files changed, 40 insertions(+), 17 deletions(-) diff --git a/src/transformers/modeling_attn_mask_utils.py b/src/transformers/modeling_attn_mask_utils.py index 44ea1795669f58..8dcf40268d0324 100755 --- a/src/transformers/modeling_attn_mask_utils.py +++ b/src/transformers/modeling_attn_mask_utils.py @@ -240,6 +240,7 @@ def _ignore_causal_mask_sdpa( inputs_embeds: torch.Tensor, past_key_values_length: int, sliding_window: Optional[int] = None, + is_training: bool = False, ) -> bool: """ Detects whether the optional user-specified attention_mask & the automatically created causal mask can be ignored in case PyTorch's SDPA is used, rather relying on SDPA's `is_causal` argument. @@ -263,11 +264,11 @@ def _ignore_causal_mask_sdpa( if attention_mask is None: # TODO: When tracing with TorchDynamo with fullgraph=True, the model is recompiled depending on the input shape, thus SDPA's `is_causal` argument is rightfully updated (see https://gist.github.com/fxmarty/1313f39037fc1c112508989628c57363). However, when using `torch.export` or # or `torch.onnx.dynamo_export`, we must pass an example input, and `is_causal` behavior is hard-coded. If a user exports a model with q_len > 1, the exported model will hard-code `is_causal=True` which is in general wrong (see https://github.com/pytorch/pytorch/issues/108108). - # Thus, we currently can NOT set `ignore_causal_mask = True` here. We would need a `torch._dynamo.is_exporting()` flag. + # Thus, we only set `ignore_causal_mask = True` if the model is set to training. # # Besides, jit.trace can not handle the `q_len > 1` condition for `is_causal` (`TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor`). 
if ( - not is_tracing + (is_training or not is_tracing) and (query_length == 1 or key_value_length == query_length) and (sliding_window is None or key_value_length < sliding_window) ): @@ -279,7 +280,7 @@ def _ignore_causal_mask_sdpa( raise ValueError( f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}." ) - elif not is_tracing and torch.all(attention_mask == 1): + elif (is_training or not is_tracing) and torch.all(attention_mask == 1): if query_length == 1 or key_value_length == query_length: # For query_length == 1, causal attention and bi-directional attention are the same. ignore_causal_mask = True diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 41bb4c0516928c..3d529fd1ec4fe0 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -590,15 +590,17 @@ def forward( key_states = key_states.contiguous() value_states = value_states.contiguous() - # In case we are not compiling, we may set `causal_mask` to None, which is required to dispatch to SDPA's Flash Attention 2 backend, rather - # relying on the `is_causal` argument. + # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an + # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True` + is_causal = True if causal_mask is None and q_len > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=causal_mask, dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=causal_mask is None and q_len > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() @@ -996,7 +998,10 @@ def _update_causal_mask( # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, # in order to dispatch on Flash Attention 2. if AttentionMaskConverter._ignore_causal_mask_sdpa( - attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, ): return None diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index e5b6b207748a53..97e4e5d49f8e0f 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -570,15 +570,17 @@ def forward( key_states = key_states.contiguous() value_states = value_states.contiguous() - # In case we are not compiling, we may set `causal_mask` to None, which is required to dispatch to SDPA's Flash Attention 2 backend, rather - # relying on the `is_causal` argument. 
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an + # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True` + is_causal = True if causal_mask is None and q_len > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=causal_mask, dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=causal_mask is None and q_len > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() @@ -982,7 +984,10 @@ def _update_causal_mask( # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, # in order to dispatch on Flash Attention 2. if AttentionMaskConverter._ignore_causal_mask_sdpa( - attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, ): return None diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 905edf5f71a63d..9a2566f2fdd2eb 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -666,15 +666,17 @@ def forward( key_states = key_states.contiguous() value_states = value_states.contiguous() - # In case we are not compiling, we may set `causal_mask` to None, which is required to dispatch to SDPA's Flash Attention 2 backend, rather - # relying on the `is_causal` argument. + # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an + # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True` + is_causal = True if causal_mask is None and q_len > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=causal_mask, dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=causal_mask is None and q_len > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() @@ -1074,7 +1076,10 @@ def _update_causal_mask( # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, # in order to dispatch on Flash Attention 2. 
if AttentionMaskConverter._ignore_causal_mask_sdpa( - attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, ): return None diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index e3b0e05127c52d..87db966e2d8f67 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -647,13 +647,17 @@ def forward( key_states = key_states.contiguous() value_states = value_states.contiguous() + # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an + # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True` + is_causal = True if causal_mask is None and q_len > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=causal_mask, dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=causal_mask is None and q_len > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() @@ -1057,7 +1061,10 @@ def _update_causal_mask( # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, # in order to dispatch on Flash Attention 2. if AttentionMaskConverter._ignore_causal_mask_sdpa( - attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, ): return None From a3aabc702e1c49243e7b48f22d88362d50e786c5 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 29 Apr 2024 14:47:26 -0400 Subject: [PATCH 16/18] Include safetensors as part of `_load_best_model` (#30553) * Include safetensors * Cleanup --- src/transformers/trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 1d3c164984ea1c..d967c9314b1025 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2611,7 +2611,9 @@ def _load_best_model(self): load_result = model.load_state_dict(state_dict, False) if not is_sagemaker_mp_enabled() and has_been_loaded: self._issue_warnings_after_load(load_result) - elif os.path.exists(os.path.join(self.state.best_model_checkpoint, WEIGHTS_INDEX_NAME)): + elif os.path.exists(os.path.join(self.state.best_model_checkpoint, SAFE_WEIGHTS_INDEX_NAME)) or os.path.exists( + os.path.join(self.state.best_model_checkpoint, WEIGHTS_INDEX_NAME) + ): load_result = load_sharded_checkpoint( model, self.state.best_model_checkpoint, strict=is_sagemaker_mp_enabled() ) From c712d05aa8fc8ba3ebe465079bd377d2dc9c2e07 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 30 Apr 2024 12:16:18 +0500 Subject: [PATCH 17/18] Pass `use_cache` in kwargs for GPTNeoX (#30538) pass use_cache in kwargs --- src/transformers/models/gpt_neox/modeling_gpt_neox.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 83c99202ac9379..e338c529abf293 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -1108,6 +1108,7 @@ def prepare_inputs_for_generation( "attention_mask": attention_mask, "past_key_values": past_key_values, 
"position_ids": position_ids, + "use_cache": kwargs.get("use_cache"), } ) From 0ae789e04330e15a90e34cd723c851a8ab8d7ec5 Mon Sep 17 00:00:00 2001 From: Jacky Lee <39754370+jla524@users.noreply.github.com> Date: Tue, 30 Apr 2024 04:09:08 -0700 Subject: [PATCH 18/18] Enable multi-device for more models (#30409) * feat: support for dinov2 * feat: support for depth_anything * feat: support for efficientformer * feat: support for bert (is this right?) * update: embedding split * remove: empty string * feat: support for align * fix: copies * fix: QAQBertEmbeddings * fix: more consistency issues * revert: support for effientformer * feat: support for altclip * feat: support for blip_text * support for ChineseCLIP * feat: support for depth anything * feat: support for dpt * feat: support for dpt * feat: support for git * feat: support for groupvit * update: format * fix: support for clip * fix: consistency * feat: support for pvt * feat: support for vit_msn * fix: consistency * fix: other copies * remove: device transfer * revert: in-place add * update: support for align * update: support for bert * update: support for Chinese CLIP * revert: changes to efficientformer * update: support for dpt * update: support for efficientformer * revert: changes to git * revert: changes to groupvit * revert: changes to roc_bert * update: support for vit_msn * revert: changes to dpt * remove: extra space * style: extra space --- src/transformers/models/align/modeling_align.py | 1 + src/transformers/models/altclip/modeling_altclip.py | 1 + src/transformers/models/bert/modeling_bert.py | 2 ++ src/transformers/models/chinese_clip/modeling_chinese_clip.py | 2 ++ .../models/depth_anything/modeling_depth_anything.py | 2 ++ src/transformers/models/dinov2/modeling_dinov2.py | 1 + .../models/efficientformer/modeling_efficientformer.py | 1 + src/transformers/models/pvt/modeling_pvt.py | 1 + src/transformers/models/vit_msn/modeling_vit_msn.py | 1 + 9 files changed, 12 insertions(+) diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index 0f8246e8f98c90..4fa128a5f67fa8 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -1203,6 +1203,7 @@ def _init_weights(self, module): ) class AlignTextModel(AlignPreTrainedModel): config_class = AlignTextConfig + _no_split_modules = ["AlignTextEmbeddings"] def __init__(self, config: AlignTextConfig, add_pooling_layer: bool = True): super().__init__(config) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index ba8abb311a8d2a..3e184085331720 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -1034,6 +1034,7 @@ class AltCLIPPreTrainedModel(PreTrainedModel): config_class = AltCLIPConfig base_model_prefix = "altclip" supports_gradient_checkpointing = True + _no_split_module = [] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index f7af0f1ef5a48c..9e2847b11b53f0 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -962,6 +962,8 @@ class BertModel(BertPreTrainedModel): `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. 
""" + _no_split_modules = ["BertEmbeddings"] + def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 87a1baa217baf1..7d5c8f2fcc855d 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -1113,6 +1113,7 @@ class ChineseCLIPTextModel(ChineseCLIPPreTrainedModel): """ config_class = ChineseCLIPTextConfig + _no_split_modules = ["ChineseCLIPTextEmbeddings"] def __init__(self, config, add_pooling_layer=True): super().__init__(config) @@ -1284,6 +1285,7 @@ def forward( class ChineseCLIPVisionModel(ChineseCLIPPreTrainedModel): config_class = ChineseCLIPVisionConfig main_input_name = "pixel_values" + _no_split_modules = ["ChineseCLIPVisionEmbeddings", "ChineseCLIPVisionAttention"] def __init__(self, config: ChineseCLIPVisionConfig): super().__init__(config) diff --git a/src/transformers/models/depth_anything/modeling_depth_anything.py b/src/transformers/models/depth_anything/modeling_depth_anything.py index bed91ac2a482bc..043bd0fac807b2 100644 --- a/src/transformers/models/depth_anything/modeling_depth_anything.py +++ b/src/transformers/models/depth_anything/modeling_depth_anything.py @@ -364,6 +364,8 @@ def forward(self, hidden_states: List[torch.Tensor], patch_height, patch_width) DEPTH_ANYTHING_START_DOCSTRING, ) class DepthAnythingForDepthEstimation(DepthAnythingPreTrainedModel): + _no_split_modules = ["DPTViTEmbeddings"] + def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/dinov2/modeling_dinov2.py b/src/transformers/models/dinov2/modeling_dinov2.py index c25022f6ec22d8..c90221f145d4ba 100644 --- a/src/transformers/models/dinov2/modeling_dinov2.py +++ b/src/transformers/models/dinov2/modeling_dinov2.py @@ -481,6 +481,7 @@ class Dinov2PreTrainedModel(PreTrainedModel): base_model_prefix = "dinov2" main_input_name = "pixel_values" supports_gradient_checkpointing = True + _no_split_modules = ["Dinov2SwiGLUFFN"] def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: """Initialize the weights""" diff --git a/src/transformers/models/efficientformer/modeling_efficientformer.py b/src/transformers/models/efficientformer/modeling_efficientformer.py index 70075cff55d7d9..cc62e9cbd21e40 100644 --- a/src/transformers/models/efficientformer/modeling_efficientformer.py +++ b/src/transformers/models/efficientformer/modeling_efficientformer.py @@ -555,6 +555,7 @@ class EfficientFormerModel(EfficientFormerPreTrainedModel): def __init__(self, config: EfficientFormerConfig): super().__init__(config) self.config = config + _no_split_modules = ["EfficientFormerMeta4D"] self.patch_embed = EfficientFormerConvStem(config, config.hidden_sizes[0]) self.encoder = EfficientFormerEncoder(config) diff --git a/src/transformers/models/pvt/modeling_pvt.py b/src/transformers/models/pvt/modeling_pvt.py index b169af0cbd5668..4574ca37876039 100755 --- a/src/transformers/models/pvt/modeling_pvt.py +++ b/src/transformers/models/pvt/modeling_pvt.py @@ -462,6 +462,7 @@ class PvtPreTrainedModel(PreTrainedModel): config_class = PvtConfig base_model_prefix = "pvt" main_input_name = "pixel_values" + _no_split_modules = [] def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: """Initialize the weights""" diff --git a/src/transformers/models/vit_msn/modeling_vit_msn.py 
b/src/transformers/models/vit_msn/modeling_vit_msn.py index 9c2269a3ae546f..424d657dc87859 100644 --- a/src/transformers/models/vit_msn/modeling_vit_msn.py +++ b/src/transformers/models/vit_msn/modeling_vit_msn.py @@ -421,6 +421,7 @@ class ViTMSNPreTrainedModel(PreTrainedModel): base_model_prefix = "vit" main_input_name = "pixel_values" supports_gradient_checkpointing = True + _no_split_modules = ["ViTMSNAttention"] # todo: Resort to https://github.com/facebookresearch/msn/blob/main/src/deit.py#L200-#L211 # when creating pre-training scripts.
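
The series closes with #30409, which adds `_no_split_modules` to several models so that Accelerate can shard them across devices. As a trailing illustration (not part of any patch above), here is a minimal sketch of what that change enables; it assumes `accelerate` is installed, that at least one GPU is visible, and uses `facebook/dinov2-base` purely as an example checkpoint, since DINOv2 is one of the models touched by PATCH 18/18.

```python
# Minimal sketch, not from the patches above. Assumptions: `accelerate` is installed,
# at least one GPU is visible, and "facebook/dinov2-base" is just an example checkpoint;
# any model gaining `_no_split_modules` in PATCH 18/18 is loaded the same way.
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "facebook/dinov2-base",
    torch_dtype=torch.float16,
    device_map="auto",  # Accelerate places layers across the available devices, keeping each
                        # module listed in `_no_split_modules` (e.g. Dinov2SwiGLUFFN) on one device.
)
print(model.hf_device_map)  # mapping of submodules to the devices they were assigned
```

With a single device everything is simply placed there; the `_no_split_modules` hint only matters once Accelerate has to split the model across multiple GPUs or offload part of it to CPU.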