From 2de5cb12be7b7b3d5cf1f51880897a40879cb805 Mon Sep 17 00:00:00 2001 From: Matt Date: Fri, 26 Apr 2024 16:14:53 +0100 Subject: [PATCH 01/18] Use the Keras set_random_seed in tests (#30504) Use the Keras set_random_seed to ensure reproducible weight initialization --- tests/pipelines/test_pipelines_common.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 7b7301d6d8cd0c..c680b4c634de40 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -541,11 +541,10 @@ def test_load_default_pipelines_pt(self): @slow @require_tf def test_load_default_pipelines_tf(self): - import tensorflow as tf - + from transformers.modeling_tf_utils import keras from transformers.pipelines import SUPPORTED_TASKS - set_seed_fn = lambda: tf.random.set_seed(0) # noqa: E731 + set_seed_fn = lambda: keras.utils.set_random_seed(0) # noqa: E731 for task in SUPPORTED_TASKS.keys(): if task == "table-question-answering": # test table in seperate test due to more dependencies @@ -553,7 +552,7 @@ def test_load_default_pipelines_tf(self): self.check_default_pipeline(task, "tf", set_seed_fn, self.check_models_equal_tf) - # clean-up as much as possible GPU memory occupied by PyTorch + # clean-up as much as possible GPU memory occupied by TF gc.collect() @slow From dfa7b580e9863c38c2f0e0dedf0958c2eab9cb48 Mon Sep 17 00:00:00 2001 From: "JB (Don)" <1557853+hackyon@users.noreply.github.com> Date: Fri, 26 Apr 2024 23:23:44 +0800 Subject: [PATCH 02/18] [`BERT`] Add support for sdpa (#28802) * Adding SDPA support for BERT * Using the proper input name for testing model input in inference() * Adding documentation for SDPA in BERT model page * Use the stable link for the documentation * Adding a gate to only call .contiguous() for torch < 2.2.0 * Additions and fixes to the documentation * Minor updates to documentation * Adding extra requirements needed for the contiguous() bug * Adding "Adapted from" in plcae of the "Copied from" * Add benchmark speedup tables to the documentation * Minor fixes to the documentation * Use ClapText as a replacemenet for Bert in the Copied-From * Some more fixes for the fix-copies references * Overriding the test_eager_matches_sdpa_generate in bert tests to not load with low_cpu_mem_usage [test all] * Undo changes to separate test * Refactored SDPA self attention code for KV projections * Change use_sdpa to attn_implementation * Fix test_sdpa_can_dispatch_on_flash by preparing input (required for MultipleChoice models) --- docs/source/en/model_doc/bert.md | 47 +++++ docs/source/en/perf_infer_gpu_one.md | 12 +- src/transformers/modeling_attn_mask_utils.py | 6 +- .../models/align/modeling_align.py | 11 +- .../models/altclip/modeling_altclip.py | 15 +- src/transformers/models/bert/modeling_bert.py | 177 ++++++++++++++++-- .../modeling_bert_generation.py | 11 +- .../bridgetower/modeling_bridgetower.py | 11 +- .../models/camembert/modeling_camembert.py | 15 +- .../chinese_clip/modeling_chinese_clip.py | 11 +- src/transformers/models/clap/modeling_clap.py | 13 +- .../models/data2vec/modeling_data2vec_text.py | 13 +- .../models/electra/modeling_electra.py | 11 +- .../models/ernie/modeling_ernie.py | 13 +- src/transformers/models/git/modeling_git.py | 11 +- .../models/layoutlm/modeling_layoutlm.py | 11 +- .../models/markuplm/modeling_markuplm.py | 13 +- .../models/realm/modeling_realm.py | 11 +- .../models/roberta/modeling_roberta.py | 15 +- 
.../models/roc_bert/modeling_roc_bert.py | 14 +- .../models/splinter/modeling_splinter.py | 11 +- .../xlm_roberta/modeling_xlm_roberta.py | 15 +- .../xlm_roberta_xl/modeling_xlm_roberta_xl.py | 4 +- src/transformers/models/xmod/modeling_xmod.py | 2 +- tests/models/bert/test_modeling_bert.py | 82 +++++++- tests/test_modeling_common.py | 26 ++- 26 files changed, 495 insertions(+), 86 deletions(-) diff --git a/docs/source/en/model_doc/bert.md b/docs/source/en/model_doc/bert.md index c77a1d85252549..b6e99d1031e8f2 100644 --- a/docs/source/en/model_doc/bert.md +++ b/docs/source/en/model_doc/bert.md @@ -61,6 +61,53 @@ This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The o - The model must predict the original sentence, but has a second objective: inputs are two sentences A and B (with a separation token in between). With probability 50%, the sentences are consecutive in the corpus, in the remaining 50% they are not related. The model has to predict if the sentences are consecutive or not. +### Using Scaled Dot Product Attention (SDPA) + +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. + +``` +from transformers import BertModel + +model = BertModel.from_pretrained("bert-base-uncased", torch_dtype=torch.float16, attn_implementation="sdpa") +... +``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). + +On a local benchmark (A100-80GB, CPUx12, RAM 96.6GB, PyTorch 2.2.0, OS Ubuntu 22.04) with `float16`, we saw the +following speedups during training and inference. 
+ +#### Training + +|batch_size|seq_len|Time per batch (eager - s)|Time per batch (sdpa - s)|Speedup (%)|Eager peak mem (MB)|sdpa peak mem (MB)|Mem saving (%)| +|----------|-------|--------------------------|-------------------------|-----------|-------------------|------------------|--------------| +|4 |256 |0.023 |0.017 |35.472 |939.213 |764.834 |22.800 | +|4 |512 |0.023 |0.018 |23.687 |1970.447 |1227.162 |60.569 | +|8 |256 |0.023 |0.018 |23.491 |1594.295 |1226.114 |30.028 | +|8 |512 |0.035 |0.025 |43.058 |3629.401 |2134.262 |70.054 | +|16 |256 |0.030 |0.024 |25.583 |2874.426 |2134.262 |34.680 | +|16 |512 |0.064 |0.044 |46.223 |6964.659 |3961.013 |75.830 | + +#### Inference + +|batch_size|seq_len|Per token latency eager (ms)|Per token latency SDPA (ms)|Speedup (%)|Mem eager (MB)|Mem BT (MB)|Mem saved (%)| +|----------|-------|----------------------------|---------------------------|-----------|--------------|-----------|-------------| +|1 |128 |5.736 |4.987 |15.022 |282.661 |282.924 |-0.093 | +|1 |256 |5.689 |4.945 |15.055 |298.686 |298.948 |-0.088 | +|2 |128 |6.154 |4.982 |23.521 |314.523 |314.785 |-0.083 | +|2 |256 |6.201 |4.949 |25.303 |347.546 |347.033 |0.148 | +|4 |128 |6.049 |4.987 |21.305 |378.895 |379.301 |-0.107 | +|4 |256 |6.285 |5.364 |17.166 |443.209 |444.382 |-0.264 | + + + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BERT. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 494ba660fa763d..64583e4badf044 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -187,10 +187,11 @@ FlashAttention is more memory efficient, meaning you can train on much larger se ## PyTorch scaled dot product attention -PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA) can also call FlashAttention and memory-efficient attention kernels under the hood. SDPA support is currently being added natively in Transformers and is used by default for `torch>=2.1.1` when an implementation is available. +PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA) can also call FlashAttention and memory-efficient attention kernels under the hood. SDPA support is currently being added natively in Transformers and is used by default for `torch>=2.1.1` when an implementation is available. You may also set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. 
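As a quick illustration of the opt-in usage described above, here is a minimal sketch based on the snippet added to the BERT model page earlier in this patch (it assumes a CUDA device is available and that `bert-base-uncased` is reachable; the input text is purely illustrative):

```python
import torch
from transformers import AutoTokenizer, BertModel

# Request the PyTorch SDPA attention implementation explicitly rather than relying
# on the default dispatch for torch>=2.1.1.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained(
    "bert-base-uncased", torch_dtype=torch.float16, attn_implementation="sdpa"
).to("cuda")

inputs = tokenizer("Hello world", return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (batch_size, seq_len, hidden_size)
```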
For now, Transformers supports SDPA inference and training for the following architectures: * [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel) +* [Bert](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel) * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel) @@ -224,6 +225,13 @@ FlashAttention can only be used for models with the `fp16` or `bf16` torch type, + + +SDPA does not support certain sets of attention parameters, such as `head_mask` and `output_attentions=True`. +In that case, you should see a warning message and we will fall back to the (slower) eager implementation. + + + By default, SDPA selects the most performant kernel available but you can check whether a backend is available in a given setting (hardware, problem size) with [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager: ```diff @@ -232,8 +240,6 @@ from transformers import AutoModelForCausalLM, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16).to("cuda") -# convert the model to BetterTransformer -model.to_bettertransformer() input_text = "Hello my dog is cute and" inputs = tokenizer(input_text, return_tensors="pt").to("cuda") diff --git a/src/transformers/modeling_attn_mask_utils.py b/src/transformers/modeling_attn_mask_utils.py index c69d9555b2afc8..44ea1795669f58 100755 --- a/src/transformers/modeling_attn_mask_utils.py +++ b/src/transformers/modeling_attn_mask_utils.py @@ -445,10 +445,8 @@ def _prepare_4d_attention_mask_for_sdpa(mask: torch.Tensor, dtype: torch.dtype, or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling()) ) - if torch.all(mask == 1): - if is_tracing: - pass - elif tgt_len == 1: + if not is_tracing and torch.all(mask == 1): + if tgt_len == 1: # For query_length == 1, causal attention and bi-directional attention are the same. 
return None elif key_value_length == tgt_len: diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index 3dce9d383da151..0f8246e8f98c90 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -883,11 +883,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->AlignText +ALIGN_TEXT_SELF_ATTENTION_CLASSES = { + "eager": AlignTextSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->AlignText,BERT->ALIGN_TEXT class AlignTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = AlignTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = ALIGN_TEXT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = AlignTextSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 0d27d87de7f4f1..ba8abb311a8d2a 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -434,11 +434,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->AltRoberta +ALT_ROBERTA_SELF_ATTENTION_CLASSES = { + "eager": AltRobertaSelfAttention, +} + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->AltRoberta,ROBERTA->ALT_ROBERTA class AltRobertaAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = AltRobertaSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = ALT_ROBERTA_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = AltRobertaSelfOutput(config) self.pruned_heads = set() @@ -1205,7 +1212,7 @@ class AltRobertaModel(AltCLIPPreTrainedModel): config_class = AltCLIPTextConfig - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->AltRoberta + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->AltRoberta def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -1232,7 +1239,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - # Copied from transformers.models.bert.modeling_bert.BertModel.forward + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward def forward( self, input_ids: Optional[torch.Tensor] = None, diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 262fc79f0d4039..f7af0f1ef5a48c 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -23,10 +23,15 @@ import torch import torch.utils.checkpoint +from packaging import version from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN +from ...modeling_attn_mask_utils import ( + 
_prepare_4d_attention_mask_for_sdpa, + _prepare_4d_causal_attention_mask_for_sdpa, +) from ...modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions, @@ -45,6 +50,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + get_torch_version, logging, replace_return_docstrings, ) @@ -350,6 +356,103 @@ def forward( return outputs +class BertSdpaSelfAttention(BertSelfAttention): + def __init__(self, config, position_embedding_type=None): + super().__init__(config, position_embedding_type=position_embedding_type) + self.dropout_prob = config.attention_probs_dropout_prob + self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0") + + # Adapted from BertSelfAttention + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None: + # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented. + logger.warning_once( + "BertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support " + "non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to " + "the manual attention implementation, but specifying the manual implementation will be required from " + "Transformers version v5.0.0 onwards. This warning can be removed using the argument " + '`attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + bsz, tgt_len, _ = hidden_states.size() + + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention + # mask needs to be such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + attention_mask = encoder_attention_mask if is_cross_attention else attention_mask + + # Check `seq_length` of `past_key_value` == `len(current_states)` to support prefix tuning + if is_cross_attention and past_key_value and past_key_value[0].shape[2] == current_states.shape[1]: + key_layer, value_layer = past_key_value + else: + key_layer = self.transpose_for_scores(self.key(current_states)) + value_layer = self.transpose_for_scores(self.value(current_states)) + if past_key_value is not None and not is_cross_attention: + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. 
+ # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom + # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0. + # Reference: https://github.com/pytorch/pytorch/issues/112577 + if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None: + query_layer = query_layer.contiguous() + key_layer = key_layer.contiguous() + value_layer = value_layer.contiguous() + + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal + # mask in case tgt_len == 1. + is_causal = self.is_decoder and attention_mask is None and tgt_len > 1 + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attn_mask=attention_mask, + dropout_p=self.dropout_prob if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size) + + outputs = (attn_output,) + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + class BertSelfOutput(nn.Module): def __init__(self, config): super().__init__() @@ -364,10 +467,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states +BERT_SELF_ATTENTION_CLASSES = { + "eager": BertSelfAttention, + "sdpa": BertSdpaSelfAttention, +} + + class BertAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = BertSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = BERT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = BertSelfOutput(config) self.pruned_heads = set() @@ -715,6 +826,7 @@ class BertPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_bert base_model_prefix = "bert" supports_gradient_checkpointing = True + _supports_sdpa = True def _init_weights(self, module): """Initialize the weights""" @@ -859,6 +971,9 @@ def __init__(self, config, add_pooling_layer=True): self.pooler = BertPooler(config) if add_pooling_layer else None + self.attn_implementation = config._attn_implementation + self.position_embedding_type = config.position_embedding_type + # Initialize weights and apply final processing self.post_init() @@ -945,9 +1060,6 @@ def forward( # past_key_values_length past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - if attention_mask is None: - attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) - if token_type_ids is None: if hasattr(self.embeddings, "token_type_ids"): buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] @@ -956,9 +1068,43 @@ def forward( else: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - # We can provide a self-attention mask 
of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is None: + attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device) + + use_sdpa_attention_masks = ( + self.attn_implementation == "sdpa" + and self.position_embedding_type == "absolute" + and head_mask is None + and not output_attentions + ) + + # Expand the attention mask + if use_sdpa_attention_masks: + # Expand the attention mask for SDPA. + # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] + if self.config.is_decoder: + extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + input_shape, + embedding_output, + past_key_values_length, + ) + else: + extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( + attention_mask, embedding_output.dtype, tgt_len=seq_length + ) + else: + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] @@ -967,7 +1113,15 @@ def forward( encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + + if use_sdpa_attention_masks: + # Expand the attention mask for SDPA. 
+ # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] + encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length + ) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None @@ -978,13 +1132,6 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - embedding_output = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, - ) encoder_outputs = self.encoder( embedding_output, attention_mask=extended_attention_mask, diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index b7250f6f7b926f..73c4d1d1e5da91 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -192,11 +192,18 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->BertGeneration +BERT_GENERATION_SELF_ATTENTION_CLASSES = { + "eager": BertGenerationSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->BertGeneration,BERT->BERT_GENERATION class BertGenerationAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = BertGenerationSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = BERT_GENERATION_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = BertGenerationSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index bcace39b299bcf..3fc9f755aab9e2 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -562,11 +562,18 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->BridgeTower +BRIDGE_TOWER_SELF_ATTENTION_CLASSES = { + "eager": BridgeTowerSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->BridgeTower,BERT->BRIDGE_TOWER class BridgeTowerAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = BridgeTowerSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = BRIDGE_TOWER_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = BridgeTowerSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py index 26250896b23d8a..f399fb3f5cfb9b 100644 --- a/src/transformers/models/camembert/modeling_camembert.py +++ b/src/transformers/models/camembert/modeling_camembert.py @@ -312,11 +312,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with 
Roberta->Camembert +CAMEMBERT_SELF_ATTENTION_CLASSES = { + "eager": CamembertSelfAttention, +} + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->Camembert,ROBERTA->CAMEMBERT class CamembertAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = CamembertSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = CAMEMBERT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = CamembertSelfOutput(config) self.pruned_heads = set() @@ -745,7 +752,7 @@ class CamembertModel(CamembertPreTrainedModel): _no_split_modules = [] - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Camembert + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->Camembert def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -778,7 +785,7 @@ class PreTrainedModel output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC, ) - # Copied from transformers.models.bert.modeling_bert.BertModel.forward + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward def forward( self, input_ids: Optional[torch.Tensor] = None, diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index d8e97c20b24cd0..87a1baa217baf1 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -354,11 +354,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->ChineseCLIPText +CHINESE_CLIP_TEXT_SELF_ATTENTION_CLASSES = { + "eager": ChineseCLIPTextSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->ChineseCLIPText,BERT->CHINESE_CLIP_TEXT class ChineseCLIPTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = ChineseCLIPTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = CHINESE_CLIP_TEXT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = ChineseCLIPTextSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 7b20b30137d2cb..c21e173133a17f 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1376,11 +1376,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->ClapText +CLAP_TEXT_SELF_ATTENTION_CLASSES = { + "eager": ClapTextSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->ClapText,BERT->CLAP_TEXT class ClapTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = ClapTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = CLAP_TEXT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) 
self.output = ClapTextSelfOutput(config) self.pruned_heads = set() @@ -1763,7 +1770,6 @@ class ClapTextModel(ClapPreTrainedModel): config_class = ClapTextConfig - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->ClapText def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -1782,7 +1788,6 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value - # Copied from transformers.models.bert.modeling_bert.BertModel.forward def forward( self, input_ids: Optional[torch.Tensor] = None, diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 7dcc53e2cc15c8..20e1e1eca5ffab 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -298,11 +298,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Data2VecText +DATA2VEC_TEXT_SELF_ATTENTION_CLASSES = { + "eager": Data2VecTextSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Data2VecText,BERT->DATA2VEC_TEXT class Data2VecTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = Data2VecTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = DATA2VEC_TEXT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = Data2VecTextSelfOutput(config) self.pruned_heads = set() @@ -727,7 +734,7 @@ class PreTrainedModel output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC, ) - # Copied from transformers.models.bert.modeling_bert.BertModel.forward + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward def forward( self, input_ids: Optional[torch.Tensor] = None, diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index 2138aa97c6dca9..6fbdda2579c1a4 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -355,11 +355,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Electra +ELECTRA_SELF_ATTENTION_CLASSES = { + "eager": ElectraSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Electra,BERT->ELECTRA class ElectraAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = ElectraSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = ELECTRA_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = ElectraSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index a65f453205d5c5..3db6501985604d 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -285,11 +285,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to 
return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Ernie +ERNIE_SELF_ATTENTION_CLASSES = { + "eager": ErnieSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Ernie,BERT->ERNIE class ErnieAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = ErnieSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = ERNIE_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = ErnieSelfOutput(config) self.pruned_heads = set() @@ -787,7 +794,7 @@ class ErnieModel(ErniePreTrainedModel): `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. """ - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Ernie + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->Ernie def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index c8953d498428ea..12821609f037bf 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -267,11 +267,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states +GIT_SELF_ATTENTION_CLASSES = { + "eager": GitSelfAttention, +} + + class GitAttention(nn.Module): - # Copied from transformers.models.bert.modeling_bert.BertAttention.__init__ with Bert->Git + # Copied from transformers.models.bert.modeling_bert.BertAttention.__init__ with Bert->Git,BERT->GIT def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = GitSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = GIT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = GitSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index c570fdb124adc1..6914f5ee3efb62 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -276,11 +276,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->LayoutLM +LAYOUTLM_SELF_ATTENTION_CLASSES = { + "eager": LayoutLMSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->LayoutLM,BERT->LAYOUTLM class LayoutLMAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = LayoutLMSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = LAYOUTLM_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = LayoutLMSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index 2058ce27951676..318110daf5d8d1 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -468,11 
+468,18 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->MarkupLM +MARKUPLM_SELF_ATTENTION_CLASSES = { + "eager": MarkupLMSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->MarkupLM,BERT->MARKUPLM class MarkupLMAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = MarkupLMSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = MARKUPLM_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = MarkupLMSelfOutput(config) self.pruned_heads = set() @@ -797,7 +804,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P MARKUPLM_START_DOCSTRING, ) class MarkupLMModel(MarkupLMPreTrainedModel): - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->MarkupLM + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->MarkupLM def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 86f28942893399..adec5647a28134 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -368,11 +368,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Realm +REALM_SELF_ATTENTION_CLASSES = { + "eager": RealmSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Realm,BERT->REALM class RealmAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = RealmSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = REALM_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = RealmSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index e1f15722e43bdf..640139212081ca 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -294,11 +294,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Roberta +ROBERTA_SELF_ATTENTION_CLASSES = { + "eager": RobertaSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Roberta,BERT->ROBERTA class RobertaAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = RobertaSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = ROBERTA_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = RobertaSelfOutput(config) self.pruned_heads = set() @@ -688,7 +695,7 @@ class RobertaModel(RobertaPreTrainedModel): """ - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with 
ClapText->Roberta def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -721,7 +728,7 @@ class PreTrainedModel output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC, ) - # Copied from transformers.models.bert.modeling_bert.BertModel.forward + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward def forward( self, input_ids: Optional[torch.Tensor] = None, diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index 51850c9af1d5c0..739e60b550baf3 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -431,11 +431,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->RoCBert +ROC_BERT_SELF_ATTENTION_CLASSES = { + "eager": RoCBertSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->RoCBert,BERT->ROC_BERT class RoCBertAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = RoCBertSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = ROC_BERT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = RoCBertSelfOutput(config) self.pruned_heads = set() @@ -759,7 +766,6 @@ def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: return prediction_scores -# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel with Bert->RoCBert,bert->roc_bert class RoCBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -880,7 +886,7 @@ class RoCBertModel(RoCBertPreTrainedModel): `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. 
""" - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->RoCBert + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->RoCBert def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index b643601d0ebd49..fa546e1201346a 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -245,11 +245,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Splinter +SPLINTER_SELF_ATTENTION_CLASSES = { + "eager": SplinterSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Splinter,BERT->SPLINTER class SplinterAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = SplinterSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = SPLINTER_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = SplinterSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index 0d829aaee63582..48c6898811d1e0 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -295,11 +295,18 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->XLMRoberta +XLM_ROBERTA_SELF_ATTENTION_CLASSES = { + "eager": XLMRobertaSelfAttention, +} + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->XLMRoberta,ROBERTA->XLM_ROBERTA class XLMRobertaAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = XLMRobertaSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = XLM_ROBERTA_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = XLMRobertaSelfOutput(config) self.pruned_heads = set() @@ -690,7 +697,7 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel): """ - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->XLMRoberta + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->XLMRoberta def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -723,7 +730,7 @@ class PreTrainedModel output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC, ) - # Copied from transformers.models.bert.modeling_bert.BertModel.forward + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward def forward( self, input_ids: Optional[torch.Tensor] = None, diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index 1c17652dfa0cb4..d8994e335b1242 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ 
b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -664,7 +664,7 @@ class XLMRobertaXLModel(XLMRobertaXLPreTrainedModel): an input to the forward pass. .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762 """ - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->XLMRobertaXL + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->XLMRobertaXL def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -697,7 +697,7 @@ class PreTrainedModel output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC, ) - # Copied from transformers.models.bert.modeling_bert.BertModel.forward + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward def forward( self, input_ids: Optional[torch.Tensor] = None, diff --git a/src/transformers/models/xmod/modeling_xmod.py b/src/transformers/models/xmod/modeling_xmod.py index 2bf76a40d46974..32e34ef6683817 100644 --- a/src/transformers/models/xmod/modeling_xmod.py +++ b/src/transformers/models/xmod/modeling_xmod.py @@ -783,7 +783,7 @@ class XmodModel(XmodPreTrainedModel): """ - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Xmod + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->Xmod def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index bdc812ff27657b..ff9a62802048e9 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -18,7 +18,14 @@ from transformers import BertConfig, is_torch_available from transformers.models.auto import get_values -from transformers.testing_utils import CaptureLogger, require_torch, require_torch_accelerator, slow, torch_device +from transformers.testing_utils import ( + CaptureLogger, + require_torch, + require_torch_accelerator, + require_torch_sdpa, + slow, + torch_device, +) from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -621,6 +628,79 @@ def test_torchscript_device_change(self): loaded = torch.jit.load(os.path.join(tmp, "bert.pt"), map_location=torch_device) loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device)) + # This test was copied from the common test_eager_matches_sdpa_generate(), but without low_cpu_mem_usage=True. + # TODO: Remove this and use the parent method (in common tests) once BERT supports low_cpu_mem_usage=True. 
+ @require_torch_sdpa + @slow + def test_eager_matches_sdpa_generate(self): + max_new_tokens = 30 + + if len(self.all_generative_model_classes) == 0: + self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test") + + for model_class in self.all_generative_model_classes: + if not model_class._supports_sdpa: + self.skipTest(f"{model_class.__name__} does not support SDPA") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + dummy_input = inputs_dict[model_class.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + + model_sdpa = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + # low_cpu_mem_usage=True, + ).to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + # low_cpu_mem_usage=True, + attn_implementation="eager", + ).to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa: + raise ValueError("The SDPA model should have SDPA attention layers") + + # Just test that a large cache works as expected + res_eager = model_eager.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False + ) + + res_sdpa = model_sdpa.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False + ) + + self.assertTrue(torch.allclose(res_eager, res_sdpa)) + @require_torch class BertModelIntegrationTest(unittest.TestCase): diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 1c099a4035b440..061c0000ceedfe 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3603,12 +3603,14 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): self.assertTrue(model_eager.config._attn_implementation == "eager") for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: raise ValueError("The eager model should not have SDPA attention layers") has_sdpa = False for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: has_sdpa = True break if not has_sdpa and model_sdpa.config.model_type != "falcon": @@ -3691,19 +3693,21 @@ def get_mean_reldiff(failcase, x, ref, 
atol, rtol): decoder_input_ids = decoder_input_ids.to(torch_device) # TODO: never an `attention_mask` arg here? - other_inputs = { + processed_inputs = { + model.main_input_name: dummy_input, "decoder_input_ids": decoder_input_ids, "decoder_attention_mask": dummy_attention_mask, "output_hidden_states": True, } else: - other_inputs = { + processed_inputs = { + model.main_input_name: dummy_input, "output_hidden_states": True, } # Otherwise fails for e.g. WhisperEncoderModel if "attention_mask" in inspect.signature(model_eager.forward).parameters: - other_inputs["attention_mask"] = dummy_attention_mask + processed_inputs["attention_mask"] = dummy_attention_mask # TODO: test gradients as well (& for FA2 as well!) with torch.no_grad(): @@ -3712,8 +3716,9 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): enable_math=True, enable_mem_efficient=enable_kernels, ): - outputs_eager = model_eager(dummy_input, **other_inputs) - outputs_sdpa = model_sdpa(dummy_input, **other_inputs) + prepared_inputs = self._prepare_for_class(processed_inputs, model_class) + outputs_eager = model_eager(**prepared_inputs) + outputs_sdpa = model_sdpa(**prepared_inputs) logits_eager = ( outputs_eager.hidden_states[-1] @@ -3799,6 +3804,7 @@ def test_sdpa_can_dispatch_on_flash(self): self.skipTest(f"{model_class.__name__} does not support SDPA") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + inputs_dict = self._prepare_for_class(inputs_dict, model_class) if config.model_type in ["llava", "llava_next", "vipllava"]: self.skipTest("Llava-like models currently (transformers==4.39.1) requires an attention_mask input") if config.model_type in ["idefics"]: @@ -3867,12 +3873,14 @@ def test_eager_matches_sdpa_generate(self): self.assertTrue(model_eager.config._attn_implementation == "eager") for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: raise ValueError("The eager model should not have SDPA attention layers") has_sdpa = False for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: has_sdpa = True break if not has_sdpa: From 77ff304d290d17c9cca4aa2f03330489812e423d Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Fri, 26 Apr 2024 11:52:09 -0400 Subject: [PATCH 03/18] Remove skipping logic now that set_epoch exists (#30501) * Remove skipping logic now that set_epoch exists * Working version, clean --- src/transformers/trainer.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 26ab0877d0c171..1d3c164984ea1c 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -96,7 +96,6 @@ distributed_broadcast_scalars, distributed_concat, find_batch_size, - get_dataloader_sampler, get_model_param_count, get_module_class_from_name, get_parameter_names, @@ -2137,24 +2136,6 @@ def _inner_training_loop( self.control = self.callback_handler.on_train_begin(args, self.state, self.control) - # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. 
- if not args.ignore_data_skip: - for epoch in range(epochs_trained): - sampler = get_dataloader_sampler(train_dataloader) - sampler_kinds = [RandomSampler] - if version.parse(accelerate_version) > version.parse("0.23.0"): - sampler_kinds.append(SeedableRandomSampler) - is_random_sampler = isinstance(sampler, tuple(sampler_kinds)) - if not is_random_sampler: - # We just need to begin an iteration to create the randomization of the sampler. - for _ in train_dataloader: - break - else: - # Otherwise we need to call the whooooole sampler cause there is some random operation added - # AT THE VERY END! - sampler = sampler if sampler is not None else [] - _ = list(sampler) - total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): epoch_iterator = train_dataloader From aafa7ce72b65c730788c122a72a974e464409e9a Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:55:24 +0100 Subject: [PATCH 04/18] [`DETR`] Remove timm hardcoded logic in modeling files (#29038) * Enable instantiating model with pretrained backbone weights * Clarify pretrained import * Use load_backbone instead * Add backbone_kwargs to config * Fix up * Add tests * Tidy up * Enable instantiating model with pretrained backbone weights * Update tests so backbone checkpoint isn't passed in * Clarify pretrained import * Update configs - docs and validation check * Update src/transformers/utils/backbone_utils.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Clarify exception message * Update config init in tests * Add test for when use_timm_backbone=True * Use load_backbone instead * Add use_timm_backbone to the model configs * Add backbone_kwargs to config * Pass kwargs to constructors * Draft * Fix tests * Add back timm - weight naming * More tidying up * Whoops * Tidy up * Handle when kwargs are none * Update tests * Revert test changes * Deformable detr test - don't use default * Don't mutate; correct model attributes * Add some clarifying comments * nit - grammar is hard --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- .../configuration_conditional_detr.py | 14 ++++-- .../modeling_conditional_detr.py | 18 ++++--- .../configuration_deformable_detr.py | 12 ++++- .../modeling_deformable_detr.py | 49 ++++++++++++------- .../models/detr/configuration_detr.py | 14 +++++- src/transformers/models/detr/modeling_detr.py | 16 ++++-- src/transformers/models/dpt/modeling_dpt.py | 6 +-- .../configuration_table_transformer.py | 14 +++++- .../modeling_table_transformer.py | 14 ++++-- .../timm_backbone/modeling_timm_backbone.py | 7 ++- .../test_modeling_conditional_detr.py | 10 ++++ .../test_modeling_deformable_detr.py | 11 ++++- tests/models/detr/test_modeling_detr.py | 11 +++++ .../test_modeling_table_transformer.py | 8 +++ 14 files changed, 156 insertions(+), 48 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 945e5edb32ad30..4f95de3582f082 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -192,10 +192,16 @@ def __init__( if backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") - if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: - 
raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - - if not use_timm_backbone: + # We default to values which were previously hard-coded in the model. This enables configurability of the config + # while keeping the default behavior the same. + if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [1, 2, 3, 4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): if backbone_config is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index d8ff371fad77d1..d723d3866ea416 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -338,12 +338,12 @@ def replace_batch_norm(model): replace_batch_norm(module) -# Copied from transformers.models.detr.modeling_detr.DetrConvEncoder +# Copied from transformers.models.detr.modeling_detr.DetrConvEncoder with Detr->ConditionalDetr class ConditionalDetrConvEncoder(nn.Module): """ Convolutional backbone, using either the AutoBackbone API or one from the timm library. - nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above. + nn.BatchNorm2d layers are replaced by ConditionalDetrFrozenBatchNorm2d as defined above. """ @@ -352,17 +352,23 @@ def __init__(self, config): self.config = config + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. requires_backends(self, ["timm"]) - kwargs = {} + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) + num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: - kwargs["output_stride"] = 16 + kwargs["output_stride"] = kwargs.get("output_stride", 16) backbone = create_model( config.backbone, pretrained=config.use_pretrained_backbone, features_only=True, - out_indices=(1, 2, 3, 4), - in_chans=config.num_channels, + out_indices=out_indices, + in_chans=num_channels, **kwargs, ) else: diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index 6d32f6220df586..3f3ffff69ff2e9 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -212,7 +212,16 @@ def __init__( if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - if not use_timm_backbone: + # We default to values which were previously hard-coded in the model. This enables configurability of the config + # while keeping the default behavior the same. 
+ if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [2, 3, 4] if num_feature_levels > 1 else [4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): if backbone_config is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) @@ -220,6 +229,7 @@ def __init__( backbone_model_type = backbone_config.get("model_type") config_class = CONFIG_MAPPING[backbone_model_type] backbone_config = config_class.from_dict(backbone_config) + self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config self.num_channels = num_channels diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index c0ac7cffc7ab44..7b2bbb9b1242c9 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -88,11 +88,31 @@ def load_cuda_kernels(): if is_vision_available(): from transformers.image_transforms import center_to_corners_format + if is_accelerate_available(): from accelerate import PartialState from accelerate.utils import reduce +if is_timm_available(): + from timm import create_model + + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "DeformableDetrConfig" +_CHECKPOINT_FOR_DOC = "sensetime/deformable-detr" + +DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "sensetime/deformable-detr", + # See all Deformable DETR models at https://huggingface.co/models?filter=deformable-detr +] + + class MultiScaleDeformableAttentionFunction(Function): @staticmethod def forward( @@ -141,21 +161,6 @@ def backward(context, grad_output): return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None -if is_scipy_available(): - from scipy.optimize import linear_sum_assignment - -if is_timm_available(): - from timm import create_model - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "DeformableDetrConfig" -_CHECKPOINT_FOR_DOC = "sensetime/deformable-detr" - - -from ..deprecated._archive_maps import DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST # noqa: F401, E402 - - @dataclass class DeformableDetrDecoderOutput(ModelOutput): """ @@ -420,17 +425,23 @@ def __init__(self, config): self.config = config + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. 
requires_backends(self, ["timm"]) - kwargs = {} + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (2, 3, 4) if config.num_feature_levels > 1 else (4,)) + num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: - kwargs["output_stride"] = 16 + kwargs["output_stride"] = kwargs.get("output_stride", 16) backbone = create_model( config.backbone, pretrained=config.use_pretrained_backbone, features_only=True, - out_indices=(2, 3, 4) if config.num_feature_levels > 1 else (4,), - in_chans=config.num_channels, + out_indices=out_indices, + in_chans=num_channels, **kwargs, ) else: diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index 9b9b5afacd0b7f..db180ef1d41fed 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -193,7 +193,16 @@ def __init__( if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - if not use_timm_backbone: + # We default to values which were previously hard-coded in the model. This enables configurability of the config + # while keeping the default behavior the same. + if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [1, 2, 3, 4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): if backbone_config is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) @@ -201,8 +210,9 @@ def __init__( backbone_model_type = backbone_config.get("model_type") config_class = CONFIG_MAPPING[backbone_model_type] backbone_config = config_class.from_dict(backbone_config) + backbone = None # set timm attributes to None - dilation, backbone, use_pretrained_backbone = None, None, None + dilation = None self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index d7fcdfc5bc7e83..0da702db8b67e2 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -49,9 +49,11 @@ if is_scipy_available(): from scipy.optimize import linear_sum_assignment + if is_timm_available(): from timm import create_model + if is_vision_available(): from transformers.image_transforms import center_to_corners_format @@ -345,17 +347,23 @@ def __init__(self, config): self.config = config + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. 
requires_backends(self, ["timm"]) - kwargs = {} + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) + num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: - kwargs["output_stride"] = 16 + kwargs["output_stride"] = kwargs.get("output_stride", 16) backbone = create_model( config.backbone, pretrained=config.use_pretrained_backbone, features_only=True, - out_indices=(1, 2, 3, 4), - in_chans=config.num_channels, + out_indices=out_indices, + in_chans=num_channels, **kwargs, ) else: diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index aad3330279f051..ef6c8bb853abda 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -1075,10 +1075,10 @@ def __init__(self, config): super().__init__(config) self.backbone = None - if config.backbone_config is not None and config.is_hybrid is False: - self.backbone = load_backbone(config) - else: + if config.is_hybrid or config.backbone_config is None: self.dpt = DPTModel(config, add_pooling_layer=False) + else: + self.backbone = load_backbone(config) # Neck self.neck = DPTNeck(config) diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index 9a2ff6bbab3b24..4963396024a57e 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -193,7 +193,16 @@ def __init__( if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - if not use_timm_backbone: + # We default to values which were previously hard-coded in the model. This enables configurability of the config + # while keeping the default behavior the same. + if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [1, 2, 3, 4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): if backbone_config is None: logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) @@ -201,8 +210,9 @@ def __init__( backbone_model_type = backbone_config.get("model_type") config_class = CONFIG_MAPPING[backbone_model_type] backbone_config = config_class.from_dict(backbone_config) + backbone = None # set timm attributes to None - dilation, backbone, use_pretrained_backbone = None, None, None + dilation = None self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 8e577a65a5fe00..9a684ee121ddca 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -279,17 +279,23 @@ def __init__(self, config): self.config = config + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. requires_backends(self, ["timm"]) - kwargs = {} + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) + num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: - kwargs["output_stride"] = 16 + kwargs["output_stride"] = kwargs.get("output_stride", 16) backbone = create_model( config.backbone, pretrained=config.use_pretrained_backbone, features_only=True, - out_indices=(1, 2, 3, 4), - in_chans=config.num_channels, + out_indices=out_indices, + in_chans=num_channels, **kwargs, ) else: diff --git a/src/transformers/models/timm_backbone/modeling_timm_backbone.py b/src/transformers/models/timm_backbone/modeling_timm_backbone.py index 0c6fe67b75731f..e8e0b28e042d6f 100644 --- a/src/transformers/models/timm_backbone/modeling_timm_backbone.py +++ b/src/transformers/models/timm_backbone/modeling_timm_backbone.py @@ -63,12 +63,13 @@ def __init__(self, config, **kwargs): # We just take the final layer by default. This matches the default for the transformers models. out_indices = config.out_indices if getattr(config, "out_indices", None) is not None else (-1,) + in_chans = kwargs.pop("in_chans", config.num_channels) self._backbone = timm.create_model( config.backbone, pretrained=pretrained, # This is currently not possible for transformer architectures. features_only=config.features_only, - in_chans=config.num_channels, + in_chans=in_chans, out_indices=out_indices, **kwargs, ) @@ -79,7 +80,9 @@ def __init__(self, config, **kwargs): # These are used to control the output of the model when called. If output_hidden_states is True, then # return_layers is modified to include all layers. 
- self._return_layers = self._backbone.return_layers + self._return_layers = { + layer["module"]: str(layer["index"]) for layer in self._backbone.feature_info.get_dicts() + } self._all_layers = {layer["module"]: str(i) for i, layer in enumerate(self._backbone.feature_info.info)} super()._init_backbone(config) diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index d1152ed8622b9c..c3f77614b4dd31 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -444,7 +444,9 @@ def test_different_timm_backbone(self): # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" + config.backbone_config = None config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} for model_class in self.all_model_classes: model = model_class(config) @@ -460,6 +462,14 @@ def test_different_timm_backbone(self): self.model_tester.num_labels, ) self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + elif model_class.__name__ == "ConditionalDetrForSegmentation": + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index 7a83c4f1ed80a8..36be099790a45b 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -521,8 +521,9 @@ def test_different_timm_backbone(self): # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" - config.use_timm_backbone = True config.backbone_config = None + config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [1, 2, 3, 4]} for model_class in self.all_model_classes: model = model_class(config) @@ -538,6 +539,14 @@ def test_different_timm_backbone(self): self.model_tester.num_labels, ) self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 4) + elif model_class.__name__ == "ConditionalDetrForSegmentation": + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.deformable_detr.model.backbone.conv_encoder.intermediate_channel_sizes), 4) + else: + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 4) self.assertTrue(outputs) diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index 59b071e031aa8a..27092c626dd46d 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -444,6 +444,9 @@ def test_different_timm_backbone(self): # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" + config.backbone_config = None + config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} for model_class in self.all_model_classes: model = 
model_class(config) @@ -459,6 +462,14 @@ def test_different_timm_backbone(self): self.model_tester.num_labels + 1, ) self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + elif model_class.__name__ == "DetrForSegmentation": + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index 79da1d191063ab..d323083eb7f1d4 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -456,6 +456,9 @@ def test_different_timm_backbone(self): # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" + config.backbone_config = None + config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} for model_class in self.all_model_classes: model = model_class(config) @@ -471,6 +474,11 @@ def test_different_timm_backbone(self): self.model_tester.num_labels + 1, ) self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) From 38b53da38af231b0af967d15ca29c52470e402d5 Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Fri, 26 Apr 2024 17:06:03 +0100 Subject: [PATCH 05/18] [examples] update whisper fine-tuning (#29938) * [examples] update whisper fine-tuning * deprecate forced/suppress tokens * item assignment * update readme * final fix --- examples/pytorch/speech-recognition/README.md | 9 ++-- .../run_speech_recognition_seq2seq.py | 45 ++++++++++++++----- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/examples/pytorch/speech-recognition/README.md b/examples/pytorch/speech-recognition/README.md index b9cab9513bd446..4990219f42a143 100644 --- a/examples/pytorch/speech-recognition/README.md +++ b/examples/pytorch/speech-recognition/README.md @@ -368,6 +368,7 @@ python run_speech_recognition_seq2seq.py \ --dataset_name="mozilla-foundation/common_voice_11_0" \ --dataset_config_name="hi" \ --language="hindi" \ + --task="transcribe" \ --train_split_name="train+validation" \ --eval_split_name="test" \ --max_steps="5000" \ @@ -384,12 +385,10 @@ python run_speech_recognition_seq2seq.py \ --save_steps="1000" \ --generation_max_length="225" \ --preprocessing_num_workers="16" \ - --length_column_name="input_length" \ --max_duration_in_seconds="30" \ --text_column_name="sentence" \ --freeze_feature_encoder="False" \ --gradient_checkpointing \ - --group_by_length \ --fp16 \ --overwrite_output_dir \ --do_train \ @@ -399,7 +398,8 @@ python run_speech_recognition_seq2seq.py \ ``` On a single V100, training should take approximately 8 hours, with a final cross-entropy loss of **1e-4** and word error rate of **32.6%**. 
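The command above relies on the reworked `language`/`task` handling rather than the deprecated `forced_decoder_ids`/`suppress_tokens` model arguments. A minimal sketch of what the updated script now does internally for a multilingual checkpoint (the checkpoint name is only an example, not part of this patch):

```python
from transformers import WhisperForConditionalGeneration, WhisperTokenizer

# Any multilingual Whisper checkpoint works here; "openai/whisper-small" is illustrative.
model_id = "openai/whisper-small"
tokenizer = WhisperTokenizer.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id)

# Instead of writing forced_decoder_ids / suppress_tokens into the model config,
# the language and task are set on the tokenizer prefix tokens and on the
# generation config, mirroring run_speech_recognition_seq2seq.py.
tokenizer.set_prefix_tokens(language="hindi", task="transcribe")
model.generation_config.update(language="hindi", task="transcribe")
```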
-If training on a different language, you should be sure to change the `language` argument. The `language` argument should be omitted for English speech recognition. +If training on a different language, you should be sure to change the `language` argument. The `language` and `task` +arguments should be omitted for English speech recognition. #### Multi GPU Whisper Training The following example shows how to fine-tune the [Whisper small](https://huggingface.co/openai/whisper-small) checkpoint on the Hindi subset of [Common Voice 11](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0) using 2 GPU devices in half-precision: @@ -410,6 +410,7 @@ torchrun \ --dataset_name="mozilla-foundation/common_voice_11_0" \ --dataset_config_name="hi" \ --language="hindi" \ + --task="transcribe" \ --train_split_name="train+validation" \ --eval_split_name="test" \ --max_steps="5000" \ @@ -425,12 +426,10 @@ torchrun \ --save_steps="1000" \ --generation_max_length="225" \ --preprocessing_num_workers="16" \ - --length_column_name="input_length" \ --max_duration_in_seconds="30" \ --text_column_name="sentence" \ --freeze_feature_encoder="False" \ --gradient_checkpointing \ - --group_by_length \ --fp16 \ --overwrite_output_dir \ --do_train \ diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 3a596e2cb7bddd..f352954d80aefa 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -119,17 +119,16 @@ class ModelArguments: ) forced_decoder_ids: List[List[int]] = field( default=None, - metadata={ + metadata={"help": "Deprecated. Please use the `language` and `task` arguments instead."}, + ) + suppress_tokens: List[int] = field( + default=None, metadata={ "help": ( - "A list of pairs of integers which indicates a mapping from generation indices to token indices " - "that will be forced before sampling. For example, [[0, 123]] means the first generated token " - "will always be a token of index 123." + "Deprecated. The use of `suppress_tokens` should not be required for the majority of fine-tuning examples." + "Should you need to use `suppress_tokens`, please manually update them in the fine-tuning script directly." ) }, ) - suppress_tokens: List[int] = field( - default=None, metadata={"help": "A list of tokens that will be suppressed at generation."} - ) apply_spec_augment: bool = field( default=False, metadata={ @@ -400,8 +399,6 @@ def main(): trust_remote_code=model_args.trust_remote_code, ) - config.update({"forced_decoder_ids": model_args.forced_decoder_ids, "suppress_tokens": model_args.suppress_tokens}) - # SpecAugment for whisper models if getattr(config, "model_type", None) == "whisper": config.update({"apply_spec_augment": model_args.apply_spec_augment}) @@ -440,9 +437,35 @@ def main(): model.freeze_encoder() model.model.encoder.gradient_checkpointing = False - if data_args.language is not None: - # We only need to set the task id when the language is specified (i.e. 
in a multilingual setting) + if hasattr(model.generation_config, "is_multilingual") and model.generation_config.is_multilingual: + # We only need to set the language and task ids in a multilingual setting tokenizer.set_prefix_tokens(language=data_args.language, task=data_args.task) + model.generation_config.update( + **{ + "language": data_args.language, + "task": data_args.task, + } + ) + elif data_args.language is not None: + raise ValueError( + "Setting language token for an English-only checkpoint is not permitted. The language argument should " + "only be set for multilingual checkpoints." + ) + + # TODO (Sanchit): deprecate these arguments in v4.41 + if model_args.forced_decoder_ids is not None: + logger.warning( + "The use of `forced_decoder_ids` is deprecated and will be removed in v4.41." + "Please use the `language` and `task` arguments instead" + ) + model.generation_config.forced_decoder_ids = model_args.forced_decoder_ids + + if model_args.suppress_tokens is not None: + logger.warning( + "The use of `suppress_tokens` is deprecated and will be removed in v4.41." + "Should you need `suppress_tokens`, please manually set them in the fine-tuning script." + ) + model.generation_config.suppress_tokens = model_args.suppress_tokens # 6. Resample speech dataset if necessary dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate From e7d52a10d721f4475c810d403b1e71689d4b94b9 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 26 Apr 2024 18:04:41 +0100 Subject: [PATCH 06/18] Fix GroundingDINO, DPR after BERT SDPA update (#30506) Fix GroundingDINO, DPR after BET SDPA update --- docs/source/en/perf_infer_gpu_one.md | 1 + src/transformers/models/dpr/modeling_dpr.py | 2 ++ .../models/grounding_dino/modeling_grounding_dino.py | 4 +++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 64583e4badf044..de49d4427b5687 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -194,6 +194,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [Bert](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel) * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) +* [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader) * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel) * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) * [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel) diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py index 0a45ec75207c29..928f2b93118ac3 100644 --- a/src/transformers/models/dpr/modeling_dpr.py +++ b/src/transformers/models/dpr/modeling_dpr.py @@ -142,6 +142,8 @@ class DPRReaderOutput(ModelOutput): class DPRPreTrainedModel(PreTrainedModel): + _supports_sdpa = True + def _init_weights(self, module): """Initialize the weights""" if isinstance(module, nn.Linear): diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 83009c92504211..da8dd29a5cb54d 100644 --- 
a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2113,7 +2113,9 @@ def __init__(self, config: GroundingDinoConfig): ) # Create text backbone - self.text_backbone = AutoModel.from_config(config.text_config, add_pooling_layer=False) + self.text_backbone = AutoModel.from_config( + config.text_config, add_pooling_layer=False, attn_implementation=config._attn_implementation + ) self.text_projection = nn.Linear(config.text_config.hidden_size, config.d_model) if config.embedding_init_target or not config.two_stage: From c793b26f2ec9d72970457a1d78372d13bd7b8207 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 26 Apr 2024 18:21:47 +0100 Subject: [PATCH 07/18] load_image - decode b64encode and encodebytes strings (#30192) * Decode b64encode and encodebytes strings * Remove conditional encode -- image is always a string --- src/transformers/image_utils.py | 2 +- tests/utils/test_image_utils.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index e4a55b3455a344..7d71fc982b4390 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -320,7 +320,7 @@ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = # Try to load as base64 try: - b64 = base64.b64decode(image, validate=True) + b64 = base64.decodebytes(image.encode()) image = PIL.Image.open(BytesIO(b64)) except Exception as e: raise ValueError( diff --git a/tests/utils/test_image_utils.py b/tests/utils/test_image_utils.py index d6bc9a37585899..f360c4bb82532d 100644 --- a/tests/utils/test_image_utils.py +++ b/tests/utils/test_image_utils.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
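To see what the switch from `b64decode` to `decodebytes` buys in practice, a small illustrative sketch (the in-memory PNG below is a dummy created only for the demo; `load_image` lives in `transformers.image_utils`):

```python
import base64
from io import BytesIO

from PIL import Image

from transformers.image_utils import load_image

# Build a tiny dummy PNG in memory and base64-encode it two ways.
buffer = BytesIO()
Image.new("RGB", (2, 2), color="red").save(buffer, format="PNG")
raw = buffer.getvalue()

compact = base64.b64encode(raw).decode()    # single-line output, accepted as before
wrapped = base64.encodebytes(raw).decode()  # newline-wrapped output, now also accepted

assert load_image(compact).size == (2, 2)
assert load_image(wrapped).size == (2, 2)
```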
+import codecs import os import tempfile import unittest @@ -544,6 +545,23 @@ def test_load_img_base64(self): self.assertEqual(img_arr.shape, (64, 32, 3)) + def test_load_img_base64_encoded_bytes(self): + try: + tmp_file = tempfile.mktemp() + with open(tmp_file, "wb") as f: + http_get( + "https://huggingface.co/datasets/hf-internal-testing/dummy-base64-images/raw/main/image_2.txt", f + ) + + with codecs.open(tmp_file, encoding="unicode_escape") as b64: + img = load_image(b64.read()) + img_arr = np.array(img) + + finally: + os.remove(tmp_file) + + self.assertEqual(img_arr.shape, (256, 256, 3)) + def test_load_img_rgba(self): # we use revision="refs/pr/1" until the PR is merged # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1 From 6d4cabda2614d86357092585b416c4d08be73382 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Fri, 26 Apr 2024 20:40:12 +0200 Subject: [PATCH 08/18] [SegGPT] Fix seggpt image processor (#29550) * Fixed SegGptImageProcessor to handle 2D and 3D prompt mask inputs * Added new test to check prompt mask equivalence * New proposal * Better proposal * Removed unnecessary method * Updated seggpt docs * Introduced do_convert_rgb * nits --- docs/source/en/model_doc/seggpt.md | 5 +- .../models/seggpt/image_processing_seggpt.py | 103 ++++++++---------- .../seggpt/test_image_processing_seggpt.py | 83 +++++++++++++- tests/models/seggpt/test_modeling_seggpt.py | 22 +++- 4 files changed, 148 insertions(+), 65 deletions(-) diff --git a/docs/source/en/model_doc/seggpt.md b/docs/source/en/model_doc/seggpt.md index f821fc14a08c54..5a68d38fc98b6c 100644 --- a/docs/source/en/model_doc/seggpt.md +++ b/docs/source/en/model_doc/seggpt.md @@ -26,7 +26,8 @@ The abstract from the paper is the following: Tips: - One can use [`SegGptImageProcessor`] to prepare image input, prompt and mask to the model. -- It's highly advisable to pass `num_labels` (not considering background) during preprocessing and postprocessing with [`SegGptImageProcessor`] for your use case. +- One can either use segmentation maps or RGB images as prompt masks. If using the latter make sure to set `do_convert_rgb=False` in the `preprocess` method. +- It's highly advisable to pass `num_labels` when using `segmetantion_maps` (not considering background) during preprocessing and postprocessing with [`SegGptImageProcessor`] for your use case. - When doing inference with [`SegGptForImageSegmentation`] if your `batch_size` is greater than 1 you can use feature ensemble across your images by passing `feature_ensemble=True` in the forward method. 
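To make the two accepted prompt-mask formats from the tips above concrete, a short illustrative sketch (dummy arrays stand in for real masks; shapes follow the processor's default 448x448 size):

```python
import numpy as np

from transformers import SegGptImageProcessor

processor = SegGptImageProcessor.from_pretrained("BAAI/seggpt-vit-large")

# 1) A plain segmentation map (height, width) of class indices: pass num_labels so the
#    processor builds a palette and expands the map to 3 channels.
seg_map = np.zeros((448, 448), dtype=np.int64)
inputs_a = processor(images=None, prompt_masks=seg_map, num_labels=10, return_tensors="pt")

# 2) A prompt mask that is already an RGB image (3, height, width): skip the conversion.
rgb_mask = np.zeros((3, 448, 448), dtype=np.float32)
inputs_b = processor(images=None, prompt_masks=rgb_mask, do_convert_rgb=False, return_tensors="pt")

print(inputs_a["prompt_masks"].shape)  # torch.Size([1, 3, 448, 448])
print(inputs_b["prompt_masks"].shape)  # torch.Size([1, 3, 448, 448])
```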
Here's how to use the model for one-shot semantic segmentation: @@ -53,7 +54,7 @@ mask_prompt = ds[29]["label"] inputs = image_processor( images=image_input, prompt_images=image_prompt, - prompt_masks=mask_prompt, + segmentation_maps=mask_prompt, num_labels=num_labels, return_tensors="pt" ) diff --git a/src/transformers/models/seggpt/image_processing_seggpt.py b/src/transformers/models/seggpt/image_processing_seggpt.py index 80fb94cdc7aaf4..1e4a5e23d093e8 100644 --- a/src/transformers/models/seggpt/image_processing_seggpt.py +++ b/src/transformers/models/seggpt/image_processing_seggpt.py @@ -26,19 +26,21 @@ ChannelDimension, ImageInput, PILImageResampling, - get_channel_dimension_axis, infer_channel_dimension_format, is_scaled_image, make_list_of_images, to_numpy_array, valid_images, ) -from ...utils import TensorType, is_torch_available, logging, requires_backends +from ...utils import TensorType, is_torch_available, is_vision_available, logging, requires_backends if is_torch_available(): import torch +if is_vision_available(): + pass + logger = logging.get_logger(__name__) @@ -65,29 +67,10 @@ def build_palette(num_labels: int) -> List[Tuple[int, int]]: return color_list -def get_num_channels(image: np.ndarray, input_data_format: ChannelDimension) -> int: - if image.ndim == 2: - return 0 - - channel_idx = get_channel_dimension_axis(image, input_data_format) - return image.shape[channel_idx] - - def mask_to_rgb( - mask: np.ndarray, - palette: Optional[List[Tuple[int, int]]] = None, - input_data_format: Optional[ChannelDimension] = None, - data_format: Optional[ChannelDimension] = None, + mask: np.ndarray, palette: Optional[List[Tuple[int, int]]] = None, data_format: Optional[ChannelDimension] = None ) -> np.ndarray: - if input_data_format is None and mask.ndim > 2: - input_data_format = infer_channel_dimension_format(mask) - - data_format = data_format if data_format is not None else input_data_format - - num_channels = get_num_channels(mask, input_data_format) - - if num_channels == 3: - return to_channel_dimension_format(mask, data_format, input_data_format) if data_format is not None else mask + data_format = data_format if data_format is not None else ChannelDimension.FIRST if palette is not None: height, width = mask.shape @@ -109,9 +92,7 @@ def mask_to_rgb( else: rgb_mask = np.repeat(mask[None, ...], 3, axis=0) - return ( - to_channel_dimension_format(rgb_mask, data_format, input_data_format) if data_format is not None else rgb_mask - ) + return to_channel_dimension_format(rgb_mask, data_format) class SegGptImageProcessor(BaseImageProcessor): @@ -143,6 +124,9 @@ class SegGptImageProcessor(BaseImageProcessor): image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): Standard deviation to use if normalizing the image. This is a float or list of floats the length of the number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the prompt mask to RGB format. Can be overridden by the `do_convert_rgb` parameter in the + `preprocess` method. 
""" model_input_names = ["pixel_values"] @@ -157,6 +141,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, **kwargs, ) -> None: super().__init__(**kwargs) @@ -170,6 +155,7 @@ def __init__( self.rescale_factor = rescale_factor self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_convert_rgb = do_convert_rgb def get_palette(self, num_labels: int) -> List[Tuple[int, int]]: """Build a palette to map the prompt mask from a single channel to a 3 channel RGB. @@ -188,13 +174,12 @@ def mask_to_rgb( image: np.ndarray, palette: Optional[List[Tuple[int, int]]] = None, data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: - """Convert a mask to RGB format. + """Converts a segmentation map to RGB format. Args: image (`np.ndarray`): - Mask to convert to RGB format. If the mask is already in RGB format, it will be passed through. + Segmentation map with dimensions (height, width) where pixel values represent the class index. palette (`List[Tuple[int, int]]`, *optional*, defaults to `None`): Palette to use to convert the mask to RGB format. If unset, the mask is duplicated across the channel dimension. @@ -203,21 +188,11 @@ def mask_to_rgb( image is used. Can be one of: - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. Returns: `np.ndarray`: The mask in RGB format. """ - return mask_to_rgb( - image, - palette=palette, - data_format=data_format, - input_data_format=input_data_format, - ) + return mask_to_rgb(image, palette=palette, data_format=data_format) # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC def resize( @@ -271,7 +246,6 @@ def resize( def _preprocess_step( self, images: ImageInput, - is_mask: bool = False, do_resize: Optional[bool] = None, size: Dict[str, int] = None, resample: PILImageResampling = None, @@ -282,6 +256,7 @@ def _preprocess_step( image_std: Optional[Union[float, List[float]]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + do_convert_rgb: Optional[bool] = None, num_labels: Optional[int] = None, **kwargs, ): @@ -292,9 +267,6 @@ def _preprocess_step( images (`ImageInput`): Image to _preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. - is_mask (`bool`, *optional*, defaults to `False`): - Whether the image is a mask. If True, the image is converted to RGB using the palette if - `self.num_labels` is specified otherwise RGB is achieved by duplicating the channel. 
do_resize (`bool`, *optional*, defaults to `self.do_resize`): Whether to resize the image. size (`Dict[str, int]`, *optional*, defaults to `self.size`): @@ -331,6 +303,10 @@ def _preprocess_step( - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the prompt mask to RGB format. If `num_labels` is specified, a palette will be built + to map the prompt mask from a single channel to a 3 channel RGB. If unset, the prompt mask is duplicated + across the channel dimension. Must be set to `False` if the prompt mask is already in RGB format. num_labels: (`int`, *optional*): Number of classes in the segmentation task (excluding the background). If specified, a palette will be built, assuming that class_idx 0 is the background, to map the prompt mask from a single class_idx @@ -340,6 +316,7 @@ def _preprocess_step( do_resize = do_resize if do_resize is not None else self.do_resize do_rescale = do_rescale if do_rescale is not None else self.do_rescale do_normalize = do_normalize if do_normalize is not None else self.do_normalize + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb resample = resample if resample is not None else self.resample rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor image_mean = image_mean if image_mean is not None else self.image_mean @@ -348,7 +325,8 @@ def _preprocess_step( size = size if size is not None else self.size size_dict = get_size_dict(size) - images = make_list_of_images(images) + # If segmentation map is passed we expect 2D images + images = make_list_of_images(images, expected_ndims=2 if do_convert_rgb else 3) if not valid_images(images): raise ValueError( @@ -374,11 +352,11 @@ def _preprocess_step( " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." ) - if input_data_format is None and not is_mask: + if input_data_format is None and not do_convert_rgb: # We assume that all images have the same channel dimension format. input_data_format = infer_channel_dimension_format(images[0]) - if is_mask: + if do_convert_rgb: palette = self.get_palette(num_labels) if num_labels is not None else None # Since this is the input for the next transformations its format should be the same as the input_data_format images = [ @@ -423,6 +401,7 @@ def preprocess( do_normalize: Optional[bool] = None, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: Optional[bool] = None, num_labels: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, @@ -440,9 +419,12 @@ def preprocess( Prompt image to _preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. prompt_masks (`ImageInput`): - Prompt mask from prompt image to _preprocess. Expects a single or batch of masks. If the mask masks are - a single channel then it will be converted to RGB using the palette if `self.num_labels` is specified - or by just repeating the channel if not. 
If the mask is already in RGB format, it will be passed through. + Prompt mask from prompt image to _preprocess that specify prompt_masks value in the preprocessed output. + Can either be in the format of segmentation maps (no channels) or RGB images. If in the format of + RGB images, `do_convert_rgb` should be set to `False`. If in the format of segmentation maps, `num_labels` + specifying `num_labels` is recommended to build a palette to map the prompt mask from a single channel to + a 3 channel RGB. If `num_labels` is not specified, the prompt mask will be duplicated across the channel + dimension. do_resize (`bool`, *optional*, defaults to `self.do_resize`): Whether to resize the image. size (`Dict[str, int]`, *optional*, defaults to `self.size`): @@ -461,6 +443,16 @@ def preprocess( Image mean to use if `do_normalize` is set to `True`. image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): Image standard deviation to use if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the prompt mask to RGB format. If `num_labels` is specified, a palette will be built + to map the prompt mask from a single channel to a 3 channel RGB. If unset, the prompt mask is duplicated + across the channel dimension. Must be set to `False` if the prompt mask is already in RGB format. + num_labels: (`int`, *optional*): + Number of classes in the segmentation task (excluding the background). If specified, a palette will be + built, assuming that class_idx 0 is the background, to map the prompt mask from a plain segmentation map + with no channels to a 3 channel RGB. Not specifying this will result in the prompt mask either being passed + through as is if it is already in RGB format (if `do_convert_rgb` is false) or being duplicated + across the channel dimension. return_tensors (`str` or `TensorType`, *optional*): The type of tensors to return. Can be one of: - Unset: Return a list of `np.ndarray`. @@ -479,11 +471,6 @@ def preprocess( - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - num_labels: (`int`, *optional*): - Number of classes in the segmentation task (excluding the background). If specified, a palette will be - built, assuming that class_idx 0 is the background, to map the prompt mask from a single class_idx - channel to a 3 channel RGB. Not specifying this will result in the prompt mask either being passed - through as is if it is already in RGB format or being duplicated across the channel dimension. 
""" if all(v is None for v in [images, prompt_images, prompt_masks]): raise ValueError("At least one of images, prompt_images, prompt_masks must be specified.") @@ -502,6 +489,7 @@ def preprocess( do_normalize=do_normalize, image_mean=image_mean, image_std=image_std, + do_convert_rgb=False, data_format=data_format, input_data_format=input_data_format, **kwargs, @@ -521,6 +509,7 @@ def preprocess( do_normalize=do_normalize, image_mean=image_mean, image_std=image_std, + do_convert_rgb=False, data_format=data_format, input_data_format=input_data_format, **kwargs, @@ -531,7 +520,6 @@ def preprocess( if prompt_masks is not None: prompt_masks = self._preprocess_step( prompt_masks, - is_mask=True, do_resize=do_resize, size=size, resample=PILImageResampling.NEAREST, @@ -540,9 +528,10 @@ def preprocess( do_normalize=do_normalize, image_mean=image_mean, image_std=image_std, + do_convert_rgb=do_convert_rgb, + num_labels=num_labels, data_format=data_format, input_data_format=input_data_format, - num_labels=num_labels, **kwargs, ) diff --git a/tests/models/seggpt/test_image_processing_seggpt.py b/tests/models/seggpt/test_image_processing_seggpt.py index 46694d6636ea05..04cefb70d0efb4 100644 --- a/tests/models/seggpt/test_image_processing_seggpt.py +++ b/tests/models/seggpt/test_image_processing_seggpt.py @@ -30,6 +30,8 @@ from transformers.models.seggpt.modeling_seggpt import SegGptImageSegmentationOutput if is_vision_available(): + from PIL import Image + from transformers import SegGptImageProcessor @@ -147,7 +149,7 @@ def test_mask_equivalence(self): mask_rgb = mask_binary.convert("RGB") inputs_binary = image_processor(images=None, prompt_masks=mask_binary, return_tensors="pt") - inputs_rgb = image_processor(images=None, prompt_masks=mask_rgb, return_tensors="pt") + inputs_rgb = image_processor(images=None, prompt_masks=mask_rgb, return_tensors="pt", do_convert_rgb=False) self.assertTrue((inputs_binary["prompt_masks"] == inputs_rgb["prompt_masks"]).all().item()) @@ -196,7 +198,11 @@ def test_pixel_values(self): image_processor = SegGptImageProcessor.from_pretrained("BAAI/seggpt-vit-large") inputs = image_processor( - images=input_image, prompt_images=prompt_image, prompt_masks=prompt_mask, return_tensors="pt" + images=input_image, + prompt_images=prompt_image, + prompt_masks=prompt_mask, + return_tensors="pt", + do_convert_rgb=False, ) # Verify pixel values @@ -229,3 +235,76 @@ def test_pixel_values(self): torch.allclose(inputs.prompt_pixel_values[0, :, :3, :3], expected_prompt_pixel_values, atol=1e-4) ) self.assertTrue(torch.allclose(inputs.prompt_masks[0, :, :3, :3], expected_prompt_masks, atol=1e-4)) + + def test_prompt_mask_equivalence(self): + image_processor = self.image_processing_class(**self.image_processor_dict) + image_size = self.image_processor_tester.image_size + + # Single Mask Examples + expected_single_shape = [1, 3, image_size, image_size] + + # Single Semantic Map (2D) + image_np_2d = np.ones((image_size, image_size)) + image_pt_2d = torch.ones((image_size, image_size)) + image_pil_2d = Image.fromarray(image_np_2d) + + inputs_np_2d = image_processor(images=None, prompt_masks=image_np_2d, return_tensors="pt") + inputs_pt_2d = image_processor(images=None, prompt_masks=image_pt_2d, return_tensors="pt") + inputs_pil_2d = image_processor(images=None, prompt_masks=image_pil_2d, return_tensors="pt") + + self.assertTrue((inputs_np_2d["prompt_masks"] == inputs_pt_2d["prompt_masks"]).all().item()) + self.assertTrue((inputs_np_2d["prompt_masks"] == 
inputs_pil_2d["prompt_masks"]).all().item()) + self.assertEqual(list(inputs_np_2d["prompt_masks"].shape), expected_single_shape) + + # Single RGB Images (3D) + image_np_3d = np.ones((3, image_size, image_size)) + image_pt_3d = torch.ones((3, image_size, image_size)) + image_pil_3d = Image.fromarray(image_np_3d.transpose(1, 2, 0).astype(np.uint8)) + + inputs_np_3d = image_processor( + images=None, prompt_masks=image_np_3d, return_tensors="pt", do_convert_rgb=False + ) + inputs_pt_3d = image_processor( + images=None, prompt_masks=image_pt_3d, return_tensors="pt", do_convert_rgb=False + ) + inputs_pil_3d = image_processor( + images=None, prompt_masks=image_pil_3d, return_tensors="pt", do_convert_rgb=False + ) + + self.assertTrue((inputs_np_3d["prompt_masks"] == inputs_pt_3d["prompt_masks"]).all().item()) + self.assertTrue((inputs_np_3d["prompt_masks"] == inputs_pil_3d["prompt_masks"]).all().item()) + self.assertEqual(list(inputs_np_3d["prompt_masks"].shape), expected_single_shape) + + # Batched Examples + expected_batched_shape = [2, 3, image_size, image_size] + + # Batched Semantic Maps (3D) + image_np_2d_batched = np.ones((2, image_size, image_size)) + image_pt_2d_batched = torch.ones((2, image_size, image_size)) + + inputs_np_2d_batched = image_processor(images=None, prompt_masks=image_np_2d_batched, return_tensors="pt") + inputs_pt_2d_batched = image_processor(images=None, prompt_masks=image_pt_2d_batched, return_tensors="pt") + + self.assertTrue((inputs_np_2d_batched["prompt_masks"] == inputs_pt_2d_batched["prompt_masks"]).all().item()) + self.assertEqual(list(inputs_np_2d_batched["prompt_masks"].shape), expected_batched_shape) + + # Batched RGB images + image_np_4d = np.ones((2, 3, image_size, image_size)) + image_pt_4d = torch.ones((2, 3, image_size, image_size)) + + inputs_np_4d = image_processor( + images=None, prompt_masks=image_np_4d, return_tensors="pt", do_convert_rgb=False + ) + inputs_pt_4d = image_processor( + images=None, prompt_masks=image_pt_4d, return_tensors="pt", do_convert_rgb=False + ) + + self.assertTrue((inputs_np_4d["prompt_masks"] == inputs_pt_4d["prompt_masks"]).all().item()) + self.assertEqual(list(inputs_np_4d["prompt_masks"].shape), expected_batched_shape) + + # Comparing Single and Batched Examples + self.assertTrue((inputs_np_2d["prompt_masks"][0] == inputs_np_3d["prompt_masks"][0]).all().item()) + self.assertTrue((inputs_np_2d_batched["prompt_masks"][0] == inputs_np_2d["prompt_masks"][0]).all().item()) + self.assertTrue((inputs_np_2d_batched["prompt_masks"][0] == inputs_np_3d["prompt_masks"][0]).all().item()) + self.assertTrue((inputs_np_2d_batched["prompt_masks"][0] == inputs_np_4d["prompt_masks"][0]).all().item()) + self.assertTrue((inputs_np_2d_batched["prompt_masks"][0] == inputs_np_3d["prompt_masks"][0]).all().item()) diff --git a/tests/models/seggpt/test_modeling_seggpt.py b/tests/models/seggpt/test_modeling_seggpt.py index d43d4304532431..efa0231c1e817a 100644 --- a/tests/models/seggpt/test_modeling_seggpt.py +++ b/tests/models/seggpt/test_modeling_seggpt.py @@ -363,7 +363,11 @@ def test_one_shot_inference(self): prompt_mask = masks[0] inputs = image_processor( - images=input_image, prompt_images=prompt_image, prompt_masks=prompt_mask, return_tensors="pt" + images=input_image, + prompt_images=prompt_image, + prompt_masks=prompt_mask, + return_tensors="pt", + do_convert_rgb=False, ) inputs = inputs.to(torch_device) @@ -404,7 +408,11 @@ def test_few_shot_inference(self): prompt_masks = [masks[0], masks[2]] inputs = image_processor( - 
images=input_images, prompt_images=prompt_images, prompt_masks=prompt_masks, return_tensors="pt" + images=input_images, + prompt_images=prompt_images, + prompt_masks=prompt_masks, + return_tensors="pt", + do_convert_rgb=False, ) inputs = {k: v.to(torch_device) for k, v in inputs.items()} @@ -437,10 +445,16 @@ def test_one_shot_with_label(self): prompt_mask = masks[0] inputs = image_processor( - images=input_image, prompt_masks=prompt_mask, prompt_images=prompt_image, return_tensors="pt" + images=input_image, + prompt_masks=prompt_mask, + prompt_images=prompt_image, + return_tensors="pt", + do_convert_rgb=False, ).to(torch_device) - labels = image_processor(images=None, prompt_masks=label, return_tensors="pt")["prompt_masks"].to(torch_device) + labels = image_processor(images=None, prompt_masks=label, return_tensors="pt", do_convert_rgb=False)[ + "prompt_masks" + ].to(torch_device) bool_masked_pos = prepare_bool_masked_pos(model.config).to(torch_device) From 73014b561d5f88d728e46a57d346f516fefe3f2d Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Fri, 26 Apr 2024 15:52:24 -0400 Subject: [PATCH 09/18] Fix link in dbrx.md (#30509) --- docs/source/en/model_doc/dbrx.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/dbrx.md b/docs/source/en/model_doc/dbrx.md index 33435462b3e024..d60a4926eb1853 100644 --- a/docs/source/en/model_doc/dbrx.md +++ b/docs/source/en/model_doc/dbrx.md @@ -32,7 +32,7 @@ We used curriculum learning for pretraining, changing the data mix during traini More detailed information about DBRX Instruct and DBRX Base can be found in our [technical blog post](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm). -This model was contributed by [eitan-turok](https://huggingface.co/eitanturok) and [abhi-db](https://huggingface.co/abhi-db). The original code can be found [here](https://github.com/databricks/dbrx-instruct). +This model was contributed by [eitan-turok](https://huggingface.co/eitanturok) and [abhi-db](https://huggingface.co/abhi-db). The original code can be found [here](https://github.com/databricks/dbrx), though this may not be up to date. 
## Usage Examples From 80126f98d86bad81c56b30e31a959bd6e644e8be Mon Sep 17 00:00:00 2001 From: Howard Liberty Date: Mon, 29 Apr 2024 02:03:26 -0700 Subject: [PATCH 10/18] Allow boolean FSDP options in fsdp_config (#30439) * Allow boolean FSDP options in fsdp_config * Use lower() to be safe --- src/transformers/training_args.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 12ae77908ebfae..18bf004efa05b0 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -1840,12 +1840,12 @@ def __post_init__(self): ) prefetch_policy = self.fsdp_config.get("backward_prefetch", "NO_PREFETCH") os.environ[f"{prefix}BACKWARD_PREFETCH"] = prefetch_policy.upper() - os.environ[f"{prefix}FORWARD_PREFETCH"] = self.fsdp_config.get("forward_prefetch", "false") + os.environ[f"{prefix}FORWARD_PREFETCH"] = str(self.fsdp_config.get("forward_prefetch", "false")).lower() - sync_module_states = self.fsdp_config.get("sync_module_states", "true") - cpu_ram_efficient_loading = self.fsdp_config.get("cpu_ram_efficient_loading", "false") + sync_module_states = str(self.fsdp_config.get("sync_module_states", "true")).lower() + cpu_ram_efficient_loading = str(self.fsdp_config.get("cpu_ram_efficient_loading", "false")).lower() - if str(sync_module_states).lower() == "false" and str(cpu_ram_efficient_loading).lower() == "true": + if sync_module_states == "false" and cpu_ram_efficient_loading == "true": # In this case, all the processes except the main process would have random weights leading # to unexpected behaviour during training, thus throwing error here to prevent it. raise ValueError('`sync_module_states` must be `"True"` if `cpu_ram_efficient_loading` is `"True"`') @@ -1853,7 +1853,7 @@ def __post_init__(self): os.environ[f"{prefix}SYNC_MODULE_STATES"] = sync_module_states os.environ[f"{prefix}CPU_RAM_EFFICIENT_LOADING"] = cpu_ram_efficient_loading - os.environ[f"{prefix}USE_ORIG_PARAMS"] = self.fsdp_config.get("use_orig_params", "true") + os.environ[f"{prefix}USE_ORIG_PARAMS"] = str(self.fsdp_config.get("use_orig_params", "true")).lower() if is_accelerate_available(): if not isinstance(self.accelerator_config, (AcceleratorConfig)): From e8acb70015288f719ebb1a37cdb935d650fdb6a1 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 29 Apr 2024 10:22:33 +0100 Subject: [PATCH 11/18] Pass attn_implementation when using AutoXXX.from_config (#30507) * Pass attn_implementation when using AutoXXX.from_config * Fix --- .../models/blip_2/modeling_blip_2.py | 16 ++++++++++++---- .../depth_anything/modeling_depth_anything.py | 4 +++- .../encoder_decoder/modeling_encoder_decoder.py | 4 ++-- src/transformers/models/fuyu/modeling_fuyu.py | 4 +++- .../models/idefics2/modeling_idefics2.py | 2 +- .../models/instructblip/modeling_instructblip.py | 8 ++++++-- src/transformers/models/rag/modeling_rag.py | 8 ++++++-- .../modeling_speech_encoder_decoder.py | 4 ++-- .../modeling_vision_encoder_decoder.py | 4 ++-- .../modeling_vision_text_dual_encoder.py | 6 ++++-- 10 files changed, 41 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 935e041eb8360d..edd0d9a6d76133 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -1194,9 +1194,13 @@ def __init__(self, config: Blip2Config): 
self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) if config.use_decoder_only_language_model: - language_model = AutoModelForCausalLM.from_config(config.text_config) + language_model = AutoModelForCausalLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) else: - language_model = AutoModelForSeq2SeqLM.from_config(config.text_config) + language_model = AutoModelForSeq2SeqLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) # Update _tied_weights_keys using the base model used. if language_model._tied_weights_keys is not None: @@ -1549,9 +1553,13 @@ def __init__(self, config: Blip2Config): self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) if config.use_decoder_only_language_model: - language_model = AutoModelForCausalLM.from_config(config.text_config) + language_model = AutoModelForCausalLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) else: - language_model = AutoModelForSeq2SeqLM.from_config(config.text_config) + language_model = AutoModelForSeq2SeqLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) # Update _tied_weights_keys using the base model used. if language_model._tied_weights_keys is not None: diff --git a/src/transformers/models/depth_anything/modeling_depth_anything.py b/src/transformers/models/depth_anything/modeling_depth_anything.py index 788b0d911396f1..bed91ac2a482bc 100644 --- a/src/transformers/models/depth_anything/modeling_depth_anything.py +++ b/src/transformers/models/depth_anything/modeling_depth_anything.py @@ -367,7 +367,9 @@ class DepthAnythingForDepthEstimation(DepthAnythingPreTrainedModel): def __init__(self, config): super().__init__(config) - self.backbone = AutoBackbone.from_config(config.backbone_config) + self.backbone = AutoBackbone.from_config( + config.backbone_config, attn_implementation=config._attn_implementation + ) self.neck = DepthAnythingNeck(config) self.head = DepthAnythingDepthEstimationHead(config) diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index 16248fee64ce59..2b185cc14a03dd 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -209,12 +209,12 @@ def __init__( if encoder is None: from ..auto.modeling_auto import AutoModel - encoder = AutoModel.from_config(config.encoder) + encoder = AutoModel.from_config(config.encoder, attn_implementation=config._attn_implementation) if decoder is None: from ..auto.modeling_auto import AutoModelForCausalLM - decoder = AutoModelForCausalLM.from_config(config.decoder) + decoder = AutoModelForCausalLM.from_config(config.decoder, attn_implementation=config._attn_implementation) self.encoder = encoder self.decoder = decoder diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py index 8e9a41954aee9c..bdaec5f868505c 100644 --- a/src/transformers/models/fuyu/modeling_fuyu.py +++ b/src/transformers/models/fuyu/modeling_fuyu.py @@ -149,7 +149,9 @@ def __init__(self, config: FuyuConfig): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size - self.language_model = AutoModelForCausalLM.from_config(config.text_config) + self.language_model = 
AutoModelForCausalLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) self.vision_embed_tokens = nn.Linear( config.patch_size * config.patch_size * config.num_channels, config.hidden_size diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 28cd6155548ac7..7f61e95a9b8edf 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -1476,7 +1476,7 @@ def __init__(self, config: Idefics2Config): self.vision_model = Idefics2VisionTransformer(config.vision_config) self.connector = Idefics2Connector(config) - self.text_model = AutoModel.from_config(config.text_config) + self.text_model = AutoModel.from_config(config.text_config, attn_implementation=config._attn_implementation) self.image_seq_len = config.perceiver_config.resampler_n_latents self.image_token_id = self.config.image_token_id diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index b18d46723179e2..52f8fa610a948e 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -1251,9 +1251,13 @@ def __init__(self, config: InstructBlipConfig): self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) if config.use_decoder_only_language_model: - language_model = AutoModelForCausalLM.from_config(config.text_config) + language_model = AutoModelForCausalLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) else: - language_model = AutoModelForSeq2SeqLM.from_config(config.text_config) + language_model = AutoModelForSeq2SeqLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) if language_model._no_split_modules is not None: self._no_split_modules.extend(language_model._no_split_modules) diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index 80dec5bc3dba58..8a6f959a4921ed 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -506,12 +506,16 @@ def __init__( if question_encoder is None: from ..auto.modeling_auto import AutoModel - question_encoder = AutoModel.from_config(config.question_encoder) + question_encoder = AutoModel.from_config( + config.question_encoder, attn_implementation=config._attn_implementation + ) if generator is None: from ..auto.modeling_auto import AutoModelForSeq2SeqLM - generator = AutoModelForSeq2SeqLM.from_config(config.generator) + generator = AutoModelForSeq2SeqLM.from_config( + config.generator, attn_implementation=config._attn_implementation + ) self.retriever = retriever if self.retriever is not None: diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index 942dfb5f9c49fc..77b69afe8fd22f 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -212,10 +212,10 @@ def __init__( super().__init__(config) if encoder is None: - encoder = AutoModel.from_config(config.encoder) + encoder = AutoModel.from_config(config.encoder, attn_implementation=config._attn_implementation) if decoder is None: - 
decoder = AutoModelForCausalLM.from_config(config.decoder) + decoder = AutoModelForCausalLM.from_config(config.decoder, attn_implementation=config._attn_implementation) self.encoder = encoder self.decoder = decoder diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py index 0bdf76044153b1..fc72eb1cbdf831 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py @@ -190,10 +190,10 @@ def __init__( super().__init__(config) if encoder is None: - encoder = AutoModel.from_config(config.encoder) + encoder = AutoModel.from_config(config.encoder, attn_implementation=config._attn_implementation) if decoder is None: - decoder = AutoModelForCausalLM.from_config(config.decoder) + decoder = AutoModelForCausalLM.from_config(config.decoder, attn_implementation=config._attn_implementation) self.encoder = encoder self.decoder = decoder diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py index cd4d5bd7a1f197..0f82bdd0c3e4d8 100755 --- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py @@ -185,10 +185,12 @@ def __init__( if isinstance(config.vision_config, CLIPVisionConfig): vision_model = CLIPVisionModel(config.vision_config) else: - vision_model = AutoModel.from_config(config.vision_config) + vision_model = AutoModel.from_config( + config.vision_config, attn_implementation=config._attn_implementation + ) if text_model is None: - text_model = AutoModel.from_config(config.text_config) + text_model = AutoModel.from_config(config.text_config, attn_implementation=config._attn_implementation) self.vision_model = vision_model self.text_model = text_model From bdbe1662113b49e81c865f0ca9d3922b500c5414 Mon Sep 17 00:00:00 2001 From: clinty Date: Mon, 29 Apr 2024 05:57:51 -0400 Subject: [PATCH 12/18] Fix broken link to Transformers notebooks (#30512) Co-authored-by: Clint Adams --- docs/source/de/run_scripts.md | 2 +- docs/source/en/run_scripts.md | 2 +- docs/source/es/run_scripts.md | 2 +- docs/source/it/run_scripts.md | 2 +- docs/source/pt/run_scripts.md | 2 +- docs/source/zh/run_scripts.md | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/de/run_scripts.md b/docs/source/de/run_scripts.md index 61a0754ea92628..17b725827dd7ec 100644 --- a/docs/source/de/run_scripts.md +++ b/docs/source/de/run_scripts.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # Trainieren mit einem Skript -Neben den 🤗 Transformers [notebooks](./noteboks/README) gibt es auch Beispielskripte, die zeigen, wie man ein Modell für eine Aufgabe mit [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) oder [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax) trainiert. 
+Neben den 🤗 Transformers [notebooks](./notebooks) gibt es auch Beispielskripte, die zeigen, wie man ein Modell für eine Aufgabe mit [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) oder [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax) trainiert. Sie werden auch Skripte finden, die wir in unseren [Forschungsprojekten](https://github.com/huggingface/transformers/tree/main/examples/research_projects) und [Legacy-Beispielen](https://github.com/huggingface/transformers/tree/main/examples/legacy) verwendet haben und die größtenteils von der Community stammen. Diese Skripte werden nicht aktiv gepflegt und erfordern eine bestimmte Version von 🤗 Transformers, die höchstwahrscheinlich nicht mit der neuesten Version der Bibliothek kompatibel ist. diff --git a/docs/source/en/run_scripts.md b/docs/source/en/run_scripts.md index 845befc5638133..f602cde40933d0 100644 --- a/docs/source/en/run_scripts.md +++ b/docs/source/en/run_scripts.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # Train with a script -Along with the 🤗 Transformers [notebooks](./noteboks/README), there are also example scripts demonstrating how to train a model for a task with [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), or [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). +Along with the 🤗 Transformers [notebooks](./notebooks), there are also example scripts demonstrating how to train a model for a task with [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), or [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). You will also find scripts we've used in our [research projects](https://github.com/huggingface/transformers/tree/main/examples/research_projects) and [legacy examples](https://github.com/huggingface/transformers/tree/main/examples/legacy) which are mostly community contributed. These scripts are not actively maintained and require a specific version of 🤗 Transformers that will most likely be incompatible with the latest version of the library. diff --git a/docs/source/es/run_scripts.md b/docs/source/es/run_scripts.md index ff1afa340c9a1d..d9a2b142a8ab6c 100644 --- a/docs/source/es/run_scripts.md +++ b/docs/source/es/run_scripts.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # Entrenamiento con scripts -Junto con los [notebooks](./noteboks/README) de 🤗 Transformers, también hay scripts con ejemplos que muestran cómo entrenar un modelo para una tarea en [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). +Junto con los [notebooks](./notebooks) de 🤗 Transformers, también hay scripts con ejemplos que muestran cómo entrenar un modelo para una tarea en [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). 
También encontrarás scripts que hemos usado en nuestros [proyectos de investigación](https://github.com/huggingface/transformers/tree/main/examples/research_projects) y [ejemplos pasados](https://github.com/huggingface/transformers/tree/main/examples/legacy) que en su mayoría son aportados por la comunidad. Estos scripts no se mantienen activamente y requieren una versión específica de 🤗 Transformers que probablemente sea incompatible con la última versión de la biblioteca. diff --git a/docs/source/it/run_scripts.md b/docs/source/it/run_scripts.md index 7fc3fb6c6ac67a..b437efb9fb18f6 100644 --- a/docs/source/it/run_scripts.md +++ b/docs/source/it/run_scripts.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # Addestramento con script -Insieme ai [notebooks](./noteboks/README) 🤗 Transformers, ci sono anche esempi di script che dimostrano come addestrare un modello per un task con [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). +Insieme ai [notebooks](./notebooks) 🤗 Transformers, ci sono anche esempi di script che dimostrano come addestrare un modello per un task con [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). Troverai anche script che abbiamo usato nei nostri [progetti di ricerca](https://github.com/huggingface/transformers/tree/main/examples/research_projects) e [precedenti esempi](https://github.com/huggingface/transformers/tree/main/examples/legacy) a cui contribuisce per lo più la comunità. Questi script non sono attivamente mantenuti e richiedono una specifica versione di 🤗 Transformers che sarà molto probabilmente incompatibile con l'ultima versione della libreria. diff --git a/docs/source/pt/run_scripts.md b/docs/source/pt/run_scripts.md index a64ad72f1dbc61..d4cc3973608d2c 100644 --- a/docs/source/pt/run_scripts.md +++ b/docs/source/pt/run_scripts.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # Treinamento a partir de um script -Junto com os 🤗 Transformers [notebooks](./noteboks/README), também há scripts de exemplo demonstrando como treinar um modelo para uma tarefa com [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) ou [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). +Junto com os 🤗 Transformers [notebooks](./notebooks), também há scripts de exemplo demonstrando como treinar um modelo para uma tarefa com [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) ou [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). Você também encontrará scripts que usamos em nossos [projetos de pesquisa](https://github.com/huggingface/transformers/tree/main/examples/research_projects) e [exemplos legados](https://github.com/huggingface/transformers/tree/main/examples/legacy) que são principalmente contribuições da comunidade. 
Esses scripts não são mantidos ativamente e exigem uma versão específica de 🤗 Transformers que provavelmente será incompatível com a versão mais recente da biblioteca. diff --git a/docs/source/zh/run_scripts.md b/docs/source/zh/run_scripts.md index b6e9c8ea6a2d89..d058e97d1ad587 100644 --- a/docs/source/zh/run_scripts.md +++ b/docs/source/zh/run_scripts.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # 使用脚本进行训练 -除了 🤗 Transformers [notebooks](./noteboks/README),还有示例脚本演示了如何使用[PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch)、[TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow)或[JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax)训练模型以解决特定任务。 +除了 🤗 Transformers [notebooks](./notebooks),还有示例脚本演示了如何使用[PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch)、[TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow)或[JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax)训练模型以解决特定任务。 您还可以在这些示例中找到我们在[研究项目](https://github.com/huggingface/transformers/tree/main/examples/research_projects)和[遗留示例](https://github.com/huggingface/transformers/tree/main/examples/legacy)中使用过的脚本,这些脚本主要是由社区贡献的。这些脚本已不再被积极维护,需要使用特定版本的🤗 Transformers, 可能与库的最新版本不兼容。 From c02421883b2a59e075ec87de8d82f02a944fb5e8 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:07:41 +0200 Subject: [PATCH 13/18] Update runner tag for PR slow CI (#30535) fix Co-authored-by: ydshieh --- .github/workflows/self-new-model-pr-caller.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/self-new-model-pr-caller.yml b/.github/workflows/self-new-model-pr-caller.yml index 888b4f7a8ce5ac..44a54ac675279b 100644 --- a/.github/workflows/self-new-model-pr-caller.yml +++ b/.github/workflows/self-new-model-pr-caller.yml @@ -46,7 +46,7 @@ jobs: matrix: folders: ["${{ needs.check_for_new_model.outputs.new_model }}"] machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, ci] container: image: huggingface/transformers-all-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ From 87be06ca77166e6a6215eee5a990ab9f07238a18 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:32:43 +0200 Subject: [PATCH 14/18] Fix repo. 
fetch/checkout in PR slow CI job (#30537) fix Co-authored-by: ydshieh --- .github/workflows/self-new-model-pr-caller.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/self-new-model-pr-caller.yml b/.github/workflows/self-new-model-pr-caller.yml index 44a54ac675279b..3bd19b6f2c3ff5 100644 --- a/.github/workflows/self-new-model-pr-caller.yml +++ b/.github/workflows/self-new-model-pr-caller.yml @@ -69,7 +69,7 @@ jobs: - name: Update clone working-directory: /transformers - run: git fetch && git checkout ${{ github.event.pull_request.head.sha }} + run: git fetch && git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/merge && git checkout pull/${{ github.event.pull_request.number }}/merge - name: Reinstall transformers in edit mode (remove the one installed during docker image build) working-directory: /transformers From 9df8b301ceb6697a713fbe4034dc02395e5b28cd Mon Sep 17 00:00:00 2001 From: Benjamin Warner Date: Mon, 29 Apr 2024 11:45:43 -0500 Subject: [PATCH 15/18] Reenable SDPA's FA2 During Training with torch.compile (#30442) * Reenable SDPA's FA2 during training with torch.compile * fix Olmo's SDPA FA2 dispatching too * update formatting * improved SDPA comment * formatting and explanatory comment * is_causal if statement to one-liner --- src/transformers/modeling_attn_mask_utils.py | 7 ++++--- src/transformers/models/cohere/modeling_cohere.py | 13 +++++++++---- src/transformers/models/gemma/modeling_gemma.py | 13 +++++++++---- src/transformers/models/llama/modeling_llama.py | 13 +++++++++---- src/transformers/models/olmo/modeling_olmo.py | 11 +++++++++-- 5 files changed, 40 insertions(+), 17 deletions(-) diff --git a/src/transformers/modeling_attn_mask_utils.py b/src/transformers/modeling_attn_mask_utils.py index 44ea1795669f58..8dcf40268d0324 100755 --- a/src/transformers/modeling_attn_mask_utils.py +++ b/src/transformers/modeling_attn_mask_utils.py @@ -240,6 +240,7 @@ def _ignore_causal_mask_sdpa( inputs_embeds: torch.Tensor, past_key_values_length: int, sliding_window: Optional[int] = None, + is_training: bool = False, ) -> bool: """ Detects whether the optional user-specified attention_mask & the automatically created causal mask can be ignored in case PyTorch's SDPA is used, rather relying on SDPA's `is_causal` argument. @@ -263,11 +264,11 @@ def _ignore_causal_mask_sdpa( if attention_mask is None: # TODO: When tracing with TorchDynamo with fullgraph=True, the model is recompiled depending on the input shape, thus SDPA's `is_causal` argument is rightfully updated (see https://gist.github.com/fxmarty/1313f39037fc1c112508989628c57363). However, when using `torch.export` or # or `torch.onnx.dynamo_export`, we must pass an example input, and `is_causal` behavior is hard-coded. If a user exports a model with q_len > 1, the exported model will hard-code `is_causal=True` which is in general wrong (see https://github.com/pytorch/pytorch/issues/108108). - # Thus, we currently can NOT set `ignore_causal_mask = True` here. We would need a `torch._dynamo.is_exporting()` flag. + # Thus, we only set `ignore_causal_mask = True` if the model is set to training. # # Besides, jit.trace can not handle the `q_len > 1` condition for `is_causal` (`TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor`). 
if ( - not is_tracing + (is_training or not is_tracing) and (query_length == 1 or key_value_length == query_length) and (sliding_window is None or key_value_length < sliding_window) ): @@ -279,7 +280,7 @@ def _ignore_causal_mask_sdpa( raise ValueError( f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}." ) - elif not is_tracing and torch.all(attention_mask == 1): + elif (is_training or not is_tracing) and torch.all(attention_mask == 1): if query_length == 1 or key_value_length == query_length: # For query_length == 1, causal attention and bi-directional attention are the same. ignore_causal_mask = True diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 41bb4c0516928c..3d529fd1ec4fe0 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -590,15 +590,17 @@ def forward( key_states = key_states.contiguous() value_states = value_states.contiguous() - # In case we are not compiling, we may set `causal_mask` to None, which is required to dispatch to SDPA's Flash Attention 2 backend, rather - # relying on the `is_causal` argument. + # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an + # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True` + is_causal = True if causal_mask is None and q_len > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=causal_mask, dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=causal_mask is None and q_len > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() @@ -996,7 +998,10 @@ def _update_causal_mask( # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, # in order to dispatch on Flash Attention 2. if AttentionMaskConverter._ignore_causal_mask_sdpa( - attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, ): return None diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index e5b6b207748a53..97e4e5d49f8e0f 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -570,15 +570,17 @@ def forward( key_states = key_states.contiguous() value_states = value_states.contiguous() - # In case we are not compiling, we may set `causal_mask` to None, which is required to dispatch to SDPA's Flash Attention 2 backend, rather - # relying on the `is_causal` argument. 
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an + # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True` + is_causal = True if causal_mask is None and q_len > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=causal_mask, dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=causal_mask is None and q_len > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() @@ -982,7 +984,10 @@ def _update_causal_mask( # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, # in order to dispatch on Flash Attention 2. if AttentionMaskConverter._ignore_causal_mask_sdpa( - attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, ): return None diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 905edf5f71a63d..9a2566f2fdd2eb 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -666,15 +666,17 @@ def forward( key_states = key_states.contiguous() value_states = value_states.contiguous() - # In case we are not compiling, we may set `causal_mask` to None, which is required to dispatch to SDPA's Flash Attention 2 backend, rather - # relying on the `is_causal` argument. + # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an + # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True` + is_causal = True if causal_mask is None and q_len > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=causal_mask, dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=causal_mask is None and q_len > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() @@ -1074,7 +1076,10 @@ def _update_causal_mask( # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, # in order to dispatch on Flash Attention 2. 
if AttentionMaskConverter._ignore_causal_mask_sdpa( - attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, ): return None diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index e3b0e05127c52d..87db966e2d8f67 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -647,13 +647,17 @@ def forward( key_states = key_states.contiguous() value_states = value_states.contiguous() + # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an + # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True` + is_causal = True if causal_mask is None and q_len > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=causal_mask, dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=causal_mask is None and q_len > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() @@ -1057,7 +1061,10 @@ def _update_causal_mask( # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, # in order to dispatch on Flash Attention 2. if AttentionMaskConverter._ignore_causal_mask_sdpa( - attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, ): return None From a3aabc702e1c49243e7b48f22d88362d50e786c5 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 29 Apr 2024 14:47:26 -0400 Subject: [PATCH 16/18] Include safetensors as part of `_load_best_model` (#30553) * Include safetensors * Cleanup --- src/transformers/trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 1d3c164984ea1c..d967c9314b1025 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2611,7 +2611,9 @@ def _load_best_model(self): load_result = model.load_state_dict(state_dict, False) if not is_sagemaker_mp_enabled() and has_been_loaded: self._issue_warnings_after_load(load_result) - elif os.path.exists(os.path.join(self.state.best_model_checkpoint, WEIGHTS_INDEX_NAME)): + elif os.path.exists(os.path.join(self.state.best_model_checkpoint, SAFE_WEIGHTS_INDEX_NAME)) or os.path.exists( + os.path.join(self.state.best_model_checkpoint, WEIGHTS_INDEX_NAME) + ): load_result = load_sharded_checkpoint( model, self.state.best_model_checkpoint, strict=is_sagemaker_mp_enabled() ) From c712d05aa8fc8ba3ebe465079bd377d2dc9c2e07 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 30 Apr 2024 12:16:18 +0500 Subject: [PATCH 17/18] Pass `use_cache` in kwargs for GPTNeoX (#30538) pass use_cache in kwargs --- src/transformers/models/gpt_neox/modeling_gpt_neox.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 83c99202ac9379..e338c529abf293 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -1108,6 +1108,7 @@ def prepare_inputs_for_generation( "attention_mask": attention_mask, "past_key_values": past_key_values, 
"position_ids": position_ids, + "use_cache": kwargs.get("use_cache"), } ) From 0ae789e04330e15a90e34cd723c851a8ab8d7ec5 Mon Sep 17 00:00:00 2001 From: Jacky Lee <39754370+jla524@users.noreply.github.com> Date: Tue, 30 Apr 2024 04:09:08 -0700 Subject: [PATCH 18/18] Enable multi-device for more models (#30409) * feat: support for dinov2 * feat: support for depth_anything * feat: support for efficientformer * feat: support for bert (is this right?) * update: embedding split * remove: empty string * feat: support for align * fix: copies * fix: QAQBertEmbeddings * fix: more consistency issues * revert: support for effientformer * feat: support for altclip * feat: support for blip_text * support for ChineseCLIP * feat: support for depth anything * feat: support for dpt * feat: support for dpt * feat: support for git * feat: support for groupvit * update: format * fix: support for clip * fix: consistency * feat: support for pvt * feat: support for vit_msn * fix: consistency * fix: other copies * remove: device transfer * revert: in-place add * update: support for align * update: support for bert * update: support for Chinese CLIP * revert: changes to efficientformer * update: support for dpt * update: support for efficientformer * revert: changes to git * revert: changes to groupvit * revert: changes to roc_bert * update: support for vit_msn * revert: changes to dpt * remove: extra space * style: extra space --- src/transformers/models/align/modeling_align.py | 1 + src/transformers/models/altclip/modeling_altclip.py | 1 + src/transformers/models/bert/modeling_bert.py | 2 ++ src/transformers/models/chinese_clip/modeling_chinese_clip.py | 2 ++ .../models/depth_anything/modeling_depth_anything.py | 2 ++ src/transformers/models/dinov2/modeling_dinov2.py | 1 + .../models/efficientformer/modeling_efficientformer.py | 1 + src/transformers/models/pvt/modeling_pvt.py | 1 + src/transformers/models/vit_msn/modeling_vit_msn.py | 1 + 9 files changed, 12 insertions(+) diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index 0f8246e8f98c90..4fa128a5f67fa8 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -1203,6 +1203,7 @@ def _init_weights(self, module): ) class AlignTextModel(AlignPreTrainedModel): config_class = AlignTextConfig + _no_split_modules = ["AlignTextEmbeddings"] def __init__(self, config: AlignTextConfig, add_pooling_layer: bool = True): super().__init__(config) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index ba8abb311a8d2a..3e184085331720 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -1034,6 +1034,7 @@ class AltCLIPPreTrainedModel(PreTrainedModel): config_class = AltCLIPConfig base_model_prefix = "altclip" supports_gradient_checkpointing = True + _no_split_module = [] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index f7af0f1ef5a48c..9e2847b11b53f0 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -962,6 +962,8 @@ class BertModel(BertPreTrainedModel): `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. 
""" + _no_split_modules = ["BertEmbeddings"] + def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 87a1baa217baf1..7d5c8f2fcc855d 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -1113,6 +1113,7 @@ class ChineseCLIPTextModel(ChineseCLIPPreTrainedModel): """ config_class = ChineseCLIPTextConfig + _no_split_modules = ["ChineseCLIPTextEmbeddings"] def __init__(self, config, add_pooling_layer=True): super().__init__(config) @@ -1284,6 +1285,7 @@ def forward( class ChineseCLIPVisionModel(ChineseCLIPPreTrainedModel): config_class = ChineseCLIPVisionConfig main_input_name = "pixel_values" + _no_split_modules = ["ChineseCLIPVisionEmbeddings", "ChineseCLIPVisionAttention"] def __init__(self, config: ChineseCLIPVisionConfig): super().__init__(config) diff --git a/src/transformers/models/depth_anything/modeling_depth_anything.py b/src/transformers/models/depth_anything/modeling_depth_anything.py index bed91ac2a482bc..043bd0fac807b2 100644 --- a/src/transformers/models/depth_anything/modeling_depth_anything.py +++ b/src/transformers/models/depth_anything/modeling_depth_anything.py @@ -364,6 +364,8 @@ def forward(self, hidden_states: List[torch.Tensor], patch_height, patch_width) DEPTH_ANYTHING_START_DOCSTRING, ) class DepthAnythingForDepthEstimation(DepthAnythingPreTrainedModel): + _no_split_modules = ["DPTViTEmbeddings"] + def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/dinov2/modeling_dinov2.py b/src/transformers/models/dinov2/modeling_dinov2.py index c25022f6ec22d8..c90221f145d4ba 100644 --- a/src/transformers/models/dinov2/modeling_dinov2.py +++ b/src/transformers/models/dinov2/modeling_dinov2.py @@ -481,6 +481,7 @@ class Dinov2PreTrainedModel(PreTrainedModel): base_model_prefix = "dinov2" main_input_name = "pixel_values" supports_gradient_checkpointing = True + _no_split_modules = ["Dinov2SwiGLUFFN"] def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: """Initialize the weights""" diff --git a/src/transformers/models/efficientformer/modeling_efficientformer.py b/src/transformers/models/efficientformer/modeling_efficientformer.py index 70075cff55d7d9..cc62e9cbd21e40 100644 --- a/src/transformers/models/efficientformer/modeling_efficientformer.py +++ b/src/transformers/models/efficientformer/modeling_efficientformer.py @@ -555,6 +555,7 @@ class EfficientFormerModel(EfficientFormerPreTrainedModel): def __init__(self, config: EfficientFormerConfig): super().__init__(config) self.config = config + _no_split_modules = ["EfficientFormerMeta4D"] self.patch_embed = EfficientFormerConvStem(config, config.hidden_sizes[0]) self.encoder = EfficientFormerEncoder(config) diff --git a/src/transformers/models/pvt/modeling_pvt.py b/src/transformers/models/pvt/modeling_pvt.py index b169af0cbd5668..4574ca37876039 100755 --- a/src/transformers/models/pvt/modeling_pvt.py +++ b/src/transformers/models/pvt/modeling_pvt.py @@ -462,6 +462,7 @@ class PvtPreTrainedModel(PreTrainedModel): config_class = PvtConfig base_model_prefix = "pvt" main_input_name = "pixel_values" + _no_split_modules = [] def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: """Initialize the weights""" diff --git a/src/transformers/models/vit_msn/modeling_vit_msn.py 
b/src/transformers/models/vit_msn/modeling_vit_msn.py index 9c2269a3ae546f..424d657dc87859 100644 --- a/src/transformers/models/vit_msn/modeling_vit_msn.py +++ b/src/transformers/models/vit_msn/modeling_vit_msn.py @@ -421,6 +421,7 @@ class ViTMSNPreTrainedModel(PreTrainedModel): base_model_prefix = "vit" main_input_name = "pixel_values" supports_gradient_checkpointing = True + _no_split_modules = ["ViTMSNAttention"] # todo: Resort to https://github.com/facebookresearch/msn/blob/main/src/deit.py#L200-#L211 # when creating pre-training scripts.
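
The series closes with #30409, which adds `_no_split_modules` to several models so that Accelerate can shard them across devices. As a trailing illustration (not part of any patch above), here is a minimal sketch of what that change enables; it assumes `accelerate` is installed, that at least one GPU is visible, and uses `facebook/dinov2-base` purely as an example checkpoint, since DINOv2 is one of the models touched by PATCH 18/18.

```python
# Minimal sketch, not from the patches above. Assumptions: `accelerate` is installed,
# at least one GPU is visible, and "facebook/dinov2-base" is just an example checkpoint;
# any model gaining `_no_split_modules` in PATCH 18/18 is loaded the same way.
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "facebook/dinov2-base",
    torch_dtype=torch.float16,
    device_map="auto",  # Accelerate places layers across the available devices, keeping each
                        # module listed in `_no_split_modules` (e.g. Dinov2SwiGLUFFN) on one device.
)
print(model.hf_device_map)  # mapping of submodules to the devices they were assigned
```

With a single device everything is simply placed there; the `_no_split_modules` hint only matters once Accelerate has to split the model across multiple GPUs or offload part of it to CPU.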