From 8d5a22e6eb624b6128c833fce97e81e9fd3c8376 Mon Sep 17 00:00:00 2001
From: ka00ri
Date: Wed, 23 Nov 2022 14:33:30 +0100
Subject: [PATCH 1/4] add BetterTransformer support for ViLT architecture

---
 optimum/bettertransformer/models/__init__.py |  2 +
 .../models/encoder_models.py                 | 96 +++++++++++++++++++
 .../test_bettertransformer_vision.py         |  1 +
 3 files changed, 99 insertions(+)

diff --git a/optimum/bettertransformer/models/__init__.py b/optimum/bettertransformer/models/__init__.py
index 9dfe8dd863c..2da5abae46c 100644
--- a/optimum/bettertransformer/models/__init__.py
+++ b/optimum/bettertransformer/models/__init__.py
@@ -19,6 +19,7 @@
     BertLayerBetterTransformer,
     DistilBertLayerBetterTransformer,
     ViTLayerBetterTransformer,
+    ViltLayerBetterTransformer,
     Wav2Vec2EncoderLayerBetterTransformer,
     WhisperEncoderLayerBetterTransformer,
 )
@@ -65,6 +66,7 @@
     "ViTMAELayer": ViTLayerBetterTransformer,
     "ViTMSNLayer": ViTLayerBetterTransformer,
     "YolosLayer": ViTLayerBetterTransformer,
+    "ViltLayer": ViltLayerBetterTransformer,
 }
diff --git a/optimum/bettertransformer/models/encoder_models.py b/optimum/bettertransformer/models/encoder_models.py
index c6723d43114..d6a80a4241d 100644
--- a/optimum/bettertransformer/models/encoder_models.py
+++ b/optimum/bettertransformer/models/encoder_models.py
@@ -644,6 +644,102 @@ def forward(self, hidden_states, *_, **__):
         return (hidden_states,)
 
 
+class ViltLayerBetterTransformer(BetterTransformerBaseLayer):
+    def __init__(self, vilt_layer, config):
+        r"""
+        A simple conversion of the ViltLayer to its `BetterTransformer` implementation.
+
+        Args:
+            vilt_layer (`torch.nn.Module`):
+                The original `ViltLayer` where the weights need to be retrieved.
+        """
+        super().__init__(config)
+        # In_proj layer
+        self.in_proj_weight = nn.Parameter(
+            torch.cat(
+                [
+                    vilt_layer.attention.attention.query.weight,
+                    vilt_layer.attention.attention.key.weight,
+                    vilt_layer.attention.attention.value.weight,
+                ]
+            )
+        )
+        self.in_proj_bias = nn.Parameter(
+            torch.cat(
+                [
+                    vilt_layer.attention.attention.query.bias,
+                    vilt_layer.attention.attention.key.bias,
+                    vilt_layer.attention.attention.value.bias,
+                ]
+            )
+        )
+
+        # Out proj layer
+        self.out_proj_weight = vilt_layer.attention.output.dense.weight
+        self.out_proj_bias = vilt_layer.attention.output.dense.bias
+
+        # Linear layer 1
+        self.linear1_weight = vilt_layer.intermediate.dense.weight
+        self.linear1_bias = vilt_layer.intermediate.dense.bias
+
+        # Linear layer 2
+        self.linear2_weight = vilt_layer.output.dense.weight
+        self.linear2_bias = vilt_layer.output.dense.bias
+
+        # Layer norm 1
+        self.norm1_eps = vilt_layer.layernorm_before.eps
+        self.norm1_weight = vilt_layer.layernorm_before.weight
+        self.norm1_bias = vilt_layer.layernorm_before.bias
+
+        # Layer norm 2
+        self.norm2_eps = vilt_layer.layernorm_after.eps
+        self.norm2_weight = vilt_layer.layernorm_after.weight
+        self.norm2_bias = vilt_layer.layernorm_after.bias
+
+        # Model hyper parameters
+        self.num_heads = vilt_layer.attention.attention.num_attention_heads
+        self.embed_dim = int(vilt_layer.attention.attention.attention_head_size * self.num_heads)
+
+        # Last step: set the last layer to `False` -> this will be set to `True` when converting the model
+        self.is_last_layer = False
+        self.norm_first = True
+
+        self.validate_bettertransformer()
+
+    def forward(self, hidden_states, *_, **__):
+        r"""
+        This is just a wrapper around the forward function proposed in:
+        https://github.com/huggingface/transformers/pull/19553
+        """
+        super().forward_checker()
+        attention_mask = None
+
+        hidden_states = torch._transformer_encoder_layer_fwd(
+            hidden_states,
+            self.embed_dim,
+            self.num_heads,
+            self.in_proj_weight,
+            self.in_proj_bias,
+            self.out_proj_weight,
+            self.out_proj_bias,
+            self.use_gelu,
+            self.norm_first,
+            self.norm1_eps,
+            self.norm1_weight,
+            self.norm1_bias,
+            self.norm2_weight,
+            self.norm2_bias,
+            self.linear1_weight,
+            self.linear1_bias,
+            self.linear2_weight,
+            self.linear2_bias,
+            attention_mask,
+        )
+        if hidden_states.is_nested and self.is_last_layer:
+            hidden_states = hidden_states.to_padded_tensor(0.0)
+        return (hidden_states,)
+
+
 class Wav2Vec2EncoderLayerBetterTransformer(BetterTransformerBaseLayer):
     def __init__(self, wav2vec2_layer, config):
         r"""
diff --git a/tests/bettertransformer/test_bettertransformer_vision.py b/tests/bettertransformer/test_bettertransformer_vision.py
index 7c36cbf8e90..be8eeb7a039 100644
--- a/tests/bettertransformer/test_bettertransformer_vision.py
+++ b/tests/bettertransformer/test_bettertransformer_vision.py
@@ -27,6 +27,7 @@
     "hf-internal-testing/tiny-random-ViTMAEModel",
     "hf-internal-testing/tiny-random-ViTMSNModel",
     "hf-internal-testing/tiny-random-deit",
+    "hf-internal-testing/tiny-random-ViltModel",
 ]

From a487004cd16eefdf8ccd65021349e35706baac91 Mon Sep 17 00:00:00 2001
From: ka00ri
Date: Wed, 23 Nov 2022 17:41:44 +0100
Subject: [PATCH 2/4] fixed alphabetic order

---
 docs/source/bettertransformer/overview.mdx   |  5 ++--
 optimum/bettertransformer/models/__init__.py |  2 +-
 .../test_bettertransformer_vision.py         | 23 ++++++++++++++++++-
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/docs/source/bettertransformer/overview.mdx b/docs/source/bettertransformer/overview.mdx
index 4e4fecfae1b..aca1b9797a1 100644
--- a/docs/source/bettertransformer/overview.mdx
+++ b/docs/source/bettertransformer/overview.mdx
@@ -38,12 +38,13 @@ The list of supported model below:
 - [MarkupLM](https://arxiv.org/abs/2110.08518)
 - [RoBERTa](https://arxiv.org/abs/1907.11692)
 - [Splinter](https://arxiv.org/abs/2101.00438)
-- [XLMRoberta](https://arxiv.org/abs/1911.02116)
-- [Whisper](https://cdn.openai.com/papers/whisper.pdf)
+- [ViLT](https://arxiv.org/abs/2102.03334)
 - [ViT](https://arxiv.org/abs/2010.11929)
 - [ViT-MAE](https://arxiv.org/abs/2111.06377)
 - [ViT-MSN](https://arxiv.org/abs/2204.07141)
 - [Wav2Vec2](https://arxiv.org/abs/2006.11477)
+- [Whisper](https://cdn.openai.com/papers/whisper.pdf)
+- [XLMRoberta](https://arxiv.org/abs/1911.02116)
 - [YOLOS](https://arxiv.org/abs/2106.00666)
 
 Let us know by opening an issue in 🤗 Optimum if you want more models to be supported, or check out the contribution guideline if you want to add it by yourself!
diff --git a/optimum/bettertransformer/models/__init__.py b/optimum/bettertransformer/models/__init__.py
index 2da5abae46c..448425a268b 100644
--- a/optimum/bettertransformer/models/__init__.py
+++ b/optimum/bettertransformer/models/__init__.py
@@ -18,8 +18,8 @@
     BartEncoderLayerBetterTransformer,
     BertLayerBetterTransformer,
     DistilBertLayerBetterTransformer,
-    ViTLayerBetterTransformer,
     ViltLayerBetterTransformer,
+    ViTLayerBetterTransformer,
     Wav2Vec2EncoderLayerBetterTransformer,
     WhisperEncoderLayerBetterTransformer,
 )
diff --git a/tests/bettertransformer/test_bettertransformer_vision.py b/tests/bettertransformer/test_bettertransformer_vision.py
index be8eeb7a039..5dcfc00c507 100644
--- a/tests/bettertransformer/test_bettertransformer_vision.py
+++ b/tests/bettertransformer/test_bettertransformer_vision.py
@@ -15,7 +15,7 @@
 import unittest
 
 from PIL import Image
-from transformers import AutoFeatureExtractor
+from transformers import AutoFeatureExtractor, AutoProcessor
 
 import requests
 from testing_bettertransformer_utils import BetterTransformersTestMixin
@@ -27,6 +27,10 @@
     "hf-internal-testing/tiny-random-ViTMAEModel",
     "hf-internal-testing/tiny-random-ViTMSNModel",
     "hf-internal-testing/tiny-random-deit",
+]
+
+
+ALL_VISION_TEXT_MODELS_TO_TEST = [
     "hf-internal-testing/tiny-random-ViltModel",
 ]
 
@@ -45,3 +49,20 @@ def prepare_inputs_for_class(self, model_id=None):
         feature_extractor = AutoFeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-ViTModel")
         inputs = feature_extractor(images=image, return_tensors="pt")
         return inputs
+
+
+class BetterTransformersViLTTest(BetterTransformersTestMixin, unittest.TestCase):
+    r"""
+    Testing suite for Vision and Text Models - tests all the tests defined in `BetterTransformersTestMixin`
+    """
+    all_models_to_test = ALL_VISION_TEXT_MODELS_TO_TEST
+
+    def prepare_inputs_for_class(self, model_id=None):
+        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        image = Image.open(requests.get(url, stream=True).raw)
+        text = "How many cats are there?"
+
+        # Model takes image and text as input
+        processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-ViltModel")
+        inputs = processor(image, text, return_tensors="pt")
+        return inputs

From a0ebf9651d69cead9fe7c82928f46ecd39ac961a Mon Sep 17 00:00:00 2001
From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Date: Wed, 23 Nov 2022 18:08:05 +0100
Subject: [PATCH 3/4] Update tests/bettertransformer/test_bettertransformer_vision.py

---
 tests/bettertransformer/test_bettertransformer_vision.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/bettertransformer/test_bettertransformer_vision.py b/tests/bettertransformer/test_bettertransformer_vision.py
index 5dcfc00c507..be374ef9602 100644
--- a/tests/bettertransformer/test_bettertransformer_vision.py
+++ b/tests/bettertransformer/test_bettertransformer_vision.py
@@ -31,7 +31,7 @@
 
 
 ALL_VISION_TEXT_MODELS_TO_TEST = [
-    "hf-internal-testing/tiny-random-ViltModel",
+    "hf-internal-testing/tiny-vilt-random-vqa",
 ]

From a90ac106faf24d4771c6b8792aa4bdc5ef84b9da Mon Sep 17 00:00:00 2001
bou" <113170426+ka00ri@users.noreply.github.com> Date: Wed, 23 Nov 2022 18:37:55 +0100 Subject: [PATCH 4/4] Update tests/bettertransformer/test_bettertransformer_vision.py Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> --- tests/bettertransformer/test_bettertransformer_vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/bettertransformer/test_bettertransformer_vision.py b/tests/bettertransformer/test_bettertransformer_vision.py index be374ef9602..0f860e0a6c9 100644 --- a/tests/bettertransformer/test_bettertransformer_vision.py +++ b/tests/bettertransformer/test_bettertransformer_vision.py @@ -63,6 +63,6 @@ def prepare_inputs_for_class(self, model_id=None): text = "How many cats are there?" # Model takes image and text as input - processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-ViltModel") + processor = AutoProcessor.from_pretrained(model_id) inputs = processor(image, text, return_tensors="pt") return inputs