From e39c9f7a78fa2960a7045e8fc5a2d96b5d7eebf1 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Fri, 10 Jan 2025 10:12:04 +0100
Subject: [PATCH] v4.48-release

---
 examples/flax/question-answering/run_qa.py | 2 +-
 .../run_flax_speech_recognition_seq2seq.py | 2 +-
 .../flax/text-classification/run_flax_glue.py | 2 +-
 .../flax/token-classification/run_flax_ner.py | 2 +-
 .../run_audio_classification.py | 2 +-
 .../contrastive-image-text/run_clip.py | 2 +-
 .../run_image_classification.py | 2 +-
 .../run_image_classification_no_trainer.py | 2 +-
 examples/pytorch/image-pretraining/run_mae.py | 2 +-
 examples/pytorch/image-pretraining/run_mim.py | 2 +-
 .../image-pretraining/run_mim_no_trainer.py | 2 +-
 .../run_instance_segmentation.py | 2 +-
 .../run_instance_segmentation_no_trainer.py | 2 +-
 examples/pytorch/language-modeling/run_clm.py | 2 +-
 .../language-modeling/run_clm_no_trainer.py | 2 +-
 examples/pytorch/language-modeling/run_fim.py | 2 +-
 .../language-modeling/run_fim_no_trainer.py | 2 +-
 examples/pytorch/language-modeling/run_mlm.py | 2 +-
 .../language-modeling/run_mlm_no_trainer.py | 2 +-
 examples/pytorch/language-modeling/run_plm.py | 2 +-
 examples/pytorch/multiple-choice/run_swag.py | 2 +-
 .../multiple-choice/run_swag_no_trainer.py | 2 +-
 .../object-detection/run_object_detection.py | 2 +-
 .../run_object_detection_no_trainer.py | 2 +-
 examples/pytorch/question-answering/run_qa.py | 2 +-
 .../question-answering/run_qa_beam_search.py | 2 +-
 .../run_qa_beam_search_no_trainer.py | 2 +-
 .../question-answering/run_qa_no_trainer.py | 2 +-
 .../question-answering/run_seq2seq_qa.py | 2 +-
 .../run_semantic_segmentation.py | 2 +-
 .../run_semantic_segmentation_no_trainer.py | 2 +-
 .../run_speech_recognition_ctc.py | 2 +-
 .../run_speech_recognition_ctc_adapter.py | 2 +-
 .../run_speech_recognition_seq2seq.py | 2 +-
 .../summarization/run_summarization.py | 2 +-
 .../run_summarization_no_trainer.py | 2 +-
 .../text-classification/run_classification.py | 2 +-
 .../pytorch/text-classification/run_glue.py | 2 +-
 .../run_glue_no_trainer.py | 2 +-
 .../pytorch/text-classification/run_xnli.py | 2 +-
 .../pytorch/token-classification/run_ner.py | 2 +-
 .../run_ner_no_trainer.py | 2 +-
 .../pytorch/translation/run_translation.py | 2 +-
 .../translation/run_translation_no_trainer.py | 2 +-
 .../contrastive-image-text/run_clip.py | 2 +-
 .../run_image_classification.py | 2 +-
 .../tensorflow/multiple-choice/run_swag.py | 2 +-
 .../tensorflow/question-answering/run_qa.py | 2 +-
 .../summarization/run_summarization.py | 2 +-
 .../text-classification/run_glue.py | 2 +-
 .../tensorflow/translation/run_translation.py | 2 +-
 setup.py | 2 +-
 src/transformers/__init__.py | 2 +-
 ...lbert_original_tf_checkpoint_to_pytorch.py | 62 -
 .../models/align/convert_align_tf_to_hf.py | 389 -----
 .../models/aria/convert_aria_weights_to_hf.py | 162 --
 ...trogram_transformer_original_to_pytorch.py | 279 ----
 .../bamba/convert_mamba_ssm_checkpoint.py | 273 ----
 .../models/bark/convert_suno_to_hf.py | 263 ----
 ..._original_pytorch_checkpoint_to_pytorch.py | 156 --
 .../beit/convert_beit_unilm_to_pytorch.py | 373 -----
 ...bert_original_tf2_checkpoint_to_pytorch.py | 246 ---
 ..._bert_original_tf_checkpoint_to_pytorch.py | 62 -
 ..._bert_pytorch_checkpoint_to_original_tf.py | 112 --
 ...ping_original_tf2_checkpoint_to_pytorch.py | 188 ---
 ...gbird_original_tf_checkpoint_to_pytorch.py | 69 -
 .../convert_bigbird_pegasus_tf_to_pytorch.py | 170 ---
 ..._original_pytorch_checkpoint_to_pytorch.py | 292 ----
 .../models/bit/convert_bit_to_pytorch.py | 177 ---
..._original_pytorch_checkpoint_to_pytorch.py | 114 -- .../convert_blip_original_pytorch_to_hf.py | 191 --- .../convert_blip_2_original_to_pytorch.py | 390 ----- ...rt_bloom_original_checkpoint_to_pytorch.py | 254 ---- .../models/bros/convert_bros_to_pytorch.py | 145 -- ..._byt5_original_tf_checkpoint_to_pytorch.py | 59 - ...anine_original_tf_checkpoint_to_pytorch.py | 65 - .../convert_chameleon_weights_to_hf.py | 476 ------ ...ert_chinese_clip_original_pytorch_to_hf.py | 134 -- .../convert_clap_original_pytorch_to_hf.py | 133 -- .../convert_clip_original_pytorch_to_hf.py | 156 -- .../convert_clipseg_original_pytorch_to_hf.py | 264 ---- .../models/clvp/convert_clvp_to_hf.py | 234 --- .../colpali/convert_colpali_weights_to_hf.py | 214 --- ..._original_pytorch_checkpoint_to_pytorch.py | 324 ---- ...ginal_tf1_checkpoint_to_pytorch_and_tf2.py | 57 - .../convnext/convert_convnext_to_pytorch.py | 242 --- .../convert_convnextv2_to_pytorch.py | 286 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 362 ----- .../models/dac/convert_dac_checkpoint.py | 261 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 285 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 207 --- ..._original_pytorch_checkpoint_to_pytorch.py | 374 ----- .../convert_deformable_detr_to_pytorch.py | 236 --- .../deit/convert_deit_timm_to_pytorch.py | 218 --- ...original_gluonnlp_checkpoint_to_pytorch.py | 318 ---- .../deta/convert_deta_resnet_to_pytorch.py | 319 ---- .../deta/convert_deta_swin_to_pytorch.py | 326 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 252 ---- ...convert_gptsan_tf_checkpoint_to_pytorch.py | 181 --- .../deprecated/jukebox/convert_jukebox.py | 279 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 292 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 70 - ...fo_xl_original_tf_checkpoint_to_pytorch.py | 121 -- .../deprecated/van/convert_van_to_pytorch.py | 290 ---- .../convert_vit_hybrid_timm_to_pytorch.py | 282 ---- .../convert_depth_anything_to_hf.py | 368 ----- ..._original_pytorch_checkpoint_to_pytorch.py | 277 ---- .../models/detr/convert_detr_to_pytorch.py | 385 ----- ..._original_pytorch_checkpoint_to_pytorch.py | 46 - .../models/dinov2/convert_dinov2_to_hf.py | 285 ---- .../convert_dinov2_with_registers_to_hf.py | 291 ---- .../dit/convert_dit_unilm_to_pytorch.py | 230 --- .../models/donut/convert_donut_to_pytorch.py | 234 --- ...vert_dpr_original_checkpoint_to_pytorch.py | 143 -- .../models/dpt/convert_dinov2_depth_to_hf.py | 383 ----- .../models/dpt/convert_dpt_beit_to_hf.py | 305 ---- .../dpt/convert_dpt_hybrid_to_pytorch.py | 315 ---- .../models/dpt/convert_dpt_swinv2_to_hf.py | 321 ---- .../models/dpt/convert_dpt_to_pytorch.py | 285 ---- .../convert_efficientnet_to_pytorch.py | 339 ----- ...ectra_original_tf_checkpoint_to_pytorch.py | 79 - .../convert_encodec_checkpoint_to_pytorch.py | 365 ----- src/transformers/models/esm/convert_esm.py | 399 ----- .../falcon/convert_custom_code_checkpoint.py | 74 - ..._original_pytorch_checkpoint_to_pytorch.py | 210 --- .../fastspeech2_conformer/convert_hifigan.py | 134 -- .../convert_model_with_hifigan.py | 102 -- .../flava/convert_dalle_to_flava_codebook.py | 102 -- .../convert_flava_original_pytorch_to_hf.py | 99 -- ...net_original_flax_checkpoint_to_pytorch.py | 156 -- .../focalnet/convert_focalnet_to_hf_format.py | 237 --- ..._original_pytorch_checkpoint_to_pytorch.py | 280 ---- ...unnel_original_tf_checkpoint_to_pytorch.py | 67 - .../fuyu/convert_fuyu_model_weights_to_hf.py | 134 -- .../gemma/convert_gemma_weights_to_hf.py | 206 --- 
.../gemma2/convert_gemma2_weights_to_hf.py | 239 --- .../models/git/convert_git_to_pytorch.py | 448 ------ .../models/glm/convert_glm_weights_to_hf.py | 195 --- .../models/glpn/convert_glpn_to_pytorch.py | 218 --- ..._gpt2_original_tf_checkpoint_to_pytorch.py | 68 - .../convert_gpt_neo_mesh_tf_to_pytorch.py | 71 - .../gpt_sw3/convert_megatron_to_pytorch.py | 197 --- .../convert_grounding_dino_to_hf.py | 491 ------ .../groupvit/convert_groupvit_nvlab_to_hf.py | 217 --- .../models/hiera/convert_hiera_to_hf.py | 369 ----- ...rt_original_s3prl_checkpoint_to_pytorch.py | 222 --- ..._original_pytorch_checkpoint_to_pytorch.py | 261 ---- ...rt_original_s3prl_checkpoint_to_pytorch.py | 68 - .../convert_idefics2_weights_to_hf.py | 185 --- .../convert_idefics3_weights_to_hf.py | 214 --- .../models/ijepa/convert_ijepa_to_hf.py | 267 ---- ...onvert_imagegpt_original_tf2_to_pytorch.py | 71 - ...onvert_instructblip_original_to_pytorch.py | 303 ---- ...t_instructblipvideo_original_to_pytorch.py | 305 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 77 - .../levit/convert_levit_timm_to_pytorch.py | 180 --- .../llama/convert_llama_weights_to_hf.py | 601 -------- .../llava/convert_llava_weights_to_hf.py | 204 --- .../convert_llava_next_weights_to_hf.py | 397 ----- .../convert_llava_next_video_weights_to_hf.py | 276 ---- .../convert_llava_onevision_weights_to_hf.py | 388 ----- ...r_original_pytorch_lightning_to_pytorch.py | 85 -- .../convert_longt5x_checkpoint_to_flax.py | 215 --- ..._original_pytorch_checkpoint_to_pytorch.py | 170 --- ...xmert_original_tf_checkpoint_to_pytorch.py | 59 - ...t_m2m100_original_checkpoint_to_pytorch.py | 85 -- ...convert_mamba_ssm_checkpoint_to_pytorch.py | 153 -- ...onvert_mamba2_ssm_checkpoint_to_pytorch.py | 193 --- .../convert_marian_tatoeba_to_pytorch.py | 1327 ----------------- .../marian/convert_marian_to_pytorch.py | 717 --------- ..._original_pytorch_checkpoint_to_pytorch.py | 1019 ------------- ..._original_pytorch_checkpoint_to_pytorch.py | 731 --------- .../convert_maskformer_resnet_to_pytorch.py | 390 ----- .../convert_maskformer_swin_to_pytorch.py | 333 ----- ...rt_mbart_original_checkpoint_to_pytorch.py | 83 -- .../convert_megatron_bert_checkpoint.py | 334 ----- .../convert_megatron_gpt2_checkpoint.py | 358 ----- .../convert_mimi_checkpoint_to_pytorch.py | 198 --- .../mistral/convert_mistral_weights_to_hf.py | 276 ---- .../mixtral/convert_mixtral_weights_to_hf.py | 244 --- .../mllama/convert_mllama_weights_to_hf.py | 639 -------- ..._original_pytorch_checkpoint_to_pytorch.py | 229 --- ...ebert_original_tf_checkpoint_to_pytorch.py | 58 - ...nvert_original_tf_checkpoint_to_pytorch.py | 141 -- ...nvert_original_tf_checkpoint_to_pytorch.py | 177 --- .../mobilevit/convert_mlcvnets_to_pytorch.py | 311 ---- .../convert_mlcvnets_to_pytorch.py | 330 ---- .../moshi/convert_moshi_transformers.py | 311 ---- .../mra/convert_mra_pytorch_to_pytorch.py | 110 -- .../musicgen/convert_musicgen_transformers.py | 236 --- .../convert_musicgen_melody_transformers.py | 267 ---- ..._myt5_original_tf_checkpoint_to_pytorch.py | 60 - .../nemotron/convert_nemotron_nemo_to_hf.py | 346 ----- ..._sharded_original_checkpoint_to_pytorch.py | 160 -- .../models/nougat/convert_nougat_to_hf.py | 282 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 111 -- .../models/olmo/convert_olmo_weights_to_hf.py | 248 --- .../olmo2/convert_olmo2_weights_to_hf.py | 304 ---- .../olmoe/convert_olmoe_weights_to_hf.py | 281 ---- .../omdet_turbo/convert_omdet_turbo_to_hf.py | 349 ----- 
.../oneformer/convert_to_hf_oneformer.py | 1191 --------------- ...penai_original_tf_checkpoint_to_pytorch.py | 74 - ..._original_pytorch_checkpoint_to_pytorch.py | 113 -- .../models/owlv2/convert_owlv2_to_hf.py | 422 ------ .../convert_owlvit_original_flax_to_hf.py | 406 ----- .../convert_paligemma2_weights_to_hf.py | 415 ------ .../convert_paligemma_weights_to_hf.py | 347 ----- .../pegasus/convert_pegasus_tf_to_pytorch.py | 131 -- .../convert_perceiver_haiku_to_pytorch.py | 468 ------ .../convert_persimmon_weights_to_hf.py | 129 -- .../models/phi/convert_phi_weights_to_hf.py | 207 --- ...nvert_pix2struct_original_pytorch_to_hf.py | 155 -- .../pixtral/convert_pixtral_weights_to_hf.py | 319 ---- ...ert_plbart_original_checkpoint_to_torch.py | 94 -- .../convert_poolformer_original_to_pytorch.py | 214 --- .../convert_pop2piano_weights_to_hf.py | 190 --- ..._original_pytorch_checkpoint_to_pytorch.py | 159 -- .../models/pvt/convert_pvt_to_pytorch.py | 226 --- .../pvt_v2/convert_pvt_v2_to_pytorch.py | 295 ---- .../convert_recurrent_gemma_to_hf.py | 222 --- ...ert_reformer_trax_checkpoint_to_pytorch.py | 226 --- .../convert_regnet_seer_10b_to_pytorch.py | 304 ---- .../regnet/convert_regnet_to_pytorch.py | 458 ------ ...onvert_rembert_tf_checkpoint_to_pytorch.py | 62 - .../resnet/convert_resnet_to_pytorch.py | 199 --- ..._original_pytorch_checkpoint_to_pytorch.py | 177 --- ..._original_pytorch_checkpoint_to_pytorch.py | 77 - ...ormer_original_tf_checkpoint_to_pytorch.py | 62 - ..._detr_original_pytorch_checkpoint_to_hf.py | 782 ---------- .../rwkv/convert_rwkv_checkpoint_to_hf.py | 209 --- .../models/sam/convert_sam_to_hf.py | 251 ---- .../seamless_m4t/convert_fairseq2_to_hf.py | 396 ----- .../seamless_m4t_v2/convert_fairseq2_to_hf.py | 404 ----- .../convert_segformer_original_to_pytorch.py | 387 ----- .../models/seggpt/convert_seggpt_to_hf.py | 221 --- ..._original_pytorch_checkpoint_to_pytorch.py | 305 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 317 ---- .../models/siglip/convert_siglip_to_hf.py | 412 ----- ...rt_wav2vec2_seq2seq_original_to_pytorch.py | 357 ----- ...xt_wav2vec2_seq2seq_original_to_pytorch.py | 316 ---- .../convert_s2t_fairseq_to_tfms.py | 121 -- .../models/speecht5/convert_hifigan.py | 108 -- ..._original_pytorch_checkpoint_to_pytorch.py | 401 ----- .../convert_superpoint_to_pytorch.py | 175 --- .../convert_swiftformer_original_to_hf.py | 175 --- .../swin/convert_swin_simmim_to_pytorch.py | 182 --- .../swin/convert_swin_timm_to_pytorch.py | 173 --- .../convert_swin2sr_original_to_pytorch.py | 278 ---- .../swinv2/convert_swinv2_timm_to_pytorch.py | 219 --- .../switch_transformers/convert_big_switch.py | 193 --- ...ers_original_flax_checkpoint_to_pytorch.py | 203 --- ...rt_t5_original_tf_checkpoint_to_pytorch.py | 59 - .../t5/convert_t5x_checkpoint_to_flax.py | 235 --- .../t5/convert_t5x_checkpoint_to_pytorch.py | 238 --- .../convert_table_transformer_to_hf.py | 317 ---- ...convert_table_transformer_to_hf_no_timm.py | 434 ------ ...tapas_original_tf_checkpoint_to_pytorch.py | 137 -- .../models/textnet/convert_textnet_to_hf.py | 208 --- .../convert_timesformer_to_pytorch.py | 253 ---- .../trocr/convert_trocr_unilm_to_pytorch.py | 237 --- .../models/udop/convert_udop_to_hf.py | 224 --- .../convert_umt5_checkpoint_to_pytorch.py | 274 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 273 ---- ...ch_original_s3prl_checkpoint_to_pytorch.py | 109 -- ..._original_pytorch_checkpoint_to_pytorch.py | 224 --- .../models/univnet/convert_univnet.py | 162 -- 
.../convert_convnext_upernet_to_pytorch.py | 214 --- .../convert_swin_upernet_to_pytorch.py | 297 ---- .../convert_video_llava_weights_to_hf.py | 159 -- .../videomae/convert_videomae_to_pytorch.py | 324 ---- .../vilt/convert_vilt_original_to_pytorch.py | 299 ---- .../convert_vipllava_weights_to_hf.py | 132 -- ..._original_pytorch_checkpoint_to_pytorch.py | 149 -- .../models/vit/convert_dino_to_pytorch.py | 218 --- .../models/vit/convert_vit_timm_to_pytorch.py | 254 ---- .../vit_mae/convert_vit_mae_to_pytorch.py | 178 --- .../models/vit_msn/convert_msn_to_pytorch.py | 245 --- .../models/vitmatte/convert_vitmatte_to_hf.py | 170 --- .../models/vitpose/convert_vitpose_to_hf.py | 355 ----- .../vits/convert_original_checkpoint.py | 390 ----- .../vivit/convert_vivit_flax_to_pytorch.py | 231 --- ..._original_pytorch_checkpoint_to_pytorch.py | 385 ----- ...c2_original_s3prl_checkpoint_to_pytorch.py | 109 -- .../convert_wav2vec2_seamless_checkpoint.py | 217 --- ..._original_pytorch_checkpoint_to_pytorch.py | 309 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 206 --- ...lm_original_s3prl_checkpoint_to_pytorch.py | 109 -- .../models/whisper/convert_openai_to_hf.py | 370 ----- .../convert_x_clip_original_pytorch_to_hf.py | 386 ----- .../convert_xglm_original_ckpt_to_trfms.py | 68 - ..._original_pytorch_checkpoint_to_pytorch.py | 77 - ..._original_pytorch_checkpoint_to_pytorch.py | 183 --- ...xlnet_original_tf_checkpoint_to_pytorch.py | 113 -- ..._original_pytorch_checkpoint_to_pytorch.py | 212 --- .../models/yolos/convert_yolos_to_pytorch.py | 267 ---- .../yoso/convert_yoso_pytorch_to_pytorch.py | 108 -- .../models/zoedepth/convert_zoedepth_to_hf.py | 426 ------ 297 files changed, 53 insertions(+), 61511 deletions(-) delete mode 100644 src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/align/convert_align_tf_to_hf.py delete mode 100644 src/transformers/models/aria/convert_aria_weights_to_hf.py delete mode 100644 src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py delete mode 100644 src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py delete mode 100644 src/transformers/models/bark/convert_suno_to_hf.py delete mode 100644 src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/beit/convert_beit_unilm_to_pytorch.py delete mode 100644 src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py delete mode 100644 src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py delete mode 100755 src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/bit/convert_bit_to_pytorch.py delete mode 100644 src/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py delete 
mode 100644 src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/bros/convert_bros_to_pytorch.py delete mode 100755 src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py delete mode 100644 src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/clvp/convert_clvp_to_hf.py delete mode 100644 src/transformers/models/colpali/convert_colpali_weights_to_hf.py delete mode 100644 src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py delete mode 100644 src/transformers/models/convnext/convert_convnext_to_pytorch.py delete mode 100644 src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py delete mode 100644 src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/dac/convert_dac_checkpoint.py delete mode 100644 src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py delete mode 100644 src/transformers/models/deit/convert_deit_timm_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/jukebox/convert_jukebox.py delete mode 100644 src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/van/convert_van_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py delete mode 100644 src/transformers/models/depth_anything/convert_depth_anything_to_hf.py delete mode 100644 src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 
src/transformers/models/detr/convert_detr_to_pytorch.py delete mode 100644 src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/dinov2/convert_dinov2_to_hf.py delete mode 100644 src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py delete mode 100644 src/transformers/models/dit/convert_dit_unilm_to_pytorch.py delete mode 100644 src/transformers/models/donut/convert_donut_to_pytorch.py delete mode 100644 src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/dpt/convert_dinov2_depth_to_hf.py delete mode 100644 src/transformers/models/dpt/convert_dpt_beit_to_hf.py delete mode 100644 src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py delete mode 100644 src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py delete mode 100644 src/transformers/models/dpt/convert_dpt_to_pytorch.py delete mode 100644 src/transformers/models/efficientnet/convert_efficientnet_to_pytorch.py delete mode 100644 src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/encodec/convert_encodec_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/esm/convert_esm.py delete mode 100644 src/transformers/models/falcon/convert_custom_code_checkpoint.py delete mode 100644 src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/fastspeech2_conformer/convert_hifigan.py delete mode 100644 src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py delete mode 100644 src/transformers/models/flava/convert_dalle_to_flava_codebook.py delete mode 100644 src/transformers/models/flava/convert_flava_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/focalnet/convert_focalnet_to_hf_format.py delete mode 100755 src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/fuyu/convert_fuyu_model_weights_to_hf.py delete mode 100644 src/transformers/models/gemma/convert_gemma_weights_to_hf.py delete mode 100644 src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py delete mode 100644 src/transformers/models/git/convert_git_to_pytorch.py delete mode 100644 src/transformers/models/glm/convert_glm_weights_to_hf.py delete mode 100644 src/transformers/models/glpn/convert_glpn_to_pytorch.py delete mode 100755 src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py delete mode 100644 src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py delete mode 100644 src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py delete mode 100644 src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py delete mode 100644 src/transformers/models/hiera/convert_hiera_to_hf.py delete mode 100644 src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 
src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py delete mode 100644 src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py delete mode 100644 src/transformers/models/ijepa/convert_ijepa_to_hf.py delete mode 100644 src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py delete mode 100644 src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py delete mode 100644 src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py delete mode 100644 src/transformers/models/kosmos2/convert_kosmos2_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/levit/convert_levit_timm_to_pytorch.py delete mode 100644 src/transformers/models/llama/convert_llama_weights_to_hf.py delete mode 100644 src/transformers/models/llava/convert_llava_weights_to_hf.py delete mode 100644 src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py delete mode 100644 src/transformers/models/llava_next_video/convert_llava_next_video_weights_to_hf.py delete mode 100644 src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py delete mode 100644 src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py delete mode 100644 src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py delete mode 100644 src/transformers/models/luke/convert_luke_original_pytorch_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/lxmert/convert_lxmert_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/m2m_100/convert_m2m100_original_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mamba/convert_mamba_ssm_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mamba2/convert_mamba2_ssm_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py delete mode 100644 src/transformers/models/marian/convert_marian_to_pytorch.py delete mode 100644 src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py delete mode 100644 src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py delete mode 100644 src/transformers/models/mbart/convert_mbart_original_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py delete mode 100644 src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py delete mode 100644 src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mistral/convert_mistral_weights_to_hf.py delete mode 100644 src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py delete mode 100644 src/transformers/models/mllama/convert_mllama_weights_to_hf.py delete mode 100644 src/transformers/models/mluke/convert_mluke_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mobilebert/convert_mobilebert_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mobilenet_v1/convert_original_tf_checkpoint_to_pytorch.py delete mode 100644 
src/transformers/models/mobilenet_v2/convert_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py delete mode 100644 src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py delete mode 100644 src/transformers/models/moshi/convert_moshi_transformers.py delete mode 100644 src/transformers/models/mra/convert_mra_pytorch_to_pytorch.py delete mode 100644 src/transformers/models/musicgen/convert_musicgen_transformers.py delete mode 100644 src/transformers/models/musicgen_melody/convert_musicgen_melody_transformers.py delete mode 100644 src/transformers/models/myt5/convert_myt5_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/nemotron/convert_nemotron_nemo_to_hf.py delete mode 100644 src/transformers/models/nllb_moe/convert_nllb_moe_sharded_original_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/nougat/convert_nougat_to_hf.py delete mode 100644 src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/olmo/convert_olmo_weights_to_hf.py delete mode 100644 src/transformers/models/olmo2/convert_olmo2_weights_to_hf.py delete mode 100644 src/transformers/models/olmoe/convert_olmoe_weights_to_hf.py delete mode 100644 src/transformers/models/omdet_turbo/convert_omdet_turbo_to_hf.py delete mode 100644 src/transformers/models/oneformer/convert_to_hf_oneformer.py delete mode 100755 src/transformers/models/openai/convert_openai_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/opt/convert_opt_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/owlv2/convert_owlv2_to_hf.py delete mode 100644 src/transformers/models/owlvit/convert_owlvit_original_flax_to_hf.py delete mode 100644 src/transformers/models/paligemma/convert_paligemma2_weights_to_hf.py delete mode 100644 src/transformers/models/paligemma/convert_paligemma_weights_to_hf.py delete mode 100644 src/transformers/models/pegasus/convert_pegasus_tf_to_pytorch.py delete mode 100644 src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py delete mode 100644 src/transformers/models/persimmon/convert_persimmon_weights_to_hf.py delete mode 100644 src/transformers/models/phi/convert_phi_weights_to_hf.py delete mode 100644 src/transformers/models/pix2struct/convert_pix2struct_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py delete mode 100644 src/transformers/models/plbart/convert_plbart_original_checkpoint_to_torch.py delete mode 100644 src/transformers/models/poolformer/convert_poolformer_original_to_pytorch.py delete mode 100644 src/transformers/models/pop2piano/convert_pop2piano_weights_to_hf.py delete mode 100644 src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/pvt/convert_pvt_to_pytorch.py delete mode 100644 src/transformers/models/pvt_v2/convert_pvt_v2_to_pytorch.py delete mode 100644 src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py delete mode 100755 src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py delete mode 100644 src/transformers/models/regnet/convert_regnet_to_pytorch.py delete mode 100755 src/transformers/models/rembert/convert_rembert_tf_checkpoint_to_pytorch.py delete 
mode 100644 src/transformers/models/resnet/convert_resnet_to_pytorch.py delete mode 100644 src/transformers/models/roberta/convert_roberta_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/roberta_prelayernorm/convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/roformer/convert_roformer_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/rt_detr/convert_rt_detr_original_pytorch_checkpoint_to_hf.py delete mode 100644 src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py delete mode 100644 src/transformers/models/sam/convert_sam_to_hf.py delete mode 100644 src/transformers/models/seamless_m4t/convert_fairseq2_to_hf.py delete mode 100644 src/transformers/models/seamless_m4t_v2/convert_fairseq2_to_hf.py delete mode 100644 src/transformers/models/segformer/convert_segformer_original_to_pytorch.py delete mode 100644 src/transformers/models/seggpt/convert_seggpt_to_hf.py delete mode 100644 src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/siglip/convert_siglip_to_hf.py delete mode 100644 src/transformers/models/speech_encoder_decoder/convert_mbart_wav2vec2_seq2seq_original_to_pytorch.py delete mode 100644 src/transformers/models/speech_encoder_decoder/convert_speech_to_text_wav2vec2_seq2seq_original_to_pytorch.py delete mode 100644 src/transformers/models/speech_to_text/convert_s2t_fairseq_to_tfms.py delete mode 100644 src/transformers/models/speecht5/convert_hifigan.py delete mode 100644 src/transformers/models/speecht5/convert_speecht5_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/superpoint/convert_superpoint_to_pytorch.py delete mode 100644 src/transformers/models/swiftformer/convert_swiftformer_original_to_hf.py delete mode 100644 src/transformers/models/swin/convert_swin_simmim_to_pytorch.py delete mode 100644 src/transformers/models/swin/convert_swin_timm_to_pytorch.py delete mode 100644 src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py delete mode 100644 src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py delete mode 100644 src/transformers/models/switch_transformers/convert_big_switch.py delete mode 100644 src/transformers/models/switch_transformers/convert_switch_transformers_original_flax_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/t5/convert_t5_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py delete mode 100755 src/transformers/models/t5/convert_t5x_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/table_transformer/convert_table_transformer_to_hf.py delete mode 100644 src/transformers/models/table_transformer/convert_table_transformer_to_hf_no_timm.py delete mode 100644 src/transformers/models/tapas/convert_tapas_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/textnet/convert_textnet_to_hf.py delete mode 100644 src/transformers/models/timesformer/convert_timesformer_to_pytorch.py delete mode 100644 src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py delete mode 100644 src/transformers/models/udop/convert_udop_to_hf.py delete mode 100644 src/transformers/models/umt5/convert_umt5_checkpoint_to_pytorch.py delete mode 100644 
src/transformers/models/unispeech/convert_unispeech_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/unispeech_sat/convert_unispeech_original_s3prl_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/unispeech_sat/convert_unispeech_sat_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/univnet/convert_univnet.py delete mode 100644 src/transformers/models/upernet/convert_convnext_upernet_to_pytorch.py delete mode 100644 src/transformers/models/upernet/convert_swin_upernet_to_pytorch.py delete mode 100644 src/transformers/models/video_llava/convert_video_llava_weights_to_hf.py delete mode 100644 src/transformers/models/videomae/convert_videomae_to_pytorch.py delete mode 100644 src/transformers/models/vilt/convert_vilt_original_to_pytorch.py delete mode 100644 src/transformers/models/vipllava/convert_vipllava_weights_to_hf.py delete mode 100644 src/transformers/models/visual_bert/convert_visual_bert_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/vit/convert_dino_to_pytorch.py delete mode 100644 src/transformers/models/vit/convert_vit_timm_to_pytorch.py delete mode 100644 src/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py delete mode 100644 src/transformers/models/vit_msn/convert_msn_to_pytorch.py delete mode 100644 src/transformers/models/vitmatte/convert_vitmatte_to_hf.py delete mode 100644 src/transformers/models/vitpose/convert_vitpose_to_hf.py delete mode 100644 src/transformers/models/vits/convert_original_checkpoint.py delete mode 100644 src/transformers/models/vivit/convert_vivit_flax_to_pytorch.py delete mode 100644 src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/wav2vec2/convert_wav2vec2_original_s3prl_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py delete mode 100644 src/transformers/models/wav2vec2_conformer/convert_wav2vec2_conformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/wavlm/convert_wavlm_original_s3prl_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/whisper/convert_openai_to_hf.py delete mode 100644 src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/xglm/convert_xglm_original_ckpt_to_trfms.py delete mode 100755 src/transformers/models/xlm/convert_xlm_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/xlm_roberta_xl/convert_xlm_roberta_xl_original_pytorch_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/xlnet/convert_xlnet_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/xmod/convert_xmod_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/yolos/convert_yolos_to_pytorch.py delete mode 100644 src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py delete mode 100644 src/transformers/models/zoedepth/convert_zoedepth_to_hf.py diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index ee155e377e41..87496f95a1d2 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -61,7 +61,7 @@ logger = logging.getLogger(__name__) # Will error if the 
minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py index 095af99efffc..590bf5a0518c 100644 --- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py +++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risk. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt") diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index ddbde78f703c..a7d5c0a0c912 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index 5f1988c36de1..7652fc2355a9 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index ef308316569b..650b088b302e 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -45,7 +45,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index d42fb52d5c13..9f9f5decf6a9 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 111d8adce8b4..ddf35750d447 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 6cbcac0a7e68..6295cb46e551 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index f23e55191709..fd876fdfcfc2 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 9d052076b7b1..654a9be30bcd 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index 100a1365c2e9..9c31fe31ac0c 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py index 806330fb72d1..138f61bc4631 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py @@ -46,7 +46,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py index d888b7853dd4..9f2f2347b889 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py @@ -52,7 +52,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 10bfee8f25f7..442e246246c0 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 078b0add065c..5fc24dd81bee 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index cac845f3a055..d1ea873ad9a1 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -58,7 +58,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index 0a0e10511fa2..6b5673088ad5 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 8cb30099491a..db7565a21bf8 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -54,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 0bff38707d56..8bbe6fe9edad 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 20763558a5f6..0cb9abf487ef 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index f188e4e476a2..d2bee272db76 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 2d4e8bdbb92c..6c40f9ecb1d3 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index 07fcb36acb15..4ae9b39f6ea6 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt") diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py index 33ad0499301e..9f3f7eefd81e 100644 --- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py +++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logging.basicConfig(level=logging.INFO) logger = get_logger(__name__) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 6de464f43670..0c7a124a10f9 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index c3e12ac9edef..27b235c7bb41 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 8e791564b007..2b98a6e11d97 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 6ccce481b548..77b3fda276df 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index c1874b3fe18e..7096bf12ba9b 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index 6f77f8256417..e89272226ef0 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 16e64eb92343..8d92330f7c68 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 9eb3498c8c17..fdbbe306bd90 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index 682d3b16d216..f41a63c2b368 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index e6a643e42139..eff37d156a26 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index a2d09f200047..e557a82377da 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index ab5ab7adb19c..cd1d3e768bcf 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index dae845b119b1..a314acfc71ff 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 2a99bc42e119..7af2359bdcf2 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 8da7e86d8755..ef50dfd5916d 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index c76f83ce4def..de0157546ac6 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 056db7167280..bd7f315f8dec 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 56e3a1e646db..df793ad689a5 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index dadfcb80941e..18247f875e56 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index df4c1e9557a9..e9fd6da5f3c7 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index 01c31de8730b..fc8b5f20c2b4 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version( "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt" diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index 296c70549bda..4568f06bf44e 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index b35d761d8a6a..9a4faf78d84d 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index a78a5d89e19f..e83d8156e7cf 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -62,7 +62,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 92a10990d160..0c6f36fd5c8c 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -53,7 +53,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index a8f2de825cc2..a5d942d2d74b 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 1afb72cf1098..bd62dbb615b7 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -56,7 +56,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/setup.py b/setup.py index 1a4d94c24e8c..c6d12f87b78c 100644 --- a/setup.py +++ b/setup.py @@ -437,7 +437,7 @@ def run(self): setup( name="transformers", - version="4.48.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.48.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 2b4980306c53..d0d8babcb242 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). -__version__ = "4.48.0.dev0" +__version__ = "4.48.0" from typing import TYPE_CHECKING diff --git a/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index df2a22610187..000000000000 --- a/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ALBERT checkpoint.""" - -import argparse - -import torch - -from ...utils import logging -from . import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): - # Initialise PyTorch model - config = AlbertConfig.from_json_file(albert_config_file) - print(f"Building PyTorch model from configuration: {config}") - model = AlbertForPreTraining(config) - - # Load weights from tf checkpoint - load_tf_weights_in_albert(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--albert_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained ALBERT model. \n" - "This specifies the model architecture." 
- ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/align/convert_align_tf_to_hf.py b/src/transformers/models/align/convert_align_tf_to_hf.py deleted file mode 100644 index 610db8482f91..000000000000 --- a/src/transformers/models/align/convert_align_tf_to_hf.py +++ /dev/null @@ -1,389 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ALIGN checkpoints from the original repository.""" - -import argparse -import os - -import align -import numpy as np -import requests -import tensorflow as tf -import torch -from PIL import Image -from tokenizer import Tokenizer - -from transformers import ( - AlignConfig, - AlignModel, - AlignProcessor, - BertConfig, - BertTokenizer, - EfficientNetConfig, - EfficientNetImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def preprocess(image): - image = tf.image.resize(image, (346, 346)) - image = tf.image.crop_to_bounding_box(image, (346 - 289) // 2, (346 - 289) // 2, 289, 289) - return image - - -def get_align_config(): - vision_config = EfficientNetConfig.from_pretrained("google/efficientnet-b7") - vision_config.image_size = 289 - vision_config.hidden_dim = 640 - vision_config.id2label = {"0": "LABEL_0", "1": "LABEL_1"} - vision_config.label2id = {"LABEL_0": 0, "LABEL_1": 1} - vision_config.depthwise_padding = [] - - text_config = BertConfig() - config = AlignConfig.from_text_vision_configs( - text_config=text_config, vision_config=vision_config, projection_dim=640 - ) - return config - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def get_processor(): - image_processor = EfficientNetImageProcessor( - do_center_crop=True, - rescale_factor=1 / 127.5, - rescale_offset=True, - do_normalize=False, - include_top=False, - resample=Image.BILINEAR, - ) - tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") - tokenizer.model_max_length = 64 - processor = AlignProcessor(image_processor=image_processor, tokenizer=tokenizer) - return processor - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def rename_keys(original_param_names): - # EfficientNet image encoder - block_names = [v.split("_")[0].split("block")[1] for v in original_param_names if v.startswith("block")] - block_names = list(set(block_names)) - block_names = sorted(block_names) - num_blocks = len(block_names) - block_name_mapping = {b: str(i) for b, i in zip(block_names, range(num_blocks))} - - rename_keys = [] - rename_keys.append(("stem_conv/kernel:0", 
"embeddings.convolution.weight")) - rename_keys.append(("stem_bn/gamma:0", "embeddings.batchnorm.weight")) - rename_keys.append(("stem_bn/beta:0", "embeddings.batchnorm.bias")) - rename_keys.append(("stem_bn/moving_mean:0", "embeddings.batchnorm.running_mean")) - rename_keys.append(("stem_bn/moving_variance:0", "embeddings.batchnorm.running_var")) - - for b in block_names: - hf_b = block_name_mapping[b] - rename_keys.append((f"block{b}_expand_conv/kernel:0", f"encoder.blocks.{hf_b}.expansion.expand_conv.weight")) - rename_keys.append((f"block{b}_expand_bn/gamma:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.weight")) - rename_keys.append((f"block{b}_expand_bn/beta:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.bias")) - rename_keys.append( - (f"block{b}_expand_bn/moving_mean:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_mean") - ) - rename_keys.append( - (f"block{b}_expand_bn/moving_variance:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_var") - ) - rename_keys.append( - (f"block{b}_dwconv/depthwise_kernel:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_conv.weight") - ) - rename_keys.append((f"block{b}_bn/gamma:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.weight")) - rename_keys.append((f"block{b}_bn/beta:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.bias")) - rename_keys.append( - (f"block{b}_bn/moving_mean:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_mean") - ) - rename_keys.append( - (f"block{b}_bn/moving_variance:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_var") - ) - - rename_keys.append((f"block{b}_se_reduce/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.weight")) - rename_keys.append((f"block{b}_se_reduce/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.bias")) - rename_keys.append((f"block{b}_se_expand/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.weight")) - rename_keys.append((f"block{b}_se_expand/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.bias")) - rename_keys.append( - (f"block{b}_project_conv/kernel:0", f"encoder.blocks.{hf_b}.projection.project_conv.weight") - ) - rename_keys.append((f"block{b}_project_bn/gamma:0", f"encoder.blocks.{hf_b}.projection.project_bn.weight")) - rename_keys.append((f"block{b}_project_bn/beta:0", f"encoder.blocks.{hf_b}.projection.project_bn.bias")) - rename_keys.append( - (f"block{b}_project_bn/moving_mean:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_mean") - ) - rename_keys.append( - (f"block{b}_project_bn/moving_variance:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_var") - ) - - key_mapping = {} - for item in rename_keys: - if item[0] in original_param_names: - key_mapping[item[0]] = "vision_model." 
+ item[1] - - # BERT text encoder - rename_keys = [] - old = "tf_bert_model/bert" - new = "text_model" - for i in range(12): - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/query/kernel:0", - f"{new}.encoder.layer.{i}.attention.self.query.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/query/bias:0", - f"{new}.encoder.layer.{i}.attention.self.query.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/key/kernel:0", - f"{new}.encoder.layer.{i}.attention.self.key.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/key/bias:0", - f"{new}.encoder.layer.{i}.attention.self.key.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/value/kernel:0", - f"{new}.encoder.layer.{i}.attention.self.value.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/value/bias:0", - f"{new}.encoder.layer.{i}.attention.self.value.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/output/dense/kernel:0", - f"{new}.encoder.layer.{i}.attention.output.dense.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/output/dense/bias:0", - f"{new}.encoder.layer.{i}.attention.output.dense.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/output/LayerNorm/gamma:0", - f"{new}.encoder.layer.{i}.attention.output.LayerNorm.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/output/LayerNorm/beta:0", - f"{new}.encoder.layer.{i}.attention.output.LayerNorm.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/intermediate/dense/kernel:0", - f"{new}.encoder.layer.{i}.intermediate.dense.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/intermediate/dense/bias:0", - f"{new}.encoder.layer.{i}.intermediate.dense.bias", - ) - ) - rename_keys.append( - (f"{old}/encoder/layer_._{i}/output/dense/kernel:0", f"{new}.encoder.layer.{i}.output.dense.weight") - ) - rename_keys.append( - (f"{old}/encoder/layer_._{i}/output/dense/bias:0", f"{new}.encoder.layer.{i}.output.dense.bias") - ) - rename_keys.append( - (f"{old}/encoder/layer_._{i}/output/LayerNorm/gamma:0", f"{new}.encoder.layer.{i}.output.LayerNorm.weight") - ) - rename_keys.append( - (f"{old}/encoder/layer_._{i}/output/LayerNorm/beta:0", f"{new}.encoder.layer.{i}.output.LayerNorm.bias") - ) - - rename_keys.append((f"{old}/embeddings/word_embeddings/weight:0", f"{new}.embeddings.word_embeddings.weight")) - rename_keys.append( - (f"{old}/embeddings/position_embeddings/embeddings:0", f"{new}.embeddings.position_embeddings.weight") - ) - rename_keys.append( - (f"{old}/embeddings/token_type_embeddings/embeddings:0", f"{new}.embeddings.token_type_embeddings.weight") - ) - rename_keys.append((f"{old}/embeddings/LayerNorm/gamma:0", f"{new}.embeddings.LayerNorm.weight")) - rename_keys.append((f"{old}/embeddings/LayerNorm/beta:0", f"{new}.embeddings.LayerNorm.bias")) - - rename_keys.append((f"{old}/pooler/dense/kernel:0", f"{new}.pooler.dense.weight")) - rename_keys.append((f"{old}/pooler/dense/bias:0", f"{new}.pooler.dense.bias")) - rename_keys.append(("dense/kernel:0", "text_projection.weight")) - rename_keys.append(("dense/bias:0", "text_projection.bias")) - rename_keys.append(("dense/bias:0", "text_projection.bias")) - rename_keys.append(("temperature:0", "temperature")) - - for item in rename_keys: - if item[0] in original_param_names: - 
key_mapping[item[0]] = item[1] - return key_mapping - - -def replace_params(hf_params, tf_params, key_mapping): - list(hf_params.keys()) - - for key, value in tf_params.items(): - if key not in key_mapping: - continue - - hf_key = key_mapping[key] - if "_conv" in key and "kernel" in key: - new_hf_value = torch.from_numpy(value).permute(3, 2, 0, 1) - elif "embeddings" in key: - new_hf_value = torch.from_numpy(value) - elif "depthwise_kernel" in key: - new_hf_value = torch.from_numpy(value).permute(2, 3, 0, 1) - elif "kernel" in key: - new_hf_value = torch.from_numpy(np.transpose(value)) - elif "temperature" in key: - new_hf_value = value - elif "bn/gamma" or "bn/beta" in key: - new_hf_value = torch.from_numpy(np.transpose(value)).squeeze() - else: - new_hf_value = torch.from_numpy(value) - - # Replace HF parameters with original TF model parameters - hf_params[hf_key].copy_(new_hf_value) - - -@torch.no_grad() -def convert_align_checkpoint(checkpoint_path, pytorch_dump_folder_path, save_model, push_to_hub): - """ - Copy/paste/tweak model's weights to our ALIGN structure. - """ - # Load original model - seq_length = 64 - tok = Tokenizer(seq_length) - original_model = align.Align("efficientnet-b7", "bert-base", 640, seq_length, tok.get_vocab_size()) - original_model.compile() - original_model.load_weights(checkpoint_path) - - tf_params = original_model.trainable_variables - tf_non_train_params = original_model.non_trainable_variables - tf_params = {param.name: param.numpy() for param in tf_params} - for param in tf_non_train_params: - tf_params[param.name] = param.numpy() - tf_param_names = list(tf_params.keys()) - - # Load HuggingFace model - config = get_align_config() - hf_model = AlignModel(config).eval() - hf_params = hf_model.state_dict() - - # Create src-to-dst parameter name mapping dictionary - print("Converting parameters...") - key_mapping = rename_keys(tf_param_names) - replace_params(hf_params, tf_params, key_mapping) - - # Initialize processor - processor = get_processor() - inputs = processor( - images=prepare_img(), text="A picture of a cat", padding="max_length", max_length=64, return_tensors="pt" - ) - - # HF model inference - hf_model.eval() - with torch.no_grad(): - outputs = hf_model(**inputs) - - hf_image_features = outputs.image_embeds.detach().numpy() - hf_text_features = outputs.text_embeds.detach().numpy() - - # Original model inference - original_model.trainable = False - tf_image_processor = EfficientNetImageProcessor( - do_center_crop=True, - do_rescale=False, - do_normalize=False, - include_top=False, - resample=Image.BILINEAR, - ) - image = tf_image_processor(images=prepare_img(), return_tensors="tf", data_format="channels_last")["pixel_values"] - text = tok(tf.constant(["A picture of a cat"])) - - image_features = original_model.image_encoder(image, training=False) - text_features = original_model.text_encoder(text, training=False) - - image_features = tf.nn.l2_normalize(image_features, axis=-1) - text_features = tf.nn.l2_normalize(text_features, axis=-1) - - # Check whether original and HF model outputs match -> np.allclose - if not np.allclose(image_features, hf_image_features, atol=1e-3): - raise ValueError("The predicted image features are not the same.") - if not np.allclose(text_features, hf_text_features, atol=1e-3): - raise ValueError("The predicted text features are not the same.") - print("Model outputs match!") - - if save_model: - # Create folder to save model - if not os.path.isdir(pytorch_dump_folder_path): - os.mkdir(pytorch_dump_folder_path) - # 
Save converted model and image processor - hf_model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - # Push model and image processor to hub - print("Pushing converted ALIGN to the hub...") - processor.push_to_hub("align-base") - hf_model.push_to_hub("align-base") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_path", - default="./weights/model-weights", - type=str, - help="Path to the pretrained TF ALIGN checkpoint.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="hf_model", - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--save_model", action="store_true", help="Save model to local") - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub") - - args = parser.parse_args() - convert_align_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub) diff --git a/src/transformers/models/aria/convert_aria_weights_to_hf.py b/src/transformers/models/aria/convert_aria_weights_to_hf.py deleted file mode 100644 index dcc9e4d13976..000000000000 --- a/src/transformers/models/aria/convert_aria_weights_to_hf.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import glob - -import torch -from huggingface_hub import snapshot_download -from safetensors import safe_open - -from transformers import ( - AddedToken, - AriaForConditionalGeneration, - AriaProcessor, - AutoConfig, - AutoTokenizer, -) - - -EPILOG_TXT = """Example: - python transformers/src/transformers/models/aria/convert_aria_weights_to_hf.py --text_model_id rhymes-ai/Aria --vision_model_id rhymes-ai/Aria --output_hub_path m-ric/Aria_hf_2 --old_state_dict_id rhymes-ai/Aria - -Example for creating the old state dict file with Python: - - import torch - from aria.model.language_model.aria_llama import AriaTextForCausalLM - - # load model - kwargs = {"device_map": "auto", "torch_dtype": torch.float16} - model = AriaTextForCausalLM.from_pretrained("rhymes-ai/Aria", low_cpu_mem_usage=True, **kwargs) - - # load vision tower - model.get_vision_tower().load_model() - - # Save state dict - torch.save(model.state_dict(), "tmp/hf_models/aria/model_state_dict.bin") -""" - -KEYS_TO_MODIFY_MAPPING = { - "vision_tower.vision_model": "vision_tower", - "ln_ffn": "layer_norm", - "ffn": "feed_forward", - "ln_kv": "layer_norm_kv", -} - - -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - return original_state_dict - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value - new_state_dict["vision_tower.post_layernorm.weight"] = torch.zeros((1152,)) - new_state_dict["vision_tower.post_layernorm.bias"] = torch.zeros((1152,)) - - return new_state_dict - - -def convert_aria_llama_to_hf(text_model_id, vision_model_id, output_hub_path, old_state_dict_id): - torch.set_default_dtype(torch.float16) - - tokenizer = AutoTokenizer.from_pretrained( - text_model_id, - extra_special_tokens={ - "image_token": "<|img|>", - "pad_token": "", - }, - ) - tokenizer.add_tokens(AddedToken("<|img|>", special=True, normalized=False), special_tokens=True) - tokenizer.add_special_tokens({"pad_token": ""}) - tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}{% elif message['content'] is iterable %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<|img|>{% endif %}{% endfor %}{% endif %}<|im_end|>\n{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" - - processor = AriaProcessor.from_pretrained( - text_model_id, - tokenizer=tokenizer, - ) - - config = AutoConfig.from_pretrained(text_model_id) - config.vision_config.hidden_size = 1152 - config.vision_config.attention_heads = 16 - config.pad_token_id = 2 - config.image_token_index = 9 - config.intermediate_size = config.moe_intermediate_size - config.auto_map = { - "AutoConfig": "modeling_aria.AriaConfig", - "AutoModelForCausalLM": "modeling_aria.AriaForConditionalGeneration", - } - - with 
torch.device("meta"): - model = AriaForConditionalGeneration(config) - - state_dict = load_original_state_dict(old_state_dict_id) - - state_dict = convert_state_dict_to_hf(state_dict) - model.load_state_dict(state_dict, strict=False, assign=True) - - # print("Saving models") - # model.save_pretrained("local_aria", safe_serialization=False) - # processor.save_pretrained("local_aria") - print("Pushing to hub") - model.push_to_hub(output_hub_path, create_pr=True) - processor.push_to_hub(output_hub_path, create_pr=True) - - -def main(): - parser = argparse.ArgumentParser( - epilog=EPILOG_TXT, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--text_model_id", - default="rhymes-ai/Aria", - help="Hub location of the text model", - ) - parser.add_argument( - "--vision_model_id", - default="rhymes-ai/Aria", - help="Hub location of the vision model", - ) - parser.add_argument( - "--output_hub_path", - default="rhymes-ai/Aria", - help="Location on the hub of the converted model", - ) - parser.add_argument( - "--old_state_dict_id", - default="rhymes-ai/Aria", - help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`", - ) - args = parser.parse_args() - convert_aria_llama_to_hf(args.text_model_id, args.vision_model_id, args.output_hub_path, args.old_state_dict_id) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py deleted file mode 100644 index d211ef7ab058..000000000000 --- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py +++ /dev/null @@ -1,279 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Audio Spectrogram Transformer checkpoints from the original repository. 
URL: https://github.com/YuanGongND/ast""" - -import argparse -import json -from pathlib import Path - -import torch -import torchaudio -from datasets import load_dataset -from huggingface_hub import hf_hub_download - -from transformers import ASTConfig, ASTFeatureExtractor, ASTForAudioClassification -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_audio_spectrogram_transformer_config(model_name): - config = ASTConfig() - - if "10-10" in model_name: - pass - elif "speech-commands" in model_name: - config.max_length = 128 - elif "12-12" in model_name: - config.time_stride = 12 - config.frequency_stride = 12 - elif "14-14" in model_name: - config.time_stride = 14 - config.frequency_stride = 14 - elif "16-16" in model_name: - config.time_stride = 16 - config.frequency_stride = 16 - else: - raise ValueError("Model not supported") - - repo_id = "huggingface/label-files" - if "speech-commands" in model_name: - config.num_labels = 35 - filename = "speech-commands-v2-id2label.json" - else: - config.num_labels = 527 - filename = "audioset-id2label.json" - - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config - - -def rename_key(name): - if "module.v" in name: - name = name.replace("module.v", "audio_spectrogram_transformer") - if "cls_token" in name: - name = name.replace("cls_token", "embeddings.cls_token") - if "dist_token" in name: - name = name.replace("dist_token", "embeddings.distillation_token") - if "pos_embed" in name: - name = name.replace("pos_embed", "embeddings.position_embeddings") - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") - # transformer blocks - if "blocks" in name: - name = name.replace("blocks", "encoder.layer") - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if "attn" in name: - name = name.replace("attn", "attention.self") - if "norm1" in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name: - name = name.replace("norm2", "layernorm_after") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") - # final layernorm - if "audio_spectrogram_transformer.norm" in name: - name = name.replace("audio_spectrogram_transformer.norm", "audio_spectrogram_transformer.layernorm") - # classifier head - if "module.mlp_head.0" in name: - name = name.replace("module.mlp_head.0", "classifier.layernorm") - if "module.mlp_head.1" in name: - name = name.replace("module.mlp_head.1", "classifier.dense") - - return name - - -def convert_state_dict(orig_state_dict, config): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - key_split = key.split(".") - layer_num = int(key_split[3]) - dim = config.hidden_size - if "weight" in key: - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.weight" - ] = val[:dim, :] - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.weight" - ] = val[dim : dim * 2, :] - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.weight" - ] = val[-dim:, :] - else: - orig_state_dict[ - 
f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.bias" - ] = val[:dim] - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.bias" - ] = val[dim : dim * 2] - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.bias" - ] = val[-dim:] - else: - orig_state_dict[rename_key(key)] = val - - return orig_state_dict - - -def remove_keys(state_dict): - ignore_keys = [ - "module.v.head.weight", - "module.v.head.bias", - "module.v.head_dist.weight", - "module.v.head_dist.bias", - ] - for k in ignore_keys: - state_dict.pop(k, None) - - -@torch.no_grad() -def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our Audio Spectrogram Transformer structure. - """ - config = get_audio_spectrogram_transformer_config(model_name) - - model_name_to_url = { - "ast-finetuned-audioset-10-10-0.4593": ( - "https://www.dropbox.com/s/ca0b1v2nlxzyeb4/audioset_10_10_0.4593.pth?dl=1" - ), - "ast-finetuned-audioset-10-10-0.450": ( - "https://www.dropbox.com/s/1tv0hovue1bxupk/audioset_10_10_0.4495.pth?dl=1" - ), - "ast-finetuned-audioset-10-10-0.448": ( - "https://www.dropbox.com/s/6u5sikl4b9wo4u5/audioset_10_10_0.4483.pth?dl=1" - ), - "ast-finetuned-audioset-10-10-0.448-v2": ( - "https://www.dropbox.com/s/kt6i0v9fvfm1mbq/audioset_10_10_0.4475.pth?dl=1" - ), - "ast-finetuned-audioset-12-12-0.447": ( - "https://www.dropbox.com/s/snfhx3tizr4nuc8/audioset_12_12_0.4467.pth?dl=1" - ), - "ast-finetuned-audioset-14-14-0.443": ( - "https://www.dropbox.com/s/z18s6pemtnxm4k7/audioset_14_14_0.4431.pth?dl=1" - ), - "ast-finetuned-audioset-16-16-0.442": ( - "https://www.dropbox.com/s/mdsa4t1xmcimia6/audioset_16_16_0.4422.pth?dl=1" - ), - "ast-finetuned-speech-commands-v2": ( - "https://www.dropbox.com/s/q0tbqpwv44pquwy/speechcommands_10_10_0.9812.pth?dl=1" - ), - } - - # load original state_dict - checkpoint_url = model_name_to_url[model_name] - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - # remove some keys - remove_keys(state_dict) - # rename some keys - new_state_dict = convert_state_dict(state_dict, config) - - # load đŸ€— model - model = ASTForAudioClassification(config) - model.eval() - - model.load_state_dict(new_state_dict) - - # verify outputs on dummy input - # source: https://github.com/YuanGongND/ast/blob/79e873b8a54d0a3b330dd522584ff2b9926cd581/src/run.py#L62 - mean = -4.2677393 if "speech-commands" not in model_name else -6.845978 - std = 4.5689974 if "speech-commands" not in model_name else 5.5654526 - max_length = 1024 if "speech-commands" not in model_name else 128 - feature_extractor = ASTFeatureExtractor(mean=mean, std=std, max_length=max_length) - - if "speech-commands" in model_name: - # TODO: Convert dataset to Parquet - dataset = load_dataset("google/speech_commands", "v0.02", split="validation", trust_remote_code=True) - waveform = dataset[0]["audio"]["array"] - else: - filepath = hf_hub_download( - repo_id="nielsr/audio-spectogram-transformer-checkpoint", - filename="sample_audio.flac", - repo_type="dataset", - ) - - waveform, _ = torchaudio.load(filepath) - waveform = waveform.squeeze().numpy() - - inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt") - - # forward pass - outputs = model(**inputs) - logits = outputs.logits - - if model_name == "ast-finetuned-audioset-10-10-0.4593": - expected_slice = 
torch.tensor([-0.8760, -7.0042, -8.6602]) - elif model_name == "ast-finetuned-audioset-10-10-0.450": - expected_slice = torch.tensor([-1.1986, -7.0903, -8.2718]) - elif model_name == "ast-finetuned-audioset-10-10-0.448": - expected_slice = torch.tensor([-2.6128, -8.0080, -9.4344]) - elif model_name == "ast-finetuned-audioset-10-10-0.448-v2": - expected_slice = torch.tensor([-1.5080, -7.4534, -8.8917]) - elif model_name == "ast-finetuned-audioset-12-12-0.447": - expected_slice = torch.tensor([-0.5050, -6.5833, -8.0843]) - elif model_name == "ast-finetuned-audioset-14-14-0.443": - expected_slice = torch.tensor([-0.3826, -7.0336, -8.2413]) - elif model_name == "ast-finetuned-audioset-16-16-0.442": - expected_slice = torch.tensor([-1.2113, -6.9101, -8.3470]) - elif model_name == "ast-finetuned-speech-commands-v2": - expected_slice = torch.tensor([6.1589, -8.0566, -8.7984]) - else: - raise ValueError("Unknown model name") - if not torch.allclose(logits[0, :3], expected_slice, atol=1e-4): - raise ValueError("Logits don't match") - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving feature extractor to {pytorch_dump_folder_path}") - feature_extractor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and feature extractor to the hub...") - model.push_to_hub(f"MIT/{model_name}") - feature_extractor.push_to_hub(f"MIT/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="ast-finetuned-audioset-10-10-0.4593", - type=str, - help="Name of the Audio Spectrogram Transformer model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - - args = parser.parse_args() - convert_audio_spectrogram_transformer_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py b/src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py deleted file mode 100644 index a7b8cfc78290..000000000000 --- a/src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py +++ /dev/null @@ -1,273 +0,0 @@ -# coding=utf-8 -# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""This script can be used to convert checkpoints provided in the `mamba_ssm` library into the format provided in HuggingFace `transformers`. 
It depends on the `mamba2_ssm` package to be installed.""" - -import argparse -import json -import os -import re -from os import path -from typing import Dict, Union - -import torch -from huggingface_hub import split_torch_state_dict_into_shards -from safetensors.torch import save_file - -from transformers import AutoTokenizer -from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME - -from .configuration_bamba import BambaConfig - - -def convert_state_dict_from_mamba_ssm(original_sd: Dict) -> Dict[str, torch.Tensor]: - state_dict = {} - - for orig_k, param in original_sd.items(): - k = orig_k.replace("backbone", "model") - - # for embeddings - k = k.replace("embedding", "embed_tokens") - - # for mixer - k = k.replace("mixer", "mamba") - - # for final layernorm - k = k.replace("norm_f", "final_layernorm") - - # for block layernorm - k = re.sub(r"(\d+)\.norm\.", r"\1.input_layernorm.", k) - k = re.sub(r"(\d+)\.norm2\.", r"\1.pre_ff_layernorm.", k) - - # for mlp - k = k.replace("mlp.fc2", "feed_forward.down_proj") - - if "mlp.fc1" in k: - param, param2 = torch.chunk(param, 2, dim=0) - k2 = k.replace("mlp.fc1", "feed_forward.gate_proj") - state_dict[k2] = param2 - k = k.replace("mlp.fc1", "feed_forward.up_proj") - - if ("in_proj" in k and orig_k.replace("in_proj", "conv1d") in original_sd) or ( - "out_proj" in k and orig_k.replace("out_proj", "conv1d") in original_sd - ): - # then this must be a mamba - pass - else: - # for attn - # - because mixer was replaced to mamba above - k = k.replace("mamba.out_proj", "self_attn.o_proj") - if "mamba.in_proj" in k: - m, n = param.shape - d = (m - n) // 2 - param, param2, param3 = torch.split(param, [n, d, d], dim=0) - k2 = k.replace("mamba.in_proj", "self_attn.k_proj") - state_dict[k2] = param2 - k2 = k.replace("mamba.in_proj", "self_attn.v_proj") - state_dict[k2] = param3 - k = k.replace("mamba.in_proj", "self_attn.q_proj") - - state_dict[k] = param - - return state_dict - - -# Adapted from transformers.models.mamba.convert_mamba_ssm_checkpoint_to_pytorch.py -def convert_ssm_config_to_hf_config( - config_ssm: Dict, - **kwargs, -) -> BambaConfig: - """Convert a config from mamba_ssm to a BambaConfig from here.""" - hf_config: BambaConfig = BambaConfig(**kwargs) - - hf_config.architectures = ["BambaForCausalLM"] - - # Set important values from config and recalculate other resulting entries - hf_config.hidden_size = config_ssm["d_model"] - hf_config.intermediate_size = config_ssm["d_intermediate"] - hf_config.mamba_n_heads = (hf_config.hidden_size * hf_config.mamba_expand) // hf_config.mamba_d_head - hf_config.num_hidden_layers = config_ssm["n_layer"] - hf_config.tie_word_embeddings = config_ssm["tie_embeddings"] - - # currently this script assumes config_ssm belongs to v2 - if config_ssm["ssm_cfg"].get("layer") != "Mamba2": - raise ValueError("Conversion script only supports Mamba2") - - # Set attention values - attn_cfg = config_ssm.get("attn_cfg") - if attn_cfg: - assert attn_cfg["causal"], "Only support non-causal attention." - assert not attn_cfg["qkv_proj_bias"], "Only support no qkv bias." - assert not attn_cfg["out_proj_bias"], "Only support no out bias." 
- hf_config.attn_rotary_emb = attn_cfg["rotary_emb_dim"] - hf_config.num_attention_heads = attn_cfg["num_heads"] - hf_config.num_key_value_heads = attn_cfg["num_heads_kv"] - - attention_layer_indices = config_ssm.get("attn_layer_idx") - if attention_layer_indices: - hf_config.attn_layer_indices = attention_layer_indices - - # Padded vocab size, mostly of 16 but 32 is also very common in different models - vocab_size = config_ssm["vocab_size"] - pad_vocab_size_multiple = config_ssm["pad_vocab_size_multiple"] - if (vocab_size % pad_vocab_size_multiple) != 0: - vocab_size += pad_vocab_size_multiple - (vocab_size % pad_vocab_size_multiple) - hf_config.vocab_size = vocab_size - - return hf_config - - -def save_single_safetensor( - state_dict: Dict, - save_directory: str, - metadata: Dict, -): - save_file( - state_dict, - os.path.join(save_directory, SAFE_WEIGHTS_NAME), - metadata, - ) - - -def save_sharded_safetensors( - state_dict: Dict, - save_directory: str, - metadata: Dict, - max_shard_size: Union[int, str] = "5GB", -): - filename_pattern = SAFE_WEIGHTS_NAME.replace(".bin", "{suffix}.bin").replace( - ".safetensors", "{suffix}.safetensors" - ) - state_dict_split = split_torch_state_dict_into_shards( - state_dict, filename_pattern=filename_pattern, max_shard_size=max_shard_size - ) - index = { - "metadata": state_dict_split.metadata, - "weight_map": state_dict_split.tensor_to_filename, - } - # Save the index - with open(os.path.join(save_directory, SAFE_WEIGHTS_INDEX_NAME), "w", encoding="utf-8") as f: - content = json.dumps(index, indent=2, sort_keys=True) + "\n" - f.write(content) - - filename_to_tensors = state_dict_split.filename_to_tensors.items() - for shard_file, tensors in filename_to_tensors: - shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors} - save_file(shard, os.path.join(save_directory, shard_file), metadata=metadata) - - -# Adapted from transformers.models.mamba.convert_mamba_ssm_checkpoint_to_pytorch.py -def convert_mamba_ssm_checkpoint_file_to_huggingface_model_file( - mamba_ssm_checkpoint_path: str, - precision: str, - output_dir: str, - tokenizer_path: str = None, - save_model: Union[bool, str] = True, -) -> None: - # load tokenizer if provided, this will be used to set the - # token_ids in the config file - token_ids = {} - if tokenizer_path: - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - for key in [ - "bos_token_id", - "eos_token_id", - "pad_token_id", - ]: - id = getattr(tokenizer, key, None) - if id: - token_ids[key] = id - - # there are some configs unsettable by mamba_ssn config, so - # if there are changes from the defaults, have to pass them into - # the function - unsettables = { - "mamba_d_head": 64, - "mamba_d_state": 128, - "mamba_n_groups": 1, - "rms_norm_eps": 1e-5, - } - - # Load and save config based on name - config_path = path.join(mamba_ssm_checkpoint_path, "config.json") - with open(config_path, "r", encoding="utf-8") as json_file: - config = json.load(json_file) - - # convert the config - hf_config = convert_ssm_config_to_hf_config( - config_ssm=config, - **token_ids, - **unsettables, - ) - hf_config.save_pretrained(output_dir) - - # Load state dict of the original model and transfer to hf model - state_dict = torch.load( - path.join(mamba_ssm_checkpoint_path, "pytorch_model.bin"), - map_location="cpu", - weights_only=True, - ) - # FIXME: allow other parameters to pass in - state_dict = convert_state_dict_from_mamba_ssm(state_dict) - - # Save new model to pytorch_dump_path - dtype = torch.float32 if precision == 
"fp32" else (torch.bfloat16 if precision == "bf16" else torch.float16) - - save_file_fn = None - if isinstance(save_model, bool) and save_model: - save_file_fn = save_single_safetensor - elif isinstance(save_model, str) and save_model == "sharded": - save_file_fn = save_sharded_safetensors - - if save_file_fn: - save_file_fn({k: v.to(dtype) for k, v in state_dict.items()}, output_dir, metadata={"format": "pt"}) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", - "--mamba_ssm_checkpoint_directory", - type=str, - required=True, - help="Path to a directory containing the `pytorch_model.bin` mamba_ssm checkpoint file to be converted.", - ) - parser.add_argument( - "-p", - "--precision", - type=str, - default="fp16", - const="fp16", - required=True, - choices=("fp32", "fp16", "bf16"), - help="The precision the model will be saved in. Select from fp32, fp16 or bf16.", - ) - parser.add_argument( - "-o", "--output_dir", type=str, required=True, help="Path to directory to save the converted output model to." - ) - parser.add_argument( - "-t", - "--tokenizer_model_path", - type=str, - default=None, - required=False, - help="Path to a the tokenizer file.", - ) - args = parser.parse_args() - - convert_mamba_ssm_checkpoint_file_to_huggingface_model_file( - args.mamba2_checkpoint_directory, - args.precision, - args.output_dir, - ) diff --git a/src/transformers/models/bark/convert_suno_to_hf.py b/src/transformers/models/bark/convert_suno_to_hf.py deleted file mode 100644 index 880debe60ae4..000000000000 --- a/src/transformers/models/bark/convert_suno_to_hf.py +++ /dev/null @@ -1,263 +0,0 @@ -"""Convert Bark checkpoint.""" - -import argparse -import os -from pathlib import Path - -import torch -from bark.generation import _load_model as _bark_load_model -from huggingface_hub import hf_hub_download - -from transformers import EncodecConfig, EncodecModel, set_seed -from transformers.models.bark.configuration_bark import ( - BarkCoarseConfig, - BarkConfig, - BarkFineConfig, - BarkSemanticConfig, -) -from transformers.models.bark.generation_configuration_bark import ( - BarkCoarseGenerationConfig, - BarkFineGenerationConfig, - BarkGenerationConfig, - BarkSemanticGenerationConfig, -) -from transformers.models.bark.modeling_bark import BarkCoarseModel, BarkFineModel, BarkModel, BarkSemanticModel -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -set_seed(770) - - -new_layer_name_dict = { - "c_attn": "att_proj", - "c_proj": "out_proj", - "c_fc": "in_proj", - "transformer.": "", - "h.": "layers.", - "ln_1": "layernorm_1", - "ln_2": "layernorm_2", - "ln_f": "layernorm_final", - "wpe": "position_embeds_layer", - "wte": "input_embeds_layer", -} - - -REMOTE_MODEL_PATHS = { - "text_small": { - "repo_id": "suno/bark", - "file_name": "text.pt", - }, - "coarse_small": { - "repo_id": "suno/bark", - "file_name": "coarse.pt", - }, - "fine_small": { - "repo_id": "suno/bark", - "file_name": "fine.pt", - }, - "text": { - "repo_id": "suno/bark", - "file_name": "text_2.pt", - }, - "coarse": { - "repo_id": "suno/bark", - "file_name": "coarse_2.pt", - }, - "fine": { - "repo_id": "suno/bark", - "file_name": "fine_2.pt", - }, -} - -CUR_PATH = os.path.dirname(os.path.abspath(__file__)) -default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache") -CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "suno", "bark_v0") - - -def _get_ckpt_path(model_type, use_small=False): - key = model_type - if 
use_small: - key += "_small" - return os.path.join(CACHE_DIR, REMOTE_MODEL_PATHS[key]["file_name"]) - - -def _download(from_hf_path, file_name): - os.makedirs(CACHE_DIR, exist_ok=True) - hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=CACHE_DIR) - - -def _load_model(ckpt_path, device, use_small=False, model_type="text"): - if model_type == "text": - ModelClass = BarkSemanticModel - ConfigClass = BarkSemanticConfig - GenerationConfigClass = BarkSemanticGenerationConfig - elif model_type == "coarse": - ModelClass = BarkCoarseModel - ConfigClass = BarkCoarseConfig - GenerationConfigClass = BarkCoarseGenerationConfig - elif model_type == "fine": - ModelClass = BarkFineModel - ConfigClass = BarkFineConfig - GenerationConfigClass = BarkFineGenerationConfig - else: - raise NotImplementedError() - model_key = f"{model_type}_small" if use_small else model_type - model_info = REMOTE_MODEL_PATHS[model_key] - if not os.path.exists(ckpt_path): - logger.info(f"{model_type} model not found, downloading into `{CACHE_DIR}`.") - _download(model_info["repo_id"], model_info["file_name"]) - checkpoint = torch.load(ckpt_path, map_location=device) - # this is a hack - model_args = checkpoint["model_args"] - if "input_vocab_size" not in model_args: - model_args["input_vocab_size"] = model_args["vocab_size"] - model_args["output_vocab_size"] = model_args["vocab_size"] - del model_args["vocab_size"] - - # convert Bark model arguments to HF Bark model arguments - model_args["num_heads"] = model_args.pop("n_head") - model_args["hidden_size"] = model_args.pop("n_embd") - model_args["num_layers"] = model_args.pop("n_layer") - - model_config = ConfigClass(**checkpoint["model_args"]) - model = ModelClass(config=model_config) - model_generation_config = GenerationConfigClass() - - model.generation_config = model_generation_config - state_dict = checkpoint["model"] - # fixup checkpoint - unwanted_prefix = "_orig_mod." 
- for k, v in list(state_dict.items()): - if k.startswith(unwanted_prefix): - # replace part of the key with corresponding layer name in HF implementation - new_k = k[len(unwanted_prefix) :] - for old_layer_name in new_layer_name_dict: - new_k = new_k.replace(old_layer_name, new_layer_name_dict[old_layer_name]) - - state_dict[new_k] = state_dict.pop(k) - - extra_keys = set(state_dict.keys()) - set(model.state_dict().keys()) - extra_keys = {k for k in extra_keys if not k.endswith(".attn.bias")} - missing_keys = set(model.state_dict().keys()) - set(state_dict.keys()) - missing_keys = {k for k in missing_keys if not k.endswith(".attn.bias")} - if len(extra_keys) != 0: - raise ValueError(f"extra keys found: {extra_keys}") - if len(missing_keys) != 0: - raise ValueError(f"missing keys: {missing_keys}") - model.load_state_dict(state_dict, strict=False) - n_params = model.num_parameters(exclude_embeddings=True) - val_loss = checkpoint["best_val_loss"].item() - logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss") - model.eval() - model.to(device) - del checkpoint, state_dict - - return model - - -def load_model(pytorch_dump_folder_path, use_small=False, model_type="text"): - if model_type not in ("text", "coarse", "fine"): - raise NotImplementedError() - - device = "cpu" # do conversion on cpu - - ckpt_path = _get_ckpt_path(model_type, use_small=use_small) - model = _load_model(ckpt_path, device, model_type=model_type, use_small=use_small) - - # load bark initial model - bark_model = _bark_load_model(ckpt_path, "cpu", model_type=model_type, use_small=use_small) - - if model_type == "text": - bark_model = bark_model["model"] - - if model.num_parameters(exclude_embeddings=True) != bark_model.get_num_params(): - raise ValueError("initial and new models don't have the same number of parameters") - - # check if same output as the bark model - batch_size = 5 - sequence_length = 10 - - if model_type in ["text", "coarse"]: - vec = torch.randint(256, (batch_size, sequence_length), dtype=torch.int) - output_old_model = bark_model(vec)[0] - - output_new_model_total = model(vec) - - # take last logits - output_new_model = output_new_model_total.logits[:, [-1], :] - - else: - prediction_codeboook_channel = 3 - n_codes_total = 8 - vec = torch.randint(256, (batch_size, sequence_length, n_codes_total), dtype=torch.int) - - output_new_model_total = model(prediction_codeboook_channel, vec) - output_old_model = bark_model(prediction_codeboook_channel, vec) - - output_new_model = output_new_model_total.logits - - # output difference should come from the difference of self-attention implementation design - if output_new_model.shape != output_old_model.shape: - raise ValueError("initial and new outputs don't have the same shape") - if (output_new_model - output_old_model).abs().max().item() > 1e-3: - raise ValueError("initial and new outputs are not equal") - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - - -def load_whole_bark_model( - semantic_path, - coarse_path, - fine_path, - append_text, - hub_path, - folder_path, -): - pytorch_dump_folder_path = os.path.join(folder_path, append_text) - - semanticConfig = BarkSemanticConfig.from_pretrained(os.path.join(semantic_path, "config.json")) - coarseAcousticConfig = BarkCoarseConfig.from_pretrained(os.path.join(coarse_path, "config.json")) - fineAcousticConfig = BarkFineConfig.from_pretrained(os.path.join(fine_path, "config.json")) - codecConfig = 
EncodecConfig.from_pretrained("facebook/encodec_24khz") - - semantic = BarkSemanticModel.from_pretrained(semantic_path) - coarseAcoustic = BarkCoarseModel.from_pretrained(coarse_path) - fineAcoustic = BarkFineModel.from_pretrained(fine_path) - codec = EncodecModel.from_pretrained("facebook/encodec_24khz") - - bark_config = BarkConfig.from_sub_model_configs( - semanticConfig, coarseAcousticConfig, fineAcousticConfig, codecConfig - ) - - bark_generation_config = BarkGenerationConfig.from_sub_model_configs( - semantic.generation_config, coarseAcoustic.generation_config, fineAcoustic.generation_config - ) - - bark = BarkModel(bark_config) - - bark.semantic = semantic - bark.coarse_acoustics = coarseAcoustic - bark.fine_acoustics = fineAcoustic - bark.codec_model = codec - - bark.generation_config = bark_generation_config - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - bark.save_pretrained(pytorch_dump_folder_path, repo_id=hub_path, push_to_hub=True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - - parser.add_argument("model_type", type=str, help="text, coarse or fine.") - parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--is_small", action="store_true", help="convert the small version instead of the large.") - - args = parser.parse_args() - - load_model(args.pytorch_dump_folder_path, model_type=args.model_type, use_small=args.is_small) diff --git a/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index e694d96ca0df..000000000000 --- a/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BART checkpoint.""" - -import argparse -import os -from pathlib import Path - -import fairseq -import torch -from packaging import version -from torch import nn - -from transformers import ( - BartConfig, - BartForConditionalGeneration, - BartForSequenceClassification, - BartModel, - BartTokenizer, -) -from transformers.utils import logging - - -FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn", "bart_xsum/model.pt"] -extra_arch = {"bart.large": BartModel, "bart.large.mnli": BartForSequenceClassification} -if version.parse(fairseq.__version__) < version.parse("0.9.0"): - raise Exception("requires fairseq >= 0.9.0") - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_TEXT = " Hello world! 
cĂ©cĂ© herlolip" - -mnli_rename_keys = [ - ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"), - ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"), - ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"), - ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"), -] - - -def remove_ignore_keys_(state_dict): - ignore_keys = [ - "encoder.version", - "decoder.version", - "model.encoder.version", - "model.decoder.version", - "_float_tensor", - ] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def load_xsum_checkpoint(checkpoint_path): - """Checkpoint path should end in model.pt""" - sd = torch.load(checkpoint_path, map_location="cpu") - hub_interface = torch.hub.load("pytorch/fairseq", "bart.large.cnn").eval() - hub_interface.model.load_state_dict(sd["model"]) - return hub_interface - - -def make_linear_from_emb(emb): - vocab_size, emb_size = emb.weight.shape - lin_layer = nn.Linear(vocab_size, emb_size, bias=False) - lin_layer.weight.data = emb.weight.data - return lin_layer - - -@torch.no_grad() -def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None): - """ - Copy/paste/tweak model's weights to our BERT structure. - """ - if not os.path.exists(checkpoint_path): - bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval() - else: - bart = load_xsum_checkpoint(checkpoint_path) - - bart.model.upgrade_state_dict(bart.model.state_dict()) - if hf_checkpoint_name is None: - hf_checkpoint_name = checkpoint_path.replace(".", "-") - config = BartConfig.from_pretrained(hf_checkpoint_name) - tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0) - tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0) - if not torch.eq(tokens, tokens2).all(): - raise ValueError( - f"converted tokenizer and pretrained tokenizer returned different output: {tokens} != {tokens2}" - ) - - if checkpoint_path == "bart.large.mnli": - state_dict = bart.state_dict() - remove_ignore_keys_(state_dict) - state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"] - for src, dest in mnli_rename_keys: - rename_key(state_dict, src, dest) - model = BartForSequenceClassification(config).eval() - model.load_state_dict(state_dict) - fairseq_output = bart.predict("mnli", tokens, return_logits=True) - new_model_outputs = model(tokens)[0] # logits - else: # no classification heads to worry about - state_dict = bart.model.state_dict() - remove_ignore_keys_(state_dict) - state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] - fairseq_output = bart.extract_features(tokens) - if hf_checkpoint_name == "facebook/bart-large": - model = BartModel(config).eval() - model.load_state_dict(state_dict) - new_model_outputs = model(tokens).model[0] - else: - model = BartForConditionalGeneration(config).eval() # an existing summarization ckpt - model.model.load_state_dict(state_dict) - if hasattr(model, "lm_head"): - model.lm_head = make_linear_from_emb(model.model.shared) - new_model_outputs = model.model(tokens)[0] - - # Check results - if fairseq_output.shape != new_model_outputs.shape: - raise ValueError( - f"`fairseq_output` shape and `new_model_output` shape are different: {fairseq_output.shape=}, {new_model_outputs.shape}" - ) - if (fairseq_output != new_model_outputs).any().item(): - raise 
ValueError("Some values in `fairseq_output` are different from `new_model_outputs`") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem." - ) - parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--hf_config", default=None, type=str, help="Which huggingface architecture to use: bart-large-xsum" - ) - args = parser.parse_args() - convert_bart_checkpoint(args.fairseq_path, args.pytorch_dump_folder_path, hf_checkpoint_name=args.hf_config) diff --git a/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py b/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py deleted file mode 100644 index 46c72a97f495..000000000000 --- a/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py +++ /dev/null @@ -1,373 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BEiT checkpoints from the unilm repository.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from datasets import load_dataset -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import ( - BeitConfig, - BeitForImageClassification, - BeitForMaskedImageModeling, - BeitForSemanticSegmentation, - BeitImageProcessor, -) -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, has_lm_head=False, is_semantic=False): - prefix = "backbone." 
if is_semantic else "" - - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"{prefix}blocks.{i}.norm1.weight", f"beit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"beit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.weight", f"beit.encoder.layer.{i}.attention.output.dense.weight") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.bias", f"beit.encoder.layer.{i}.attention.output.dense.bias") - ) - rename_keys.append((f"{prefix}blocks.{i}.norm2.weight", f"beit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"beit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.weight", f"beit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.bias", f"beit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"beit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"beit.encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - (f"{prefix}cls_token", "beit.embeddings.cls_token"), - (f"{prefix}patch_embed.proj.weight", "beit.embeddings.patch_embeddings.projection.weight"), - (f"{prefix}patch_embed.proj.bias", "beit.embeddings.patch_embeddings.projection.bias"), - ] - ) - - if has_lm_head: - # mask token + shared relative position bias + layernorm - rename_keys.extend( - [ - ("mask_token", "beit.embeddings.mask_token"), - ( - "rel_pos_bias.relative_position_bias_table", - "beit.encoder.relative_position_bias.relative_position_bias_table", - ), - ( - "rel_pos_bias.relative_position_index", - "beit.encoder.relative_position_bias.relative_position_index", - ), - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ] - ) - elif is_semantic: - # semantic segmentation classification heads - rename_keys.extend( - [ - ("decode_head.conv_seg.weight", "decode_head.classifier.weight"), - ("decode_head.conv_seg.bias", "decode_head.classifier.bias"), - ("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"), - ("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"), - ] - ) - else: - # layernorm + classification head - rename_keys.extend( - [ - ("fc_norm.weight", "beit.pooler.layernorm.weight"), - ("fc_norm.bias", "beit.pooler.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False): - for i in range(config.num_hidden_layers): - prefix = "backbone." 
if is_semantic else "" - # queries, keys and values - in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight") - q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias") - - state_dict[f"beit.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.query.bias"] = q_bias - state_dict[f"beit.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.value.bias"] = v_bias - - # gamma_1 and gamma_2 - # we call them lambda because otherwise they are renamed when using .from_pretrained - gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1") - gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2") - - state_dict[f"beit.encoder.layer.{i}.lambda_1"] = gamma_1 - state_dict[f"beit.encoder.layer.{i}.lambda_2"] = gamma_2 - - # relative_position bias table + index - if not has_lm_head: - # each layer has its own relative position bias - table = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_bias_table") - index = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_index") - - state_dict[ - f"beit.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table" - ] = table - state_dict[ - f"beit.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index" - ] = index - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our BEiT structure. 
- """ - - # define default BEiT configuration - config = BeitConfig() - has_lm_head = False - is_semantic = False - repo_id = "huggingface/label-files" - # set config parameters based on URL - if checkpoint_url[-9:-4] == "pt22k": - # masked image modeling - config.use_shared_relative_position_bias = True - config.use_mask_token = True - has_lm_head = True - elif checkpoint_url[-9:-4] == "ft22k": - # intermediate fine-tuning on ImageNet-22k - config.use_relative_position_bias = True - config.num_labels = 21841 - filename = "imagenet-22k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - # this dataset contains 21843 labels but the model only has 21841 - # we delete the classes as mentioned in https://github.com/google-research/big_transfer/issues/18 - del id2label[9205] - del id2label[15027] - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - elif checkpoint_url[-8:-4] == "to1k": - # fine-tuning on ImageNet-1k - config.use_relative_position_bias = True - config.num_labels = 1000 - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - if "384" in checkpoint_url: - config.image_size = 384 - if "512" in checkpoint_url: - config.image_size = 512 - elif "ade20k" in checkpoint_url: - # fine-tuning - config.use_relative_position_bias = True - config.num_labels = 150 - filename = "ade20k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.image_size = 640 - is_semantic = True - else: - raise ValueError("Checkpoint not supported, URL should either end with 'pt22k', 'ft22k', 'to1k' or 'ade20k'") - - # size of the architecture - if "base" in checkpoint_url: - pass - elif "large" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - if "ade20k" in checkpoint_url: - config.image_size = 640 - config.out_indices = [7, 11, 15, 23] - else: - raise ValueError("Should either find 'base' or 'large' in checkpoint URL") - - # load state_dict of original model, remove and rename some keys - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True) - state_dict = state_dict["model"] if "ade20k" not in checkpoint_url else state_dict["state_dict"] - - rename_keys = create_rename_keys(config, has_lm_head=has_lm_head, is_semantic=is_semantic) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, has_lm_head=has_lm_head, is_semantic=is_semantic) - if is_semantic: - # add prefix to decoder keys - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if key.startswith("backbone.fpn"): - key = key.replace("backbone.fpn", "fpn") - state_dict[key] = val - - # load HuggingFace model - if checkpoint_url[-9:-4] == "pt22k": - model = BeitForMaskedImageModeling(config) - elif "ade20k" in checkpoint_url: - model = BeitForSemanticSegmentation(config) - else: - model = BeitForImageClassification(config) - model.eval() - model.load_state_dict(state_dict) - - # Check outputs 
on an image - if is_semantic: - image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False) - ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True) - image = Image.open(ds[0]["file"]) - else: - image_processor = BeitImageProcessor( - size=config.image_size, resample=PILImageResampling.BILINEAR, do_center_crop=False - ) - image = prepare_img() - - encoding = image_processor(images=image, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - outputs = model(pixel_values) - logits = outputs.logits - - # verify logits - expected_shape = torch.Size([1, 1000]) - if checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k"): - expected_shape = torch.Size([1, 196, 8192]) - elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k"): - expected_shape = torch.Size([1, 196, 8192]) - elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft22k"): - expected_shape = torch.Size([1, 21841]) - expected_logits = torch.tensor([2.2288, 2.4671, 0.7395]) - expected_class_idx = 2397 - elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft22k"): - expected_shape = torch.Size([1, 21841]) - expected_logits = torch.tensor([1.6881, -0.2787, 0.5901]) - expected_class_idx = 2396 - elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft1k"): - expected_logits = torch.tensor([0.1241, 0.0798, -0.6569]) - expected_class_idx = 285 - elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft22kto1k"): - expected_logits = torch.tensor([-1.2385, -1.0987, -1.0108]) - expected_class_idx = 281 - elif checkpoint_url[:-4].endswith("beit_base_patch16_384_pt22k_ft22kto1k"): - expected_logits = torch.tensor([-1.5303, -0.9484, -0.3147]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft1k"): - expected_logits = torch.tensor([0.4610, -0.0928, 0.2086]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft22kto1k"): - expected_logits = torch.tensor([-0.4804, 0.6257, -0.1837]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_large_patch16_384_pt22k_ft22kto1k"): - expected_logits = torch.tensor([[-0.5122, 0.5117, -0.2113]]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_large_patch16_512_pt22k_ft22kto1k"): - expected_logits = torch.tensor([-0.3062, 0.7261, 0.4852]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_base_patch16_640_pt22k_ft22ktoade20k"): - expected_shape = (1, 150, 160, 160) - expected_logits = torch.tensor( - [ - [[-4.9225, -2.3954, -3.0522], [-2.8822, -1.0046, -1.7561], [-2.9549, -1.3228, -2.1347]], - [[-5.8168, -3.4129, -4.0778], [-3.8651, -2.2214, -3.0277], [-3.8356, -2.4643, -3.3535]], - [[-0.0078, 3.9952, 4.0754], [2.9856, 4.6944, 5.0035], [3.2413, 4.7813, 4.9969]], - ] - ) - elif checkpoint_url[:-4].endswith("beit_large_patch16_640_pt22k_ft22ktoade20k"): - expected_shape = (1, 150, 160, 160) - expected_logits = torch.tensor( - [ - [[-4.3305, -2.3049, -3.0161], [-2.9591, -1.5305, -2.2251], [-3.4198, -1.8004, -2.9062]], - [[-5.8922, -3.7435, -4.3978], [-4.2063, -2.7872, -3.4755], [-4.2791, -3.1874, -4.1681]], - [[0.9895, 4.3467, 4.7663], [4.2476, 5.6830, 6.1518], [4.5550, 6.2495, 6.5154]], - ] - ) - else: - raise ValueError("Can't verify logits as model is not supported") - - if logits.shape != expected_shape: - raise ValueError(f"Shape of logits not as expected. 
{logits.shape=}, {expected_shape=}") - if not has_lm_head: - if is_semantic: - if not torch.allclose(logits[0, :3, :3, :3], expected_logits, atol=1e-3): - raise ValueError("First elements of logits not as expected") - else: - print("Predicted class idx:", logits.argmax(-1).item()) - - if not torch.allclose(logits[0, :3], expected_logits, atol=1e-3): - raise ValueError("First elements of logits not as expected") - if logits.argmax(-1).item() != expected_class_idx: - raise ValueError("Predicted class index not as expected") - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_url", - default="https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth", - type=str, - help="URL to the original PyTorch checkpoint (.pth file).", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - args = parser.parse_args() - convert_beit_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py deleted file mode 100644 index 9dfd8da474e3..000000000000 --- a/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script can be used to convert a head-less TF2.x Bert model to PyTorch, as published on the official (now -deprecated) GitHub: https://github.com/tensorflow/models/tree/v2.3.0/official/nlp/bert - -TF2.x uses different variable names from the original BERT (TF 1.4) implementation. The script re-maps the TF2.x Bert -weight names to the original names, so the model can be imported with Huggingface/transformer. - -You may adapt this script to include classification/MLM/NSP/etc. heads. - -Note: This script is only working with an older version of the TensorFlow models repository (<= v2.3.0). - Models trained with never versions are not compatible with this script. 
-""" - -import argparse -import os -import re - -import tensorflow as tf -import torch - -from transformers import BertConfig, BertModel -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def load_tf2_weights_in_bert(model, tf_checkpoint_path, config): - tf_path = os.path.abspath(tf_checkpoint_path) - logger.info(f"Converting TensorFlow checkpoint from {tf_path}") - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - arrays = [] - layer_depth = [] - for full_name, shape in init_vars: - # logger.info(f"Loading TF weight {name} with shape {shape}") - name = full_name.split("/") - if full_name == "_CHECKPOINTABLE_OBJECT_GRAPH" or name[0] in ["global_step", "save_counter"]: - logger.info(f"Skipping non-model layer {full_name}") - continue - if "optimizer" in full_name: - logger.info(f"Skipping optimization layer {full_name}") - continue - if name[0] == "model": - # ignore initial 'model' - name = name[1:] - # figure out how many levels deep the name is - depth = 0 - for _name in name: - if _name.startswith("layer_with_weights"): - depth += 1 - else: - break - layer_depth.append(depth) - # read data - array = tf.train.load_variable(tf_path, full_name) - names.append("/".join(name)) - arrays.append(array) - logger.info(f"Read a total of {len(arrays):,} layers") - - # Sanity check - if len(set(layer_depth)) != 1: - raise ValueError(f"Found layer names with different depths (layer depth {list(set(layer_depth))})") - layer_depth = list(set(layer_depth))[0] - if layer_depth != 1: - raise ValueError( - "The model contains more than just the embedding/encoder layers. This script does not handle MLM/NSP" - " heads." - ) - - # convert layers - logger.info("Converting weights...") - for full_name, array in zip(names, arrays): - name = full_name.split("/") - pointer = model - trace = [] - for i, m_name in enumerate(name): - if m_name == ".ATTRIBUTES": - # variable names end with .ATTRIBUTES/VARIABLE_VALUE - break - if m_name.startswith("layer_with_weights"): - layer_num = int(m_name.split("-")[-1]) - if layer_num <= 2: - # embedding layers - # layer_num 0: word_embeddings - # layer_num 1: position_embeddings - # layer_num 2: token_type_embeddings - continue - elif layer_num == 3: - # embedding LayerNorm - trace.extend(["embeddings", "LayerNorm"]) - pointer = getattr(pointer, "embeddings") - pointer = getattr(pointer, "LayerNorm") - elif layer_num > 3 and layer_num < config.num_hidden_layers + 4: - # encoder layers - trace.extend(["encoder", "layer", str(layer_num - 4)]) - pointer = getattr(pointer, "encoder") - pointer = getattr(pointer, "layer") - pointer = pointer[layer_num - 4] - elif layer_num == config.num_hidden_layers + 4: - # pooler layer - trace.extend(["pooler", "dense"]) - pointer = getattr(pointer, "pooler") - pointer = getattr(pointer, "dense") - elif m_name == "embeddings": - trace.append("embeddings") - pointer = getattr(pointer, "embeddings") - if layer_num == 0: - trace.append("word_embeddings") - pointer = getattr(pointer, "word_embeddings") - elif layer_num == 1: - trace.append("position_embeddings") - pointer = getattr(pointer, "position_embeddings") - elif layer_num == 2: - trace.append("token_type_embeddings") - pointer = getattr(pointer, "token_type_embeddings") - else: - raise ValueError(f"Unknown embedding layer with name {full_name}") - trace.append("weight") - pointer = getattr(pointer, "weight") - elif m_name == "_attention_layer": - # self-attention layer - 
trace.extend(["attention", "self"]) - pointer = getattr(pointer, "attention") - pointer = getattr(pointer, "self") - elif m_name == "_attention_layer_norm": - # output attention norm - trace.extend(["attention", "output", "LayerNorm"]) - pointer = getattr(pointer, "attention") - pointer = getattr(pointer, "output") - pointer = getattr(pointer, "LayerNorm") - elif m_name == "_attention_output_dense": - # output attention dense - trace.extend(["attention", "output", "dense"]) - pointer = getattr(pointer, "attention") - pointer = getattr(pointer, "output") - pointer = getattr(pointer, "dense") - elif m_name == "_output_dense": - # output dense - trace.extend(["output", "dense"]) - pointer = getattr(pointer, "output") - pointer = getattr(pointer, "dense") - elif m_name == "_output_layer_norm": - # output dense - trace.extend(["output", "LayerNorm"]) - pointer = getattr(pointer, "output") - pointer = getattr(pointer, "LayerNorm") - elif m_name == "_key_dense": - # attention key - trace.append("key") - pointer = getattr(pointer, "key") - elif m_name == "_query_dense": - # attention query - trace.append("query") - pointer = getattr(pointer, "query") - elif m_name == "_value_dense": - # attention value - trace.append("value") - pointer = getattr(pointer, "value") - elif m_name == "_intermediate_dense": - # attention intermediate dense - trace.extend(["intermediate", "dense"]) - pointer = getattr(pointer, "intermediate") - pointer = getattr(pointer, "dense") - elif m_name == "_output_layer_norm": - # output layer norm - trace.append("output") - pointer = getattr(pointer, "output") - # weights & biases - elif m_name in ["bias", "beta"]: - trace.append("bias") - pointer = getattr(pointer, "bias") - elif m_name in ["kernel", "gamma"]: - trace.append("weight") - pointer = getattr(pointer, "weight") - else: - logger.warning(f"Ignored {m_name}") - # for certain layers reshape is necessary - trace = ".".join(trace) - if re.match(r"(\S+)\.attention\.self\.(key|value|query)\.(bias|weight)", trace) or re.match( - r"(\S+)\.attention\.output\.dense\.weight", trace - ): - array = array.reshape(pointer.data.shape) - if "kernel" in full_name: - array = array.transpose() - if pointer.shape == array.shape: - pointer.data = torch.from_numpy(array) - else: - raise ValueError( - f"Shape mismatch in layer {full_name}: Model expects shape {pointer.shape} but layer contains shape:" - f" {array.shape}" - ) - logger.info(f"Successfully set variable {full_name} to PyTorch layer {trace}") - return model - - -def convert_tf2_checkpoint_to_pytorch(tf_checkpoint_path, config_path, pytorch_dump_path): - # Instantiate model - logger.info(f"Loading model based on config from {config_path}...") - config = BertConfig.from_json_file(config_path) - model = BertModel(config) - - # Load weights from checkpoint - logger.info(f"Loading weights from checkpoint {tf_checkpoint_path}...") - load_tf2_weights_in_bert(model, tf_checkpoint_path, config) - - # Save pytorch-model - logger.info(f"Saving PyTorch model to {pytorch_dump_path}...") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tf_checkpoint_path", type=str, required=True, help="Path to the TensorFlow 2.x checkpoint path." - ) - parser.add_argument( - "--bert_config_file", - type=str, - required=True, - help="The config json file corresponding to the BERT model. 
This specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", - type=str, - required=True, - help="Path to the output PyTorch model (must include filename).", - ) - args = parser.parse_args() - convert_tf2_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index be904ddd7e6c..000000000000 --- a/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BERT checkpoint.""" - -import argparse - -import torch - -from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): - # Initialise PyTorch model - config = BertConfig.from_json_file(bert_config_file) - print(f"Building PyTorch model from configuration: {config}") - model = BertForPreTraining(config) - - # Load weights from tf checkpoint - load_tf_weights_in_bert(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--bert_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained BERT model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py b/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py deleted file mode 100644 index f7cb149053a3..000000000000 --- a/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py +++ /dev/null @@ -1,112 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" - -import argparse -import os - -import numpy as np -import tensorflow as tf -import torch - -from transformers import BertModel - - -def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): - """ - Args: - model: BertModel Pytorch model instance to be converted - ckpt_dir: Tensorflow model directory - model_name: model name - - Currently supported HF models: - - - Y BertModel - - N BertForMaskedLM - - N BertForPreTraining - - N BertForMultipleChoice - - N BertForNextSentencePrediction - - N BertForSequenceClassification - - N BertForQuestionAnswering - """ - - tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") - - var_map = ( - ("layer.", "layer_"), - ("word_embeddings.weight", "word_embeddings"), - ("position_embeddings.weight", "position_embeddings"), - ("token_type_embeddings.weight", "token_type_embeddings"), - (".", "/"), - ("LayerNorm/weight", "LayerNorm/gamma"), - ("LayerNorm/bias", "LayerNorm/beta"), - ("weight", "kernel"), - ) - - if not os.path.isdir(ckpt_dir): - os.makedirs(ckpt_dir) - - state_dict = model.state_dict() - - def to_tf_var_name(name: str): - for patt, repl in iter(var_map): - name = name.replace(patt, repl) - return f"bert/{name}" - - def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): - tf_dtype = tf.dtypes.as_dtype(tensor.dtype) - tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) - session.run(tf.variables_initializer([tf_var])) - session.run(tf_var) - return tf_var - - tf.reset_default_graph() - with tf.Session() as session: - for var_name in state_dict: - tf_name = to_tf_var_name(var_name) - torch_tensor = state_dict[var_name].numpy() - if any(x in var_name for x in tensors_to_transpose): - torch_tensor = torch_tensor.T - tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) - tf_var.assign(tf.cast(torch_tensor, tf_var.dtype)) - tf_weight = session.run(tf_var) - print(f"Successfully created {tf_name}: {np.allclose(tf_weight, torch_tensor)}") - - saver = tf.train.Saver(tf.trainable_variables()) - saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) - - -def main(raw_args=None): - parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str, required=True, help="model name e.g. 
google-bert/bert-base-uncased") - parser.add_argument( - "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model" - ) - parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/.bin") - parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model") - args = parser.parse_args(raw_args) - - model = BertModel.from_pretrained( - pretrained_model_name_or_path=args.model_name, - state_dict=torch.load(args.pytorch_model_path), - cache_dir=args.cache_dir, - ) - - convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py deleted file mode 100644 index cba1e1a2c3f7..000000000000 --- a/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script converts a lm-head checkpoint from the "Token Dropping" implementation into a PyTorch-compatible BERT -model. 
The official implementation of "Token Dropping" can be found in the TensorFlow Models repository: - -https://github.com/tensorflow/models/tree/master/official/projects/token_dropping -""" - -import argparse - -import tensorflow as tf -import torch - -from transformers import BertConfig, BertForMaskedLM -from transformers.models.bert.modeling_bert import ( - BertIntermediate, - BertLayer, - BertOutput, - BertPooler, - BertSelfAttention, - BertSelfOutput, -) -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_checkpoint_to_pytorch(tf_checkpoint_path: str, config_path: str, pytorch_dump_path: str): - def get_masked_lm_array(name: str): - full_name = f"masked_lm/{name}/.ATTRIBUTES/VARIABLE_VALUE" - array = tf.train.load_variable(tf_checkpoint_path, full_name) - - if "kernel" in name: - array = array.transpose() - - return torch.from_numpy(array) - - def get_encoder_array(name: str): - full_name = f"encoder/{name}/.ATTRIBUTES/VARIABLE_VALUE" - array = tf.train.load_variable(tf_checkpoint_path, full_name) - - if "kernel" in name: - array = array.transpose() - - return torch.from_numpy(array) - - def get_encoder_layer_array(layer_index: int, name: str): - full_name = f"encoder/_transformer_layers/{layer_index}/{name}/.ATTRIBUTES/VARIABLE_VALUE" - array = tf.train.load_variable(tf_checkpoint_path, full_name) - - if "kernel" in name: - array = array.transpose() - - return torch.from_numpy(array) - - def get_encoder_attention_layer_array(layer_index: int, name: str, orginal_shape): - full_name = f"encoder/_transformer_layers/{layer_index}/_attention_layer/{name}/.ATTRIBUTES/VARIABLE_VALUE" - array = tf.train.load_variable(tf_checkpoint_path, full_name) - array = array.reshape(orginal_shape) - - if "kernel" in name: - array = array.transpose() - - return torch.from_numpy(array) - - print(f"Loading model based on config from {config_path}...") - config = BertConfig.from_json_file(config_path) - model = BertForMaskedLM(config) - - # Layers - for layer_index in range(0, config.num_hidden_layers): - layer: BertLayer = model.bert.encoder.layer[layer_index] - - # Self-attention - self_attn: BertSelfAttention = layer.attention.self - - self_attn.query.weight.data = get_encoder_attention_layer_array( - layer_index, "_query_dense/kernel", self_attn.query.weight.data.shape - ) - self_attn.query.bias.data = get_encoder_attention_layer_array( - layer_index, "_query_dense/bias", self_attn.query.bias.data.shape - ) - self_attn.key.weight.data = get_encoder_attention_layer_array( - layer_index, "_key_dense/kernel", self_attn.key.weight.data.shape - ) - self_attn.key.bias.data = get_encoder_attention_layer_array( - layer_index, "_key_dense/bias", self_attn.key.bias.data.shape - ) - self_attn.value.weight.data = get_encoder_attention_layer_array( - layer_index, "_value_dense/kernel", self_attn.value.weight.data.shape - ) - self_attn.value.bias.data = get_encoder_attention_layer_array( - layer_index, "_value_dense/bias", self_attn.value.bias.data.shape - ) - - # Self-attention Output - self_output: BertSelfOutput = layer.attention.output - - self_output.dense.weight.data = get_encoder_attention_layer_array( - layer_index, "_output_dense/kernel", self_output.dense.weight.data.shape - ) - self_output.dense.bias.data = get_encoder_attention_layer_array( - layer_index, "_output_dense/bias", self_output.dense.bias.data.shape - ) - - self_output.LayerNorm.weight.data = get_encoder_layer_array(layer_index, "_attention_layer_norm/gamma") - self_output.LayerNorm.bias.data = 
get_encoder_layer_array(layer_index, "_attention_layer_norm/beta") - - # Intermediate - intermediate: BertIntermediate = layer.intermediate - - intermediate.dense.weight.data = get_encoder_layer_array(layer_index, "_intermediate_dense/kernel") - intermediate.dense.bias.data = get_encoder_layer_array(layer_index, "_intermediate_dense/bias") - - # Output - bert_output: BertOutput = layer.output - - bert_output.dense.weight.data = get_encoder_layer_array(layer_index, "_output_dense/kernel") - bert_output.dense.bias.data = get_encoder_layer_array(layer_index, "_output_dense/bias") - - bert_output.LayerNorm.weight.data = get_encoder_layer_array(layer_index, "_output_layer_norm/gamma") - bert_output.LayerNorm.bias.data = get_encoder_layer_array(layer_index, "_output_layer_norm/beta") - - # Embeddings - model.bert.embeddings.position_embeddings.weight.data = get_encoder_array("_position_embedding_layer/embeddings") - model.bert.embeddings.token_type_embeddings.weight.data = get_encoder_array("_type_embedding_layer/embeddings") - model.bert.embeddings.LayerNorm.weight.data = get_encoder_array("_embedding_norm_layer/gamma") - model.bert.embeddings.LayerNorm.bias.data = get_encoder_array("_embedding_norm_layer/beta") - - # LM Head - lm_head = model.cls.predictions.transform - - lm_head.dense.weight.data = get_masked_lm_array("dense/kernel") - lm_head.dense.bias.data = get_masked_lm_array("dense/bias") - - lm_head.LayerNorm.weight.data = get_masked_lm_array("layer_norm/gamma") - lm_head.LayerNorm.bias.data = get_masked_lm_array("layer_norm/beta") - - model.bert.embeddings.word_embeddings.weight.data = get_masked_lm_array("embedding_table") - - # Pooling - model.bert.pooler = BertPooler(config=config) - model.bert.pooler.dense.weight.data: BertPooler = get_encoder_array("_pooler_layer/kernel") - model.bert.pooler.dense.bias.data: BertPooler = get_encoder_array("_pooler_layer/bias") - - # Export final model - model.save_pretrained(pytorch_dump_path) - - # Integration test - should load without any errors ;) - new_model = BertForMaskedLM.from_pretrained(pytorch_dump_path) - print(new_model.eval()) - - print("Model conversion was done sucessfully!") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tf_checkpoint_path", type=str, required=True, help="Path to the TensorFlow Token Dropping checkpoint path." - ) - parser.add_argument( - "--bert_config_file", - type=str, - required=True, - help="The config json file corresponding to the BERT model. This specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", - type=str, - required=True, - help="Path to the output PyTorch model.", - ) - args = parser.parse_args() - convert_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index 0b8e6590f937..000000000000 --- a/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,69 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BigBird checkpoint.""" - -import argparse - -from transformers import BigBirdConfig, BigBirdForPreTraining, BigBirdForQuestionAnswering, load_tf_weights_in_big_bird -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, big_bird_config_file, pytorch_dump_path, is_trivia_qa): - # Initialise PyTorch model - config = BigBirdConfig.from_json_file(big_bird_config_file) - print(f"Building PyTorch model from configuration: {config}") - - if is_trivia_qa: - model = BigBirdForQuestionAnswering(config) - else: - model = BigBirdForPreTraining(config) - - # Load weights from tf checkpoint - load_tf_weights_in_big_bird(model, tf_checkpoint_path, is_trivia_qa=is_trivia_qa) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - model.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--big_bird_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained BERT model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--is_trivia_qa", action="store_true", help="Whether to convert a model with a trivia_qa head." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch( - args.tf_checkpoint_path, args.big_bird_config_file, args.pytorch_dump_path, args.is_trivia_qa - ) diff --git a/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py b/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py deleted file mode 100644 index e17369e48041..000000000000 --- a/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py +++ /dev/null @@ -1,170 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
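# Illustrative sketch (added annotation, not part of the original script): the conversion
# below renames TF variables by applying ordered substring replacements (INIT_COMMON,
# DECODER_PATTERNS, REMAINING_PATTERNS, END_COMMON). A minimal, self-contained example of
# that idea; the input key is made up for illustration, not taken from a real checkpoint.
patterns = [("/", "."), ("layer_", "layers."), ("kernel", "weight"), ("pegasus", "model")]

key = "pegasus/decoder/layer_0/self_attention/query/kernel"
for tf_name, hf_name in patterns:
    key = key.replace(tf_name, hf_name)
print(key)  # model.decoder.layers.0.self_attention.query.weight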
- -import argparse -from typing import Dict - -import tensorflow as tf -import torch -from tqdm import tqdm - -from transformers import BigBirdPegasusConfig, BigBirdPegasusForConditionalGeneration - - -INIT_COMMON = [ - # tf -> hf - ("/", "."), - ("layer_", "layers."), - ("kernel", "weight"), - ("beta", "bias"), - ("gamma", "weight"), - ("pegasus", "model"), -] -END_COMMON = [ - (".output.dense", ".fc2"), - ("intermediate.LayerNorm", "final_layer_norm"), - ("intermediate.dense", "fc1"), -] - -DECODER_PATTERNS = ( - INIT_COMMON - + [ - ("attention.self.LayerNorm", "self_attn_layer_norm"), - ("attention.output.dense", "self_attn.out_proj"), - ("attention.self", "self_attn"), - ("attention.encdec.LayerNorm", "encoder_attn_layer_norm"), - ("attention.encdec_output.dense", "encoder_attn.out_proj"), - ("attention.encdec", "encoder_attn"), - ("key", "k_proj"), - ("value", "v_proj"), - ("query", "q_proj"), - ("decoder.LayerNorm", "decoder.layernorm_embedding"), - ] - + END_COMMON -) - -REMAINING_PATTERNS = ( - INIT_COMMON - + [ - ("embeddings.word_embeddings", "shared.weight"), - ("embeddings.position_embeddings", "embed_positions.weight"), - ("attention.self.LayerNorm", "self_attn_layer_norm"), - ("attention.output.dense", "self_attn.output"), - ("attention.self", "self_attn.self"), - ("encoder.LayerNorm", "encoder.layernorm_embedding"), - ] - + END_COMMON -) - -KEYS_TO_IGNORE = [ - "encdec/key/bias", - "encdec/query/bias", - "encdec/value/bias", - "self/key/bias", - "self/query/bias", - "self/value/bias", - "encdec_output/dense/bias", - "attention/output/dense/bias", -] - - -def rename_state_dict_key(k, patterns): - for tf_name, hf_name in patterns: - k = k.replace(tf_name, hf_name) - return k - - -def convert_bigbird_pegasus(tf_weights: dict, config_update: dict) -> BigBirdPegasusForConditionalGeneration: - cfg = BigBirdPegasusConfig(**config_update) - torch_model = BigBirdPegasusForConditionalGeneration(cfg) - state_dict = torch_model.state_dict() - mapping = {} - - # separating decoder weights - decoder_weights = {k: tf_weights[k] for k in tf_weights if k.startswith("pegasus/decoder")} - remaining_weights = {k: tf_weights[k] for k in tf_weights if not k.startswith("pegasus/decoder")} - - for k, v in tqdm(decoder_weights.items(), "tf -> hf conversion"): - conditions = [k.endswith(ending) for ending in KEYS_TO_IGNORE] - if any(conditions): - continue - patterns = DECODER_PATTERNS - new_k = rename_state_dict_key(k, patterns) - if new_k not in state_dict: - raise ValueError(f"could not find new key {new_k} in state dict. (converted from {k})") - if any(True if i in k else False for i in ["dense", "query", "key", "value"]): - v = v.T - mapping[new_k] = torch.from_numpy(v) - assert v.shape == state_dict[new_k].shape, f"{new_k}, {k}, {v.shape}, {state_dict[new_k].shape}" - - for k, v in tqdm(remaining_weights.items(), "tf -> hf conversion"): - conditions = [k.endswith(ending) for ending in KEYS_TO_IGNORE] - if any(conditions): - continue - patterns = REMAINING_PATTERNS - new_k = rename_state_dict_key(k, patterns) - if new_k not in state_dict and k != "pegasus/embeddings/position_embeddings": - raise ValueError(f"could not find new key {new_k} in state dict. 
(converted from {k})") - if any(True if i in k else False for i in ["dense", "query", "key", "value"]): - v = v.T - mapping[new_k] = torch.from_numpy(v) - if k != "pegasus/embeddings/position_embeddings": - assert v.shape == state_dict[new_k].shape, f"{new_k}, {k}, {v.shape}, {state_dict[new_k].shape}" - - mapping["model.encoder.embed_positions.weight"] = mapping["model.embed_positions.weight"] - mapping["model.decoder.embed_positions.weight"] = mapping.pop("model.embed_positions.weight") - missing, extra = torch_model.load_state_dict(mapping, strict=False) - unexpected_missing = [ - k - for k in missing - if k - not in [ - "final_logits_bias", - "model.encoder.embed_tokens.weight", - "model.decoder.embed_tokens.weight", - "lm_head.weight", - ] - ] - assert unexpected_missing == [], f"no matches found for the following torch keys {unexpected_missing}" - assert extra == [], f"no matches found for the following tf keys {extra}" - return torch_model - - -def get_tf_weights_as_numpy(path) -> Dict: - init_vars = tf.train.list_variables(path) - tf_weights = {} - ignore_name = ["global_step"] - for name, shape in tqdm(init_vars, desc="converting tf checkpoint to dict"): - skip_key = any(pat in name for pat in ignore_name) - if skip_key: - continue - array = tf.train.load_variable(path, name) - tf_weights[name] = array - return tf_weights - - -def convert_bigbird_pegasus_ckpt_to_pytorch(ckpt_path: str, save_dir: str, config_update: dict): - tf_weights = get_tf_weights_as_numpy(ckpt_path) - torch_model = convert_bigbird_pegasus(tf_weights, config_update) - torch_model.save_pretrained(save_dir) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--tf_ckpt_path", type=str, help="passed to tf.train.list_variables") - parser.add_argument("--save_dir", default=None, type=str, help="Path to the output PyTorch model.") - args = parser.parse_args() - config_update = {} - convert_bigbird_pegasus_ckpt_to_pytorch(args.tf_ckpt_path, args.save_dir, config_update=config_update) diff --git a/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100755 index c930a850462c..000000000000 --- a/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,292 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
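# Illustrative sketch (added annotation, not part of the original script): the conversion
# below rewrites the fairseq BPE vocabulary into an HF-style vocab.json. fairseq marks
# word-internal pieces with a trailing "@@"; rewrite_dict_keys strips that marker and
# appends an end-of-word marker (assumed here to be "</w>") to the remaining pieces.
import re

fairseq_vocab = {"le@@": 5, "tt@@": 6, "er": 7}
hf_vocab = {
    re.sub(r"@@$", "", piece) if piece.endswith("@@") else piece + "</w>": idx
    for piece, idx in fairseq_vocab.items()
}
print(hf_vocab)  # {'le': 5, 'tt': 6, 'er</w>': 7}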
-
-
-import argparse
-import json
-import os
-import re
-import shutil
-
-import torch
-
-from transformers import BioGptConfig, BioGptForCausalLM
-from transformers.models.biogpt.tokenization_biogpt import VOCAB_FILES_NAMES
-from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
-from transformers.utils import WEIGHTS_NAME, logging
-
-
-logging.set_verbosity_warning()
-
-json_indent = 2
-
-
-# modified from https://github.com/facebookresearch/fairseq/blob/dd74992d0d143155998e9ed4076826bcea80fb06/fairseq/data/dictionary.py#L18
-class Dictionary:
-    """A mapping from symbols to consecutive integers"""
-
-    def __init__(
-        self,
-        *,  # begin keyword-only arguments
-        bos="<s>",
-        pad="<pad>",
-        eos="</s>",
-        unk="<unk>",
-        extra_special_symbols=None,
-    ):
-        self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos
-        self.symbols = []
-        self.count = []
-        self.indices = {}
-        self.bos_index = self.add_symbol(bos)
-        self.pad_index = self.add_symbol(pad)
-        self.eos_index = self.add_symbol(eos)
-        self.unk_index = self.add_symbol(unk)
-        if extra_special_symbols:
-            for s in extra_special_symbols:
-                self.add_symbol(s)
-        self.nspecial = len(self.symbols)
-
-    def __eq__(self, other):
-        return self.indices == other.indices
-
-    def __getitem__(self, idx):
-        if idx < len(self.symbols):
-            return self.symbols[idx]
-        return self.unk_word
-
-    def __len__(self):
-        """Returns the number of symbols in the dictionary"""
-        return len(self.symbols)
-
-    def __contains__(self, sym):
-        return sym in self.indices
-
-    @classmethod
-    def load(cls, f):
-        """Loads the dictionary from a text file with the format:
-
-        ```
-        <symbol0> <count0>
-        <symbol1> <count1>
-        ...
-        ```
-        """
-        d = cls()
-        d.add_from_file(f)
-        return d
-
-    def add_symbol(self, word, n=1, overwrite=False):
-        """Adds a word to the dictionary"""
-        if word in self.indices and not overwrite:
-            idx = self.indices[word]
-            self.count[idx] = self.count[idx] + n
-            return idx
-        else:
-            idx = len(self.symbols)
-            self.indices[word] = idx
-            self.symbols.append(word)
-            self.count.append(n)
-            return idx
-
-    def _load_meta(self, lines):
-        return 0
-
-    def add_from_file(self, f):
-        """
-        Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
-        """
-        if isinstance(f, str):
-            try:
-                with open(f, "r", encoding="utf-8") as fd:
-                    self.add_from_file(fd)
-            except FileNotFoundError as fnfe:
-                raise fnfe
-            except UnicodeError:
-                raise Exception("Incorrect encoding detected in {}, please rebuild the dataset".format(f))
-            return
-
-        lines = f.readlines()
-        indices_start_line = self._load_meta(lines)
-
-        for line in lines[indices_start_line:]:
-            try:
-                line, field = line.rstrip().rsplit(" ", 1)
-                if field == "#fairseq:overwrite":
-                    overwrite = True
-                    line, field = line.rsplit(" ", 1)
-                else:
-                    overwrite = False
-                count = int(field)
-                word = line
-                if word in self and not overwrite:
-                    raise RuntimeError(
-                        "Duplicate word found when loading Dictionary: '{}'. "
-                        "Duplicate words can overwrite earlier ones by adding the "
-                        "#fairseq:overwrite flag at the end of the corresponding row "
-                        "in the dictionary file. If using the Camembert model, please "
-                        "download an updated copy of the model file.".format(word)
-                    )
-                self.add_symbol(word, n=count, overwrite=overwrite)
-            except ValueError:
-                raise ValueError("Incorrect dictionary format, expected '<token> <cnt> [flags]'")
-
-
-def rewrite_dict_keys(d):
-    # (1) remove word breaking symbol, (2) add word ending symbol where the word is not broken up,
-    # e.g.: d = {'le@@': 5, 'tt@@': 6, 'er': 7} => {'le': 5, 'tt': 6, 'er</w>': 7}
-    d2 = dict((re.sub(r"@@$", "", k), v) if k.endswith("@@") else (re.sub(r"$", "</w>", k), v) for k, v in d.items())
-    keep_keys = "<s> <pad> </s> <unk>".split()
-    # restore the special tokens
-    for k in keep_keys:
-        del d2[f"{k}</w>"]
-        d2[k] = d[k]  # restore
-    return d2
-
-
-def convert_biogpt_checkpoint_to_pytorch(biogpt_checkpoint_path, pytorch_dump_folder_path):
-    # prep
-    if not os.path.exists(biogpt_checkpoint_path):
-        raise ValueError(f"path {biogpt_checkpoint_path} does not exist!")
-    os.makedirs(pytorch_dump_folder_path, exist_ok=True)
-    print(f"Writing results to {pytorch_dump_folder_path}")
-
-    # handle various types of models
-
-    checkpoint_file = os.path.join(biogpt_checkpoint_path, "checkpoint.pt")
-    if not os.path.isfile(checkpoint_file):
-        raise ValueError(f"path to the file {checkpoint_file} does not exist!")
-    chkpt = torch.load(checkpoint_file, map_location="cpu")
-
-    args = chkpt["cfg"]["model"]
-
-    # dicts
-    dict_file = os.path.join(biogpt_checkpoint_path, "dict.txt")
-    if not os.path.isfile(dict_file):
-        raise ValueError(f"path to the file {dict_file} does not exist!")
-    src_dict = Dictionary.load(dict_file)
-    src_vocab = rewrite_dict_keys(src_dict.indices)
-    src_vocab_size = len(src_vocab)
-    src_vocab_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["vocab_file"])
-    print(f"Generating {src_vocab_file} of {src_vocab_size} records")
-    with open(src_vocab_file, "w", encoding="utf-8") as f:
-        f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))
-
-    # merges_file (bpecodes)
-    bpecodes_file = os.path.join(biogpt_checkpoint_path, "bpecodes")
-    if not os.path.isfile(bpecodes_file):
-        raise ValueError(f"path to the file {bpecodes_file} does not exist!")
-
-    merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"])
-    shutil.copyfile(bpecodes_file, merges_file)
-
-    # model config
-    biogpt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json")
-
-    model_conf = {
-        "activation_dropout": args["activation_dropout"],
-        "architectures": ["BioGptForCausalLM"],
-        "attention_probs_dropout_prob": args["attention_dropout"],
-        "bos_token_id": 0,
-        "eos_token_id": 2,
-        "hidden_act": args["activation_fn"],
-        "hidden_dropout_prob": args["dropout"],
-        "hidden_size": args["decoder_embed_dim"],
-        "initializer_range": 0.02,
-        "intermediate_size": args["decoder_ffn_embed_dim"],
-        "layer_norm_eps": 1e-12,
-        "layerdrop": args["decoder_layerdrop"],
-        "max_position_embeddings": args["max_target_positions"],
-        "model_type": "biogpt",
-        "num_attention_heads": args["decoder_attention_heads"],
-        "num_hidden_layers": args["decoder_layers"],
-        "pad_token_id": 1,
-        "scale_embedding": not args["no_scale_embedding"],
-        "tie_word_embeddings": args["share_decoder_input_output_embed"],
-        "vocab_size": src_vocab_size,
-    }
-
-    # good hparam defaults to start with
-
-    print(f"Generating {biogpt_model_config_file}")
-    with open(biogpt_model_config_file, "w", encoding="utf-8") as f:
-        f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))
-
-    # tokenizer config
-    biogpt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE)
-
-    tokenizer_conf = {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "model_max_length": 1024,
-        "pad_token": "<pad>",
-        "special_tokens_map_file": None,
-        "tokenizer_class": "BioGptTokenizer",
-        "unk_token": "<unk>",
-    }
-
-    print(f"Generating {biogpt_tokenizer_config_file}")
-    with open(biogpt_tokenizer_config_file, "w", encoding="utf-8") as f:
-        f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent))
-
-    # model
-    model_state_dict = chkpt["model"]
-
-    # remove unneeded keys
-    ignore_keys = [
-        "decoder.version",
-    ]
-    for k in ignore_keys:
-        model_state_dict.pop(k, None)
-
-    layer_names = list(model_state_dict.keys())
-    for layer_name in layer_names:
-        if layer_name.endswith("output_projection.weight"):
-            model_state_dict[layer_name.replace("decoder.", "")] = model_state_dict.pop(layer_name)
-        else:
-            model_state_dict[layer_name.replace("decoder", "biogpt")] = model_state_dict.pop(layer_name)
-
-    config = BioGptConfig.from_pretrained(pytorch_dump_folder_path)
-    model_new = BioGptForCausalLM(config)
-
-    # check that it loads ok
-    model_new.load_state_dict(model_state_dict)
-
-    # save
-    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
-    print(f"Generating {pytorch_weights_dump_path}")
-    torch.save(model_state_dict, pytorch_weights_dump_path)
-
-    print("Conversion is done!")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--biogpt_checkpoint_path",
-        default=None,
-        type=str,
-        required=True,
-        help=(
-            "Path to the official PyTorch checkpoint file which is expected to reside in the dump dir with dicts,"
-            " bpecodes, etc."
-        ),
-    )
-    parser.add_argument(
-        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
-    )
-    args = parser.parse_args()
-    convert_biogpt_checkpoint_to_pytorch(args.biogpt_checkpoint_path, args.pytorch_dump_folder_path)
diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py
deleted file mode 100644
index abc24290ab26..000000000000
--- a/src/transformers/models/bit/convert_bit_to_pytorch.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert BiT checkpoints from the timm library.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from timm import create_model -from timm.data import resolve_data_config -from timm.data.transforms_factory import create_transform - -from transformers import BitConfig, BitForImageClassification, BitImageProcessor -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_config(model_name): - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - label2id = {v: k for k, v in id2label.items()} - - conv_layer = "std_conv" if "bit" in model_name else False - - # note that when using BiT as backbone for ViT-hybrid checkpoints, - # one needs to additionally set config.layer_type = "bottleneck", config.stem_type = "same", - # config.conv_layer = "std_conv_same" - config = BitConfig( - conv_layer=conv_layer, - num_labels=1000, - id2label=id2label, - label2id=label2id, - ) - - return config - - -def rename_key(name): - if "stem.conv" in name: - name = name.replace("stem.conv", "bit.embedder.convolution") - if "blocks" in name: - name = name.replace("blocks", "layers") - if "head.fc" in name: - name = name.replace("head.fc", "classifier.1") - if name.startswith("norm"): - name = "bit." + name - if "bit" not in name and "classifier" not in name: - name = "bit.encoder." + name - - return name - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_bit_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our BiT structure. 
- """ - - # define default BiT configuration - config = get_config(model_name) - - # load original model from timm - timm_model = create_model(model_name, pretrained=True) - timm_model.eval() - - # load state_dict of original model - state_dict = timm_model.state_dict() - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val.squeeze() if "head" in key else val - - # load HuggingFace model - model = BitForImageClassification(config) - model.eval() - model.load_state_dict(state_dict) - - # create image processor - transform = create_transform(**resolve_data_config({}, model=timm_model)) - timm_transforms = transform.transforms - - pillow_resamplings = { - "bilinear": PILImageResampling.BILINEAR, - "bicubic": PILImageResampling.BICUBIC, - "nearest": PILImageResampling.NEAREST, - } - - processor = BitImageProcessor( - do_resize=True, - size={"shortest_edge": timm_transforms[0].size}, - resample=pillow_resamplings[timm_transforms[0].interpolation.value], - do_center_crop=True, - crop_size={"height": timm_transforms[1].size[0], "width": timm_transforms[1].size[1]}, - do_normalize=True, - image_mean=timm_transforms[-1].mean.tolist(), - image_std=timm_transforms[-1].std.tolist(), - ) - - image = prepare_img() - timm_pixel_values = transform(image).unsqueeze(0) - pixel_values = processor(image, return_tensors="pt").pixel_values - - # verify pixel values - assert torch.allclose(timm_pixel_values, pixel_values) - - # verify logits - with torch.no_grad(): - outputs = model(pixel_values) - logits = outputs.logits - - print("Logits:", logits[0, :3]) - print("Predicted class:", model.config.id2label[logits.argmax(-1).item()]) - timm_logits = timm_model(pixel_values) - assert timm_logits.shape == outputs.logits.shape - assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model {model_name} and processor to the hub") - model.push_to_hub(f"ybelkada/{model_name}") - processor.push_to_hub(f"ybelkada/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="resnetv2_50x1_bitm", - type=str, - help="Name of the BiT timm model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub.", - ) - - args = parser.parse_args() - convert_bit_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index c5919b94d42f..000000000000 --- a/src/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,114 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Blenderbot checkpoint.""" - -import argparse - -import torch - -from transformers import BlenderbotConfig, BlenderbotForConditionalGeneration -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -PATTERNS = [ - ["attention", "attn"], - ["encoder_attention", "encoder_attn"], - ["q_lin", "q_proj"], - ["k_lin", "k_proj"], - ["v_lin", "v_proj"], - ["out_lin", "out_proj"], - ["norm_embeddings", "layernorm_embedding"], - ["position_embeddings", "embed_positions"], - ["embeddings", "embed_tokens"], - ["ffn.lin", "fc"], -] - - -def rename_state_dict_key(k): - if k == "embeddings.weight": - return "shared.weight" - - for parlai_name, hf_name in PATTERNS: - k = k.replace(parlai_name, hf_name) - - if k.startswith("encoder"): - k = k.replace(".attn", ".self_attn") - k = k.replace("norm1", "self_attn_layer_norm") - k = k.replace("norm2", "final_layer_norm") - elif k.startswith("decoder"): - k = k.replace("norm1", "self_attn_layer_norm") - k = k.replace("norm2", "encoder_attn_layer_norm") - k = k.replace("norm3", "final_layer_norm") - return k - - -def rename_layernorm_keys(sd): - keys = [ - "model.encoder.layernorm_embedding.weight", - "model.encoder.layernorm_embedding.bias", - "model.decoder.layernorm_embedding.weight", - "model.decoder.layernorm_embedding.bias", - ] - for k in keys: - v = sd.pop(k) - new_k = k.replace("layernorm_embedding", "layer_norm") - assert new_k not in sd - sd[new_k] = v - - -IGNORE_KEYS = ["START"] - - -@torch.no_grad() -def convert_parlai_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_json_path): - """ - Copy/paste/tweak model's weights to our BERT structure. - """ - model = torch.load(checkpoint_path, map_location="cpu") - sd = model["model"] - cfg = BlenderbotConfig.from_json_file(config_json_path) - m = BlenderbotForConditionalGeneration(cfg) - valid_keys = m.model.state_dict().keys() - failures = [] - mapping = {} - for k, v in sd.items(): - if k in IGNORE_KEYS: - continue - - new_k = rename_state_dict_key(k) - if new_k not in valid_keys: - failures.append([k, new_k]) - else: - mapping[new_k] = v - if cfg.normalize_before: # Blenderbot-3B checkpoints. 
Rename layernorm_embedding -> layer_norm - rename_layernorm_keys(sd) - m.model.load_state_dict(mapping, strict=True) - m.half() - m.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("--src_path", type=str, help="like blenderbot-model.bin") - parser.add_argument("--save_dir", default="hf_blenderbot", type=str, help="Where to save converted model.") - parser.add_argument( - "--hf_config_json", default="blenderbot-3b-config.json", type=str, help="Path to config to use" - ) - args = parser.parse_args() - convert_parlai_checkpoint(args.src_path, args.save_dir, args.hf_config_json) diff --git a/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py b/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py deleted file mode 100644 index 3de18c294ae8..000000000000 --- a/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py +++ /dev/null @@ -1,191 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import re - -import requests -import torch - -# git clone https://github.com/salesforce/BLIP.git -from models.blip import blip_decoder -from models.blip_itm import blip_itm -from models.blip_vqa import blip_vqa -from PIL import Image -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode - -from transformers import ( - BertTokenizer, - BlipConfig, - BlipForConditionalGeneration, - BlipForImageTextRetrieval, - BlipForQuestionAnswering, -) - - -def load_demo_image(image_size, device): - img_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg" - raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") - - transform = transforms.Compose( - [ - transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC), - transforms.ToTensor(), - transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), - ] - ) - image = transform(raw_image).unsqueeze(0).to(device) - return image - - -def rename_key(key): - if "visual_encoder" in key: - key = re.sub("visual_encoder*", "vision_model.encoder", key) - if "blocks" in key: - key = re.sub(r"blocks", "layers", key) - if "attn" in key: - key = re.sub(r"attn", "self_attn", key) - if "norm1" in key: - key = re.sub(r"norm1", "layer_norm1", key) - if "norm2" in key: - key = re.sub(r"norm2", "layer_norm2", key) - if "encoder.norm" in key: - key = re.sub(r"encoder.norm", "post_layernorm", key) - if "encoder.patch_embed.proj" in key: - key = re.sub(r"encoder.patch_embed.proj", "embeddings.patch_embedding", key) - - if "encoder.pos_embed" in key: - key = re.sub(r"encoder.pos_embed", "embeddings.position_embedding", key) - if "encoder.cls_token" in key: - key = re.sub(r"encoder.cls_token", "embeddings.class_embedding", key) - - if "self_attn" in key: - key = re.sub(r"self_attn.proj", 
"self_attn.projection", key) - - return key - - -@torch.no_grad() -def convert_blip_checkpoint(pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. - """ - if config_path is not None: - config = BlipConfig.from_pretrained(config_path) - else: - config = BlipConfig(projection_dim=512, text_config={}, vision_config={}) - - hf_model = BlipForConditionalGeneration(config).eval() - - model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" - - pt_model = blip_decoder(pretrained=model_url, image_size=384, vit="base") - pt_model = pt_model.eval() - - modified_state_dict = pt_model.state_dict() - for key in modified_state_dict.copy(): - value = modified_state_dict.pop(key) - renamed_key = rename_key(key) - modified_state_dict[renamed_key] = value - - hf_model.load_state_dict(modified_state_dict) - - image_size = 384 - image = load_demo_image(image_size=image_size, device="cpu") - tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") - input_ids = tokenizer(["a picture of"]).input_ids - - out = hf_model.generate(image, input_ids) - - assert out[0].tolist() == [30522, 1037, 3861, 1997, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102] - - out = hf_model.generate(image) - - assert out[0].tolist() == [30522, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102] - - if pytorch_dump_folder_path is not None: - hf_model.save_pretrained(pytorch_dump_folder_path) - - # model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_vqa.pth' - model_url = ( - "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" - ) - - vqa_model = blip_vqa(pretrained=model_url, image_size=image_size, vit="base") - vqa_model.eval() - - modified_state_dict = vqa_model.state_dict() - for key in modified_state_dict.copy(): - value = modified_state_dict.pop(key) - renamed_key = rename_key(key) - modified_state_dict[renamed_key] = value - - hf_vqa_model = BlipForQuestionAnswering(config) - - hf_vqa_model.load_state_dict(modified_state_dict) - - question = ["How many dogs are in this image?"] - question_input_ids = tokenizer(question, return_tensors="pt").input_ids - - answer = hf_vqa_model.generate(question_input_ids, image) - print(tokenizer.decode(answer[0])) - - assert tokenizer.decode(answer[0]) == "[UNK] 1 [SEP]" - if pytorch_dump_folder_path is not None: - hf_vqa_model.save_pretrained(pytorch_dump_folder_path + "_vqa") - - model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" - - itm_model = blip_itm(pretrained=model_url, image_size=image_size, vit="base") - itm_model.eval() - - modified_state_dict = itm_model.state_dict() - for key in modified_state_dict.copy(): - value = modified_state_dict.pop(key) - renamed_key = rename_key(key) - modified_state_dict[renamed_key] = value - - hf_itm_model = BlipForImageTextRetrieval(config) - - question = ["A picture of a woman with a dog sitting in a beach"] - question_input_ids = tokenizer( - question, - return_tensors="pt", - padding="max_length", - truncation=True, - max_length=35, - ).input_ids - - hf_itm_model.load_state_dict(modified_state_dict) - hf_itm_model.eval() - - out_itm = hf_itm_model(question_input_ids, image, use_itm_head=True) - out = hf_itm_model(question_input_ids, image, use_itm_head=False) - - assert out[0].item() == 0.2110687494277954 - assert 
torch.nn.functional.softmax(out_itm[0], dim=1)[:, 1].item() == 0.45698845386505127 - - if pytorch_dump_folder_path is not None: - hf_itm_model.save_pretrained(pytorch_dump_folder_path + "_itm") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_blip_checkpoint(args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py b/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py deleted file mode 100644 index d6640045b80c..000000000000 --- a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py +++ /dev/null @@ -1,390 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert BLIP-2 checkpoints from the original repository. - -URL: https://github.com/salesforce/LAVIS/tree/main/projects/blip2 -""" - -import argparse - -import requests -import torch - -# pip3 install salesforce-lavis -# I'm actually installing a slightly modified version: pip3 install -U git+https://github.com/nielsrogge/LAVIS.git@blip2_float32 -# to make sure we can compare both original and HF implementation in float32 -from lavis.models import load_model_and_preprocess -from PIL import Image - -from transformers import ( - AutoTokenizer, - BertTokenizer, - Blip2Config, - Blip2ForConditionalGeneration, - Blip2ForImageTextRetrieval, - Blip2Processor, - Blip2QFormerConfig, - Blip2VisionConfig, - BlipImageProcessor, - OPTConfig, - T5Config, - set_seed, -) -from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD - - -def load_demo_image(): - url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - - return image - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, model_name): - rename_keys = [] - # fmt: off - - # vision encoder - rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) - rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) - rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) - rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) - rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) - rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) - - for i in range(config.vision_config.num_hidden_layers): - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", 
f"vision_model.encoder.layers.{i}.layer_norm1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) - - # QFormer - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.layernorm.weight")) - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.layernorm.bias")) - if "itm" in model_name: - rename_keys.append(("Qformer.bert.embeddings.word_embeddings.weight", "embeddings.word_embeddings.weight")) - rename_keys.append(("Qformer.bert.embeddings.position_embeddings.weight", "embeddings.position_embeddings.weight")) - rename_keys.append(("vision_proj.weight", "vision_projection.weight")) - rename_keys.append(("vision_proj.bias", "vision_projection.bias")) - rename_keys.append(("text_proj.weight", "text_projection.weight")) - rename_keys.append(("text_proj.bias", "text_projection.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def read_in_q_v_bias(state_dict, config): - for i in range(config.vision_config.num_hidden_layers): - # read in original q and v biases - q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") - - # next, set bias in the state dict - qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) - state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias - - -def get_blip2_config(model_name, eos_token_id): - image_size = 364 if "coco" in model_name else 224 - vision_config = Blip2VisionConfig(image_size=image_size).to_dict() - - # make sure the models have proper bos_token_id and eos_token_id set (important for generation) - # seems like flan-T5 models don't have bos_token_id properly set? 
- if "opt-2.7b" in model_name: - text_config = OPTConfig.from_pretrained("facebook/opt-2.7b", eos_token_id=eos_token_id).to_dict() - elif "opt-6.7b" in model_name: - text_config = OPTConfig.from_pretrained("facebook/opt-6.7b", eos_token_id=eos_token_id).to_dict() - elif "t5-xl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "t5-xxl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "itm" in model_name: - text_config = {} - else: - raise ValueError("Model name not supported") - - if "itm" in model_name: - config = Blip2Config( - vision_config=vision_config, - qformer_config=Blip2QFormerConfig(vocab_size=30523, use_qformer_text_input=True).to_dict(), - ) - else: - config = Blip2Config(vision_config=vision_config, text_config=text_config) - - return config, image_size - - -@torch.no_grad() -def convert_blip2_checkpoint( - model_name, pytorch_dump_folder_path=None, push_to_hub=False, lavis_device="cpu", hf_model_device="cpu" -): - """ - Copy/paste/tweak model's weights to Transformers design. - """ - if "opt" in model_name: - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-2.7b") - elif "itm" in model_name: - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", truncation_side="right") - tokenizer.add_special_tokens({"bos_token": "[DEC]"}) - else: - tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl") - - if "itm" in model_name: - eos_token_id = None - else: - eos_token_id = tokenizer("\n", add_special_tokens=False).input_ids[0] - config, image_size = get_blip2_config(model_name, eos_token_id=eos_token_id) - - if "itm" in model_name: - hf_model = Blip2ForImageTextRetrieval(config).eval() - else: - hf_model = Blip2ForConditionalGeneration(config).eval() - - model_name_to_original = { - "blip2-opt-2.7b": ("blip2_opt", "pretrain_opt2.7b"), - "blip2-opt-6.7b": ("blip2_opt", "pretrain_opt6.7b"), - "blip2-opt-2.7b-coco": ("blip2_opt", "caption_coco_opt2.7b"), - "blip2-opt-6.7b-coco": ("blip2_opt", "caption_coco_opt6.7b"), - "blip2-flan-t5-xl": ("blip2_t5", "pretrain_flant5xl"), - "blip2-flan-t5-xl-coco": ("blip2_t5", "caption_coco_flant5xl"), - "blip2-flan-t5-xxl": ("blip2_t5", "pretrain_flant5xxl"), - "blip2-itm-vit-g": ("blip2_image_text_matching", "pretrain"), - "blip2-itm-vit-g-coco": ("blip2_image_text_matching", "coco"), - } - - name, type = model_name_to_original[model_name] - - # load original model - print("Loading original model...") - original_model, vis_processors, _ = load_model_and_preprocess( - name=name, model_type=type, is_eval=True, device=lavis_device - ) - original_model.eval() - print("Done!") - - # update state dict keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config, model_name) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - - # some keys can be renamed efficiently - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if key.startswith("Qformer.bert"): - key = key.replace("Qformer.bert", "qformer") - if "attention.self" in key: - key = key.replace("self", "attention") - if "opt_proj" in key: - key = key.replace("opt_proj", "language_projection") - if "t5_proj" in key: - key = key.replace("t5_proj", "language_projection") - if key.startswith("opt"): - key = key.replace("opt", "language") - if key.startswith("t5"): - key = key.replace("t5", "language") - state_dict[key] = val - - # read in qv biases - 
read_in_q_v_bias(state_dict, config) - - missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False) - assert len(missing_keys) == 0 - - if "itm" in model_name: - unexpected_keys = list(filter(lambda x: not x.startswith("Qformer.cls"), unexpected_keys)) - assert unexpected_keys == ["temp", "qformer.embeddings.position_ids"] - else: - assert unexpected_keys == ["qformer.embeddings.position_ids"] - - image = load_demo_image() - original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) - - # create processor - image_processor = BlipImageProcessor( - size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD - ) - processor = Blip2Processor(image_processor=image_processor, tokenizer=tokenizer) - pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(hf_model_device) - - # make sure processor creates exact same pixel values - assert torch.allclose(pixel_values, original_pixel_values.to(pixel_values.device)) - - original_model.to(lavis_device) - hf_model.to(hf_model_device) - - if "itm" in model_name: - caption = "a large fountain spewing water into the air" - input_ids = tokenizer([caption], return_tensors="pt").input_ids.to(hf_model_device) - attention_mask = processor(text=caption, return_tensors="pt").attention_mask.to(hf_model_device) - - with torch.no_grad(): - original_logits = original_model( - {"image": original_pixel_values, "text_input": [caption]}, match_head="itm" - ) - logits = hf_model( - pixel_values=pixel_values, - input_ids=input_ids, - attention_mask=attention_mask, - use_image_text_matching_head=True, - ) - - assert original_logits.shape == logits.logits_per_image.shape - print("First values of original logits:", original_logits[0, :3]) - print("First values of HF logits:", logits.logits_per_image[0, :3]) - - # assert values - # cast to same type - target_dtype = logits.logits_per_image.dtype - assert torch.allclose(original_logits.to(target_dtype), logits.logits_per_image, atol=1e-4) - - original_itm_scores = torch.nn.functional.softmax(original_logits, dim=1) - itm_scores = torch.nn.functional.softmax(logits.logits_per_image, dim=1) - assert torch.allclose(original_itm_scores.to(target_dtype), itm_scores, atol=1e-4) - print("Looks ok!") - - with torch.no_grad(): - original_logits = original_model( - {"image": original_pixel_values, "text_input": [caption]}, match_head="itc" - ) - logits = hf_model( - pixel_values=pixel_values, - input_ids=input_ids, - attention_mask=attention_mask, - use_image_text_matching_head=False, - ) - - assert original_logits.shape == logits.logits_per_image.shape - print("First values of original logits:", original_logits[0, :3]) - print("First values of HF logits:", logits.logits_per_image[0, :3]) - - # assert values - # cast to same type - target_dtype = logits.logits_per_image.dtype - assert torch.allclose(original_logits.to(target_dtype), logits.logits_per_image, atol=1e-4) - print("Looks ok!") - - else: - input_ids = tokenizer(["\n"], return_tensors="pt").input_ids.to(hf_model_device) - - with torch.no_grad(): - if "opt" in model_name: - original_logits = original_model({"image": original_pixel_values, "text_input": [""]}).logits - logits = hf_model(pixel_values, input_ids).logits - else: - original_logits = original_model( - {"image": original_pixel_values, "text_input": ["\n"], "text_output": ["\n"]} - ).logits - labels = input_ids.masked_fill(input_ids == tokenizer.pad_token_id, -100) - logits = hf_model(pixel_values, 
input_ids, labels=labels).logits - - assert original_logits.shape == logits.shape - print("First values of original logits:", original_logits[0, :3, :3]) - print("First values of HF logits:", logits[0, :3, :3]) - - # assert values - assert torch.allclose(original_logits.to(logits.device), logits, atol=1e-4) - print("Looks ok!") - - print("Generating a caption...") - prompt = "Question: what object is in this image? Answer:" - input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(hf_model_device) - - set_seed(42) - - original_outputs = original_model.generate( - {"image": original_pixel_values, "prompt": prompt}, use_nucleus_sampling=True, max_length=50 - ) - outputs = hf_model.generate( - pixel_values, - input_ids, - do_sample=True, - num_beams=5, - max_length=30, - min_length=1, - top_p=0.9, - repetition_penalty=1.0, - length_penalty=1.0, - temperature=1, - ) - output_text = processor.batch_decode(outputs, skip_special_tokens=True) - output_text = [text.strip() for text in output_text] - print("Original generation:", original_outputs) - print("HF generation:", output_text) - - if pytorch_dump_folder_path is not None: - processor.save_pretrained(pytorch_dump_folder_path) - hf_model.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - processor.push_to_hub(f"nielsr/{model_name}") - hf_model.push_to_hub(f"nielsr/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - choices = [ - "blip2-opt-2.7b", - "blip2-opt-6.7b", - "blip2-opt-2.7b-coco", - "blip2-opt-6.7b-coco", - "blip2-flan-t5-xl", - "blip2-flan-t5-xl-coco", - "blip2-flan-t5-xxl", - "blip2-itm-vit-g", - "blip2-itm-vit-g-coco", - ] - parser.add_argument( - "--model_name", - default="blip2-opt-2.7b", - choices=choices, - type=str, - help="Path to hf config.json of model to convert", - ) - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model and processor to the hub after converting", - ) - # note: this script is tested on 2 GPUs, as models are compared in float32, - # which requires quite some memory. Hence loading both on a - # separate device is the easiest to compare - parser.add_argument( - "--lavis_device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda." - ) - parser.add_argument( - "--hf_model_device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda." - ) - - args = parser.parse_args() - - convert_blip2_checkpoint( - args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.lavis_device, args.hf_model_device - ) diff --git a/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py b/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py deleted file mode 100644 index 40ba6240d3e4..000000000000 --- a/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py +++ /dev/null @@ -1,254 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BigScience BLOOM checkpoint.""" - -import argparse -import json -import os -import re - -import torch - -from transformers import BloomConfig, BloomModel -from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME -from transformers.utils import logging - - -logging.set_verbosity_info() - -WEIGHTS_TO_AVERAGE_ENDSWITH = [ - "word_embeddings_layernorm.weight", - "word_embeddings_layernorm.bias", - "input_layernorm.weight", - "input_layernorm.bias", - "post_attention_layernorm.weight", - "post_attention_layernorm.bias", - "self_attention.dense.bias", - "mlp.dense_4h_to_h.bias", - "ln_f.weight", - "ln_f.bias", -] - -WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN = [ - "mlp.dense_4h_to_h.weight", - "self_attention.dense.weight", -] - - -def layer_name_mapping(key, file): - """Convert Megatron-DeepSpeed TP/PP weights mapping in transformers PP only""" - # Handle first and last layers - layer_rename_map = { - "word_embeddings.weight": "word_embeddings.weight", - "word_embeddings.norm.weight": "word_embeddings_layernorm.weight", - "word_embeddings.norm.bias": "word_embeddings_layernorm.bias", - "weight": "ln_f.weight", - "bias": "ln_f.bias", - } - - if key in layer_rename_map: - return layer_rename_map[key] - - # Handle transformer blocks - layer_number = int(re.match(r".*layer_(\d*).*", file)[1]) - layer_number -= 3 - return f"h.{layer_number}." 
+ key - - -def get_dtype_size(dtype): - if dtype == torch.bool: - return 1 / 8 - bit_search = re.search(r"[^\d](\d+)$", str(dtype)) - if bit_search is None: - raise ValueError(f"`dtype` is not a valid dtype: {dtype}.") - bit_size = int(bit_search.groups()[0]) - return bit_size // 8 - - -def convert_bloom_checkpoint_to_pytorch( - bloom_checkpoint_path, bloom_config_file, pytorch_dump_folder_path, shard_model, pretraining_tp -): - # Construct model - if bloom_config_file == "": - config = BloomConfig() - else: - config = BloomConfig.from_json_file(bloom_config_file) - - if shard_model: - file_names = os.listdir(bloom_checkpoint_path) - file_names = sorted(filter(lambda s: s.startswith("layer") and "model_00" in s, file_names)) - - index_dict = {"weight_map": {}, "metadata": {}} - total_size = 0 - - missing_keys = None - - config = BloomConfig() - - for j, file in enumerate(file_names): - print("Processing file: {}".format(file)) - tensors = None - - for i in range(pretraining_tp): - # load all TP files - f_name = file.replace("model_00", f"model_0{i}") - temp = torch.load(os.path.join(bloom_checkpoint_path, f_name), map_location="cpu") - - # Rename keys in the transformers names - keys = list(temp.keys()) - for key in keys: - temp[layer_name_mapping(key, file)] = temp.pop(key) - - if tensors is None: - tensors = temp - else: - for key in tensors.keys(): - if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - # We average (sum and then divide) some weights accross TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425) - tensors[key] += temp[key] - else: - # Some weights are RowParallelLinear in Megatron-Deepspeed, others are ColumnParallel - cat_dim = 1 if any(text in key for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 - # We concatenate these weights accross TP ranks - tensors[key] = torch.cat([tensors[key], temp[key]], dim=cat_dim) - - # Divide by the number of TP the weights we want to average - for key in tensors.keys(): - if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - tensors[key] = tensors[key] / pretraining_tp - torch.save( - tensors, - os.path.join( - pytorch_dump_folder_path, - "pytorch_model_{}-of-{}.bin".format(str(j + 1).zfill(5), str(len(file_names)).zfill(5)), - ), - ) - - for key in tensors.keys(): - value = tensors[key] - total_size += value.numel() * get_dtype_size(value.dtype) - if key not in index_dict["weight_map"]: - index_dict["weight_map"][key] = "pytorch_model_{}-of-{}.bin".format( - str(j + 1).zfill(5), str(len(file_names)).zfill(5) - ) - - config = BloomConfig() - pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME - index_dict["metadata"]["total_size"] = total_size - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - with open(os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME + ".index.json"), "w", encoding="utf-8") as f: - json_config = json.dumps(index_dict, indent=2, sort_keys=True) + "\n" - f.write(json_config) - else: - model = BloomModel(config) - - file_names = os.listdir(bloom_checkpoint_path) - file_names = sorted(filter(lambda s: s.startswith("layer") and "model_00" in s, file_names)) - - missing_keys = None - for i, file in enumerate(file_names): - tensors = None - for i in range(pretraining_tp): - # load all TP files - f_name = file.replace("model_00", f"model_0{i}") - temp = torch.load(os.path.join(bloom_checkpoint_path, f_name), map_location="cpu") - - # Rename 
keys in the transformers names - keys = list(temp.keys()) - for key in keys: - temp[layer_name_mapping(key, file)] = temp.pop(key) - - if tensors is None: - tensors = temp - else: - for key in tensors.keys(): - # We average (sum and then divide) some weights accross TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425) - if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - tensors[key] += temp[key] - else: - # Some weights are RowParallelLinear in Megatron-Deepspeed, others are ColumnParallel - cat_dim = 1 if any(text in key for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 - # We concatenate these weights accross TP ranks - tensors[key] = torch.cat([tensors[key], temp[key]], dim=cat_dim) - - # Divide by the number of TP the weights we want to average - for key in tensors.keys(): - if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - tensors[key] = tensors[key] / pretraining_tp - - other_keys = model.load_state_dict(tensors, strict=False) - assert not other_keys.unexpected_keys, f"The keys {other_keys.unexpected_keys} are unexpected" - if missing_keys is None: - missing_keys = set(other_keys.missing_keys) - else: - missing_keys = missing_keys.intersection(set(other_keys.missing_keys)) - - assert not missing_keys, f"The keys {missing_keys} are missing" - - # Save pytorch-model - os.makedirs(pytorch_dump_folder_path, exist_ok=True) - pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME - print(f"Save PyTorch model to {pytorch_weights_dump_path} with dtype {config.torch_dtype}") - if config.torch_dtype is not None: - model = model.to(config.torch_dtype) - torch.save(model.state_dict(), pytorch_weights_dump_path) - print(f"Save configuration file to {pytorch_config_dump_path}") - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--bloom_checkpoint_path", - default=None, - type=str, - required=True, - help="Path to the Megatron-LM checkpoint path.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--bloom_config_file", - default="", - type=str, - help=( - "An optional config json file corresponding to the pre-trained model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--shard_model", - action="store_true", - help="An optional setting to shard the output model \nThis enables sharding the converted checkpoint", - ) - parser.add_argument( - "--pretraining_tp", - default=4, - type=int, - help="Pretraining TP rank that has been used when training the model in Megatron-LM \n", - ) - args = parser.parse_args() - convert_bloom_checkpoint_to_pytorch( - args.bloom_checkpoint_path, - args.bloom_config_file, - args.pytorch_dump_folder_path, - args.shard_model, - args.pretraining_tp, - ) diff --git a/src/transformers/models/bros/convert_bros_to_pytorch.py b/src/transformers/models/bros/convert_bros_to_pytorch.py deleted file mode 100644 index c0984f2c74b2..000000000000 --- a/src/transformers/models/bros/convert_bros_to_pytorch.py +++ /dev/null @@ -1,145 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Bros checkpoints.""" - -import argparse - -import bros # original repo -import torch - -from transformers import BrosConfig, BrosModel, BrosProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_configs(model_name): - bros_config = BrosConfig.from_pretrained(model_name) - return bros_config - - -def remove_ignore_keys_(state_dict): - ignore_keys = [ - "embeddings.bbox_sinusoid_emb.inv_freq", - ] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(name): - if name == "embeddings.bbox_projection.weight": - name = "bbox_embeddings.bbox_projection.weight" - - if name == "embeddings.bbox_sinusoid_emb.x_pos_emb.inv_freq": - name = "bbox_embeddings.bbox_sinusoid_emb.x_pos_emb.inv_freq" - - if name == "embeddings.bbox_sinusoid_emb.y_pos_emb.inv_freq": - name = "bbox_embeddings.bbox_sinusoid_emb.y_pos_emb.inv_freq" - - return name - - -def convert_state_dict(orig_state_dict, model): - # rename keys - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - orig_state_dict[rename_key(key)] = val - - # remove ignore keys - remove_ignore_keys_(orig_state_dict) - - return orig_state_dict - - -def convert_bros_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - # load original model - original_model = bros.BrosModel.from_pretrained(model_name).eval() - - # load HuggingFace Model - bros_config = get_configs(model_name) - model = BrosModel.from_pretrained(model_name, config=bros_config) - model.eval() - - state_dict = original_model.state_dict() - new_state_dict = convert_state_dict(state_dict, model) - model.load_state_dict(new_state_dict) - - # verify results - - # original BROS model require 4 points (8 float values) for each bbox, prepare bbox with [batch_size, seq_len, 8] shape - bbox = torch.tensor( - [ - [ - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.4396, 0.6720, 0.4659, 0.6720, 0.4659, 0.6850, 0.4396, 0.6850], - [0.4698, 0.6720, 0.4843, 0.6720, 0.4843, 0.6850, 0.4698, 0.6850], - [0.4698, 0.6720, 0.4843, 0.6720, 0.4843, 0.6850, 0.4698, 0.6850], - [0.2047, 0.6870, 0.2730, 0.6870, 0.2730, 0.7000, 0.2047, 0.7000], - [0.2047, 0.6870, 0.2730, 0.6870, 0.2730, 0.7000, 0.2047, 0.7000], - [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000], - ] - ] - ) - - processor = BrosProcessor.from_pretrained(model_name) - - encoding = processor("His name is Rocco.", return_tensors="pt") - encoding["bbox"] = bbox - - original_hidden_states = original_model(**encoding).last_hidden_state - # pixel_values = processor(image, return_tensors="pt").pixel_values - - last_hidden_states = model(**encoding).last_hidden_state - - assert torch.allclose(original_hidden_states, last_hidden_states, atol=1e-4) - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - 
processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model.push_to_hub("jinho8345/" + model_name.split("/")[-1], commit_message="Update model") - processor.push_to_hub("jinho8345/" + model_name.split("/")[-1], commit_message="Update model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - # Required parameters - parser.add_argument( - "--model_name", - default="jinho8345/bros-base-uncased", - required=False, - type=str, - help="Name of the original model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - required=False, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether or not to push the converted model and processor to the đŸ€— hub.", - ) - - args = parser.parse_args() - convert_bros_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index 9b1b15857cea..000000000000 --- a/src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,59 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The T5 authors and HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert T5 checkpoint.""" - -import argparse - -from transformers import T5Config, T5ForConditionalGeneration, load_tf_weights_in_t5 -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): - # Initialise PyTorch model - config = T5Config.from_json_file(config_file) - print(f"Building PyTorch model from configuration: {config}") - model = T5ForConditionalGeneration(config) - - # Load weights from tf checkpoint - load_tf_weights_in_t5(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - model.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained T5 model. \nThis specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 
- ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index 45dcdb290333..000000000000 --- a/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,65 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert CANINE checkpoint.""" - -import argparse - -from transformers import CanineConfig, CanineModel, CanineTokenizer, load_tf_weights_in_canine -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, pytorch_dump_path): - # Initialize PyTorch model - config = CanineConfig() - model = CanineModel(config) - model.eval() - - print(f"Building PyTorch model from configuration: {config}") - - # Load weights from tf checkpoint - load_tf_weights_in_canine(model, config, tf_checkpoint_path) - - # Save pytorch-model (weights and configuration) - print(f"Save PyTorch model to {pytorch_dump_path}") - model.save_pretrained(pytorch_dump_path) - - # Save tokenizer files - tokenizer = CanineTokenizer() - print(f"Save tokenizer files to {pytorch_dump_path}") - tokenizer.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", - default=None, - type=str, - required=True, - help="Path to the TensorFlow checkpoint. Should end with model.ckpt", - ) - parser.add_argument( - "--pytorch_dump_path", - default=None, - type=str, - required=True, - help="Path to a folder where the PyTorch model will be placed.", - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.pytorch_dump_path) diff --git a/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py b/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py deleted file mode 100644 index ff45c9b597e0..000000000000 --- a/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py +++ /dev/null @@ -1,476 +0,0 @@ -# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
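The TF-checkpoint converters removed above (ByT5/T5 and CANINE) all follow the same three-step recipe: build a randomly initialized model from a config, copy the TensorFlow variables into it with the matching `load_tf_weights_in_*` helper, then `save_pretrained` the result. A minimal sketch of that recipe, assuming a local T5 config file and TensorFlow checkpoint exist at the hypothetical paths below:

```python
from transformers import T5Config, T5ForConditionalGeneration, load_tf_weights_in_t5

# Hypothetical local paths; replace with a real TF checkpoint and its config.
config = T5Config.from_json_file("t5_config.json")   # defines the architecture
model = T5ForConditionalGeneration(config)           # randomly initialized PyTorch model
load_tf_weights_in_t5(model, config, "model.ckpt")   # copy the TF variables into the PyTorch modules
model.save_pretrained("t5-converted")                # writes config.json plus the weight files
```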
-import argparse -import gc -import json -import os - -import requests -import torch -import yaml -from accelerate import init_empty_weights -from PIL import Image - -from transformers import ( - ChameleonConfig, - ChameleonForConditionalGeneration, - ChameleonImageProcessor, - ChameleonProcessor, -) - - -try: - from transformers import LlamaTokenizerFast -except ImportError: - raise ValueError( - "Chameleon conversion supports only FastTokenizer and LlamaTokenizerFast can't be imported! " - "Update your `tokenizers` library and re-run the tokenizer conversion." - ) - -""" -Sample usage: - -``` -python src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py \ - --input_dir /path/to/downloaded/chameleon/weights --model_size 7B --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import ChameleonForConditionalGeneration, LlamaTokenizerFast - -model = ChameleonForConditionalGeneration.from_pretrained("/output/path") -tokenizer = LlamaTokenizerFast.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). -""" - -NUM_SHARDS = { - "7B": 1, - "30B": 4, -} - -VOCAB_SIZE = 65536 - - -def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): - return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) - - -def read_json(path): - with open(path, "r") as f: - return json.load(f) - - -def write_json(text, path): - with open(path, "w") as f: - json.dump(text, f) - - -def write_model(model_path, input_base_path, model_size, chameleon_version=1): - os.makedirs(model_path, exist_ok=True) - input_model_path = os.path.join(input_base_path, "models", model_size.lower()) - params_path = os.path.join(input_model_path, "params.json") - consolidate_params_path = os.path.join(input_model_path, "consolidate_params.json") - - params = read_json(params_path) - if os.path.isfile(consolidate_params_path): - params = {**params, **read_json(consolidate_params_path)} - num_shards = NUM_SHARDS[model_size] - model_parallel_size = params["model_parallel_size"] - params = params.get("model", params) - n_layers = params["n_layers"] - n_heads = params["n_heads"] - n_heads_per_shard = n_heads // num_shards - dim = params["dim"] - dims_per_head = dim // n_heads - base = params.get("rope_theta", 10000.0) - swin_norm = params["swin_norm"] - if base > 10000.0: - max_position_embeddings = 16384 - else: - # Depending on the Chameleon version, the default max_position_embeddings has different values. - if chameleon_version == 1: - max_position_embeddings = 4096 - else: - raise NotImplementedError( - f"Version {chameleon_version} of chameleon is not supported yet. " - "Current supported versions of chameleon are [1]." - ) - - if params.get("n_kv_heads", None) is not None: - num_key_value_heads = params["n_kv_heads"] # for GQA / MQA - num_local_key_value_heads = n_heads_per_shard // num_key_value_heads - key_value_dim = dim // num_key_value_heads - else: # compatibility with other checkpoints - num_key_value_heads = n_heads - num_local_key_value_heads = n_heads_per_shard - key_value_dim = dim - - print(f"Fetching all parameters from the checkpoint at {input_model_path}.") - # Load weights - if num_shards == 1: - # Not sharded - # (The sharded implementation would also work, but this is simpler.) 
- loaded = None - for possible_name in ["consolidated.pth", "consolidated.00.pth"]: - possible_path = os.path.join(input_model_path, possible_name) - if os.path.exists(possible_path): - loaded = torch.load(possible_path, map_location="cpu") - break - assert loaded is not None - else: - # Sharded - loaded = [ - torch.load(os.path.join(input_model_path, f"consolidated.{i:02d}.pth"), map_location="cpu") - for i in range(num_shards) - ] - - # permute for sliced rotary - def permute(w, n_heads, dim1=dim, dim2=dim): - return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) - - # Load weights to the state dict - state_dict = {} - for layer_i in range(n_layers): - if num_shards == 1: - # Unsharded - state_dict.update( - { - f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wq.weight"], n_heads=n_heads - ), - f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wk.weight"], - n_heads=num_key_value_heads, - dim1=key_value_dim, - ), - f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], - f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], - f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], - f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], - f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], - f"model.layers.{layer_i}.input_layernorm.weight": loaded[ - f"layers.{layer_i}.attention_norm.weight" - ], - f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[ - f"layers.{layer_i}.ffn_norm.weight" - ], - } - ) - # qk_layernorm (see https://github.com/huggingface/transformers/pull/31534#issuecomment-2207354677) - state_dict[f"model.layers.{layer_i}.self_attn.q_norm.weight"] = ( - loaded[f"layers.{layer_i}.attention.q_normalization.weight"] - .view(dims_per_head // 2, 2) - .t() - .reshape(1, -1) - .repeat_interleave(n_heads, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.q_norm.bias"] = ( - loaded[f"layers.{layer_i}.attention.q_normalization.bias"] - .view(dims_per_head // 2, 2) - .t() - .reshape(1, -1) - .repeat_interleave(n_heads, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_norm.weight"] = ( - loaded[f"layers.{layer_i}.attention.k_normalization.weight"] - .view(dims_per_head // 2, 2) - .t() - .reshape(1, -1) - .repeat_interleave(num_key_value_heads, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_norm.bias"] = ( - loaded[f"layers.{layer_i}.attention.k_normalization.bias"] - .view(dims_per_head // 2, 2) - .t() - .reshape(1, -1) - .repeat_interleave(num_key_value_heads, 0) - ) - - else: - # Sharded - state_dict.update( - { - f"model.layers.{layer_i}.input_layernorm.weight": torch.stack( - [l[f"layers.{layer_i}.attention_norm.weight"] for l in loaded] - ).mean(dim=0), - f"model.layers.{layer_i}.post_attention_layernorm.weight": torch.stack( - [l[f"layers.{layer_i}.ffn_norm.weight"] for l in loaded] - ).mean(dim=0), - } - ) - state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) - for i in range(num_shards) - ], - dim=0, - ).reshape(dim, dim), - n_heads=n_heads, - ) - - state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( - torch.cat( - [ - 
loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( - num_local_key_value_heads, dims_per_head, dim - ) - for i in range(num_shards) - ], - dim=0, - ).reshape(key_value_dim, dim), - n_heads=num_key_value_heads, - dim1=key_value_dim, - ) - - # qk_layernorm (see https://github.com/huggingface/transformers/pull/31534#issuecomment-2207354677) - state_dict[f"model.layers.{layer_i}.self_attn.q_norm.weight"] = ( - torch.cat([l[f"layers.{layer_i}.attention.q_normalization.weight"].unsqueeze(0) for l in loaded]) - .view(num_shards, dims_per_head // 2, 2) - .transpose(1, 2) - .reshape(num_shards, -1) - .repeat_interleave(n_heads // num_shards, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.q_norm.bias"] = ( - torch.cat([l[f"layers.{layer_i}.attention.q_normalization.bias"].unsqueeze(0) for l in loaded]) - .view(num_shards, dims_per_head // 2, 2) - .transpose(1, 2) - .reshape(num_shards, -1) - .repeat_interleave(n_heads // num_shards, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_norm.weight"] = ( - torch.cat([l[f"layers.{layer_i}.attention.k_normalization.weight"].unsqueeze(0) for l in loaded]) - .view(num_shards, dims_per_head // 2, 2) - .transpose(1, 2) - .reshape(num_shards, -1) - .repeat_interleave(num_key_value_heads // num_shards, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_norm.bias"] = ( - torch.cat([l[f"layers.{layer_i}.attention.k_normalization.bias"].unsqueeze(0) for l in loaded]) - .view(num_shards, dims_per_head // 2, 2) - .transpose(1, 2) - .reshape(num_shards, -1) - .repeat_interleave(num_key_value_heads // num_shards, 0) - ) - - state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( - num_local_key_value_heads, dims_per_head, dim - ) - for i in range(num_shards) - ], - dim=0, - ).reshape(key_value_dim, dim) - - state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 - ) - state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 - ) - - if num_shards == 1: - # Unsharded - state_dict.update( - { - "model.embed_tokens.weight": loaded["tok_embeddings.weight"], - "model.norm.weight": loaded["norm.weight"], - "lm_head.weight": loaded["output.weight"], - } - ) - else: - state_dict.update( - { - "model.embed_tokens.weight": torch.cat( - [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1 - ), - "model.norm.weight": torch.stack([loaded[i]["norm.weight"] for i in range(num_shards)]).mean(dim=0), - "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), - } - ) - - # Load VQGAN weights - vqgan_path = os.path.join(input_base_path, "tokenizer/vqgan.ckpt") - vqgan_state_dict = torch.load(vqgan_path, map_location="cpu")["state_dict"] - for k, v in vqgan_state_dict.items(): - if "decoder" in k: - continue # we dont do image generation yet - state_dict[f"model.vqmodel.{k}"] = v - - # Write configs - ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params 
else 1 - multiple_of = params["multiple_of"] if "multiple_of" in params else 256 - - with open(os.path.join(input_base_path, "tokenizer/text_tokenizer.json")) as tokenizer_file: - tokenizer_config = json.load(tokenizer_file) - vocabulary_map = tokenizer_config["model"]["vocab"] - vocabulary_map[""] = vocabulary_map[ - "" - ] # use a reserved token instead of adding a new one - del vocabulary_map[""] - - for token in tokenizer_config["added_tokens"]: - if token["content"] == "": - token["content"] = "" - - with open(os.path.join(input_base_path, "tokenizer/text_tokenizer_modified.json"), "w") as f: - json.dump(tokenizer_config, f) # save the new file to init tokenizer later - - vq_keys_to_replace = [ - ("ch", "base_channels"), - ("out_ch", "out_channels"), - ("n_embed", "num_embeddings"), - ("ch_mult", "channel_multiplier"), - ("double_z", "double_latent"), - ("z_channels", "latent_channels"), - ] - with open(os.path.join(input_base_path, "tokenizer/vqgan.yaml")) as vqgan_cfg_file: - vq_config = yaml.safe_load(vqgan_cfg_file)["model"]["params"] - vq_config.update(**vq_config["ddconfig"]) - for old, new in vq_keys_to_replace: - vq_config[new] = vq_config[old] - del vq_config["ddconfig"] - del vq_config["ckpt_path"] - del vq_config["lossconfig"] - - config = ChameleonConfig( - hidden_size=dim, - intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), - num_attention_heads=params["n_heads"], - num_hidden_layers=params["n_layers"], - rms_norm_eps=params["norm_eps"], - num_key_value_heads=num_key_value_heads, - vocab_size=VOCAB_SIZE, - rope_theta=base, - max_position_embeddings=max_position_embeddings, - model_parallel_size=model_parallel_size, - swin_norm=swin_norm, - vq_config=vq_config, - vocabulary_map=vocabulary_map, - ) - with init_empty_weights(): - model = ChameleonForConditionalGeneration(config) - - model.load_state_dict(state_dict, assign=True, strict=False) - model.save_pretrained(model_path, safe_serialization=True) - - # Load and save the processor - tokenizer = LlamaTokenizerFast( - tokenizer_file=os.path.join(input_base_path, "tokenizer/text_tokenizer_modified.json"), legacy=False - ) - tokenizer.sep_token_id = 8710 # assign to sep so that we can append it after input text - tokenizer.pad_token_id = 1 # assing to special pad_token - image_processor = ChameleonImageProcessor() - processor = ChameleonProcessor(image_processor=image_processor, tokenizer=tokenizer) - processor.save_pretrained(model_path) - - # Make space so we can load the model properly now. - del state_dict - del loaded - del vqgan_state_dict - gc.collect() - - # Short inference on a few examples to check if generation makes sense - # taken from https://github.com/facebookresearch/chameleon/blob/7a72f40aa5f462965c8374f25257f55b65b25ff4/data/prompts_for_human_evaluations.jsonl - print("Loading the checkpoint in a Chameleon model...") - print("*" * 100) - model = ChameleonForConditionalGeneration.from_pretrained( - model_path, attn_implementation="eager", torch_dtype=torch.bfloat16, device_map="auto" - ) - processor = ChameleonProcessor.from_pretrained(model_path) - - prompt = "I'm very intrigued by this work of art:Please tell me about the artist." 
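One pattern worth noting in the Chameleon converter above: the HF model is instantiated under `init_empty_weights()` so no memory is spent on throwaway random weights, and the converted tensors are then attached with `load_state_dict(..., assign=True)`. A self-contained sketch of that pattern on a toy module (a stand-in, not the Chameleon model; assumes `accelerate` is installed and `torch>=2.1`, which introduced `assign=`):

```python
import torch
from torch import nn
from accelerate import init_empty_weights

with init_empty_weights():
    toy = nn.Linear(4, 4)        # parameters are created on the "meta" device, no storage allocated
print(toy.weight.device)         # meta

converted = {"weight": torch.randn(4, 4), "bias": torch.zeros(4)}
toy.load_state_dict(converted, assign=True)  # assign=True swaps the meta tensors for the real ones
print(toy.weight.device)         # cpu, now backed by the converted tensors
```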
- image = Image.open( - requests.get( - "https://uploads4.wikiart.org/images/paul-klee/death-for-the-idea-1915.jpg!Large.jpg", stream=True - ).raw - ) - inputs = processor(prompt, images=image, return_tensors="pt").to(model.device, torch.bfloat16) - length = inputs.input_ids.shape[1] - - out = model.generate(**inputs, max_new_tokens=40, do_sample=False) - generated_text = processor.batch_decode(out[:, length:], skip_special_tokens=True)[0] - - print(f"Generation for single-image: {generated_text}") - print("*" * 100) - - # Multi-image example - prompt = "I used to know a lot about constellations when I was younger, but as I grew older, I forgot most of what I knew. These are the only two constellations that I really remember now.I would like for you to tell me about 3 more constellations and give me a little bit of history about the constellation." - image = Image.open( - requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw - ) - image_2 = Image.open( - requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw - ) - - inputs = processor(prompt, images=[image, image_2], return_tensors="pt").to(model.device, dtype=torch.bfloat16) - length = inputs.input_ids.shape[1] - out = model.generate(**inputs, max_new_tokens=50, do_sample=False) - generated_text = processor.batch_decode(out[:, length:], skip_special_tokens=True)[0] - - print(f"Generation for multi-image: {generated_text}") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - help="Location of Chameleon weights", - ) - parser.add_argument( - "--model_size", - choices=["7B", "30B"], - help="" - " models correspond to the finetuned versions, and are specific to the Chameleon official release. For more details on Chameleon, checkout the original repo: https://github.com/facebookresearch/chameleon", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model", - ) - parser.add_argument( - "--test_inference", - action="store_true", - help="Whether to load the model for generation to test it's converted correctly.", - ) - # Different Chameleon versions used different default values for max_position_embeddings, hence the need to be able to specify which version is being used. - parser.add_argument( - "--chameleon_version", - choices=[1], - default=1, - type=int, - help="Version of the Chameleon model to convert", - ) - args = parser.parse_args() - write_model( - model_path=args.output_dir, - input_base_path=args.input_dir, - model_size=args.model_size, - chameleon_version=args.chameleon_version, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py b/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py deleted file mode 100644 index 02c4b7b754b2..000000000000 --- a/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py +++ /dev/null @@ -1,134 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse - -import torch - -from transformers import ChineseCLIPConfig, ChineseCLIPModel - - -def copy_attn_layer(hf_attn_layer, pt_weights, prefix): - q_proj, k_proj, v_proj = pt_weights[f"{prefix}.in_proj_weight"].chunk(3, dim=0) - q_proj_bias, k_proj_bias, v_proj_bias = pt_weights[f"{prefix}.in_proj_bias"].chunk(3, dim=0) - - out_proj_weights = pt_weights[f"{prefix}.out_proj.weight"] - out_proj_bias = pt_weights[f"{prefix}.out_proj.bias"] - - hf_attn_layer.q_proj.weight.data = q_proj - hf_attn_layer.q_proj.bias.data = q_proj_bias - - hf_attn_layer.k_proj.weight.data = k_proj - hf_attn_layer.k_proj.bias.data = k_proj_bias - - hf_attn_layer.v_proj.weight.data = v_proj - hf_attn_layer.v_proj.bias.data = v_proj_bias - - hf_attn_layer.out_proj.weight.data = out_proj_weights - hf_attn_layer.out_proj.bias.data = out_proj_bias - - -def copy_mlp(hf_mlp, pt_weights, prefix): - copy_linear(hf_mlp.fc1, pt_weights, f"{prefix}.c_fc") - copy_linear(hf_mlp.fc2, pt_weights, f"{prefix}.c_proj") - - -def copy_linear(hf_linear, pt_weights, prefix): - hf_linear.weight.data = pt_weights[f"{prefix}.weight"].data - hf_linear.bias.data = pt_weights[f"{prefix}.bias"].data - - -def copy_layer(hf_layer, pt_weights, prefix): - # copy layer norms - copy_linear(hf_layer.layer_norm1, pt_weights, f"{prefix}.ln_1") - copy_linear(hf_layer.layer_norm2, pt_weights, f"{prefix}.ln_2") - - # copy MLP - copy_mlp(hf_layer.mlp, pt_weights, f"{prefix}.mlp") - - # copy attn - copy_attn_layer(hf_layer.self_attn, pt_weights, f"{prefix}.attn") - - -def copy_layers(hf_layers, pt_weights, prefix): - for layer_id, hf_layer in enumerate(hf_layers): - copy_layer(hf_layer, pt_weights, f"{prefix}.{layer_id}") - - -def copy_text_model_and_projection(hf_model, pt_weights): - # copy projection - hf_model.text_projection.weight.data = pt_weights["text_projection"].data.T - - # copy text encoder - for name, param in hf_model.text_model.named_parameters(): - param.data = pt_weights[f"bert.{name}"].data - - -def copy_vision_model_and_projection(hf_model, pt_weights): - # copy projection - hf_model.visual_projection.weight.data = pt_weights["visual.proj"].data.T - - # copy layer norms - copy_linear(hf_model.vision_model.pre_layrnorm, pt_weights, "visual.ln_pre") - copy_linear(hf_model.vision_model.post_layernorm, pt_weights, "visual.ln_post") - - # copy embeddings - hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_weights["visual.conv1.weight"].data - hf_model.vision_model.embeddings.class_embedding.data = pt_weights["visual.class_embedding"].data - hf_model.vision_model.embeddings.position_embedding.weight.data = pt_weights["visual.positional_embedding"].data - - # copy encoder - copy_layers(hf_model.vision_model.encoder.layers, pt_weights, "visual.transformer.resblocks") - - -@torch.no_grad() -def convert_chinese_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. - """ - - assert config_path is not None, "Please specify the ChineseCLIP model config of the corresponding model size." 
- config = ChineseCLIPConfig.from_pretrained(config_path) - - hf_model = ChineseCLIPModel(config).eval() - - pt_weights = torch.load(checkpoint_path, map_location="cpu")["state_dict"] - pt_weights = {(name[7:] if name.startswith("module.") else name): value for name, value in pt_weights.items()} - - copy_text_model_and_projection(hf_model, pt_weights) - copy_vision_model_and_projection(hf_model, pt_weights) - hf_model.logit_scale.data = pt_weights["logit_scale"].data - - hf_model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output folder storing converted hf PyTorch model.", - ) - parser.add_argument( - "--checkpoint_path", default=None, type=str, help="Path to original github format ChineseCLIP checkpoint." - ) - parser.add_argument( - "--config_path", default=None, required=True, type=str, help="Path to hf config.json of model to convert." - ) - args = parser.parse_args() - - convert_chinese_clip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) - print("The conversion is finished!") diff --git a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py deleted file mode 100644 index d422bc45ab3d..000000000000 --- a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py +++ /dev/null @@ -1,133 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
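The Chinese-CLIP converter above maps the original fused attention projection onto the separate `q_proj`/`k_proj`/`v_proj` layers of the HF model by chunking `in_proj_weight` and `in_proj_bias` into three equal slices. A small self-contained illustration of that split, using a plain `torch.nn.MultiheadAttention` as a stand-in for the original checkpoint weights:

```python
import torch
from torch import nn

attn = nn.MultiheadAttention(embed_dim=8, num_heads=2)

# The fused projection has shape (3 * embed_dim, embed_dim); chunk it into q, k, v parts.
q_w, k_w, v_w = attn.in_proj_weight.chunk(3, dim=0)
q_b, k_b, v_b = attn.in_proj_bias.chunk(3, dim=0)

q_proj = nn.Linear(8, 8)
q_proj.weight.data = q_w
q_proj.bias.data = q_b

x = torch.randn(2, 8)
# The standalone q_proj now reproduces the query part of the fused projection.
assert torch.allclose(q_proj(x), x @ q_w.T + q_b, atol=1e-6)
```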
- -import argparse -import re - -from laion_clap import CLAP_Module - -from transformers import AutoFeatureExtractor, ClapConfig, ClapModel - - -KEYS_TO_MODIFY_MAPPING = { - "text_branch": "text_model", - "audio_branch": "audio_model.audio_encoder", - "attn": "attention.self", - "self.proj": "output.dense", - "attention.self_mask": "attn_mask", - "mlp.fc1": "intermediate.dense", - "mlp.fc2": "output.dense", - "norm1": "layernorm_before", - "norm2": "layernorm_after", - "bn0": "batch_norm", -} - -processor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused", truncation="rand_trunc") - - -def init_clap(checkpoint_path, model_type, enable_fusion=False): - model = CLAP_Module( - amodel=model_type, - enable_fusion=enable_fusion, - ) - model.load_ckpt(checkpoint_path) - return model - - -def get_config_from_original(clap_model): - audio_config = { - "patch_embeds_hidden_size": clap_model.model.audio_branch.embed_dim, - "depths": clap_model.model.audio_branch.depths, - "hidden_size": clap_model.model.audio_projection[0].in_features, - } - - text_config = {"hidden_size": clap_model.model.text_branch.pooler.dense.in_features} - - return ClapConfig(audio_config=audio_config, text_config=text_config) - - -def rename_state_dict(state_dict): - model_state_dict = {} - - sequential_layers_pattern = r".*sequential.(\d+).*" - text_projection_pattern = r".*_projection.(\d+).*" - - for key, value in state_dict.items(): - # check if any key needs to be modified - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - if re.match(sequential_layers_pattern, key): - # replace sequential layers with list - sequential_layer = re.match(sequential_layers_pattern, key).group(1) - - key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer)//3}.linear.") - elif re.match(text_projection_pattern, key): - projecton_layer = int(re.match(text_projection_pattern, key).group(1)) - - # Because in CLAP they use `nn.Sequential`... 
- transformers_projection_layer = 1 if projecton_layer == 0 else 2 - - key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.") - - if "audio" and "qkv" in key: - # split qkv into query key and value - mixed_qkv = value - qkv_dim = mixed_qkv.size(0) // 3 - - query_layer = mixed_qkv[:qkv_dim] - key_layer = mixed_qkv[qkv_dim : qkv_dim * 2] - value_layer = mixed_qkv[qkv_dim * 2 :] - - model_state_dict[key.replace("qkv", "query")] = query_layer - model_state_dict[key.replace("qkv", "key")] = key_layer - model_state_dict[key.replace("qkv", "value")] = value_layer - else: - model_state_dict[key] = value - - return model_state_dict - - -def convert_clap_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path, model_type, enable_fusion=False): - clap_model = init_clap(checkpoint_path, model_type, enable_fusion=enable_fusion) - - clap_model.eval() - state_dict = clap_model.model.state_dict() - state_dict = rename_state_dict(state_dict) - - transformers_config = get_config_from_original(clap_model) - transformers_config.audio_config.enable_fusion = enable_fusion - model = ClapModel(transformers_config) - - # ignore the spectrogram embedding layer - model.load_state_dict(state_dict, strict=False) - - model.save_pretrained(pytorch_dump_folder_path) - transformers_config.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - parser.add_argument("--enable_fusion", action="store_true", help="Whether to enable fusion or not") - parser.add_argument("--model_type", default="HTSAT-tiny", type=str, help="Whether to enable fusion or not") - args = parser.parse_args() - - convert_clap_checkpoint( - args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.model_type, args.enable_fusion - ) diff --git a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py b/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py deleted file mode 100644 index 3d88fc1929c3..000000000000 --- a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
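In the CLAP converter above, keys that live under the original `nn.Sequential` blocks are re-indexed onto flat `layers.N.linear` modules, and each fused `qkv` tensor is split into separate query/key/value entries (note that the guard `if "audio" and "qkv" in key:` reduces to `if "qkv" in key`, since a non-empty string literal is always truthy). A short, runnable sketch of the regex-based re-indexing step, using a hypothetical key name in the same format:

```python
import re

sequential_layers_pattern = r".*sequential.(\d+).*"

key = "audio_model.audio_encoder.sequential.3.weight"  # hypothetical example key
match = re.match(sequential_layers_pattern, key)
if match:
    idx = int(match.group(1))
    # the deleted script maps Sequential index n onto layers.{n // 3}.linear
    key = key.replace(f"sequential.{idx}.", f"layers.{idx // 3}.linear.")

print(key)  # audio_model.audio_encoder.layers.1.linear.weight
```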
- -import argparse - -import torch -from clip import load - -from transformers import CLIPConfig, CLIPModel - - -def copy_attn_layer(hf_attn_layer, pt_attn_layer): - q_proj, k_proj, v_proj = pt_attn_layer.in_proj_weight.chunk(3, dim=0) - q_proj_bias, k_proj_bias, v_proj_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0) - - out_proj_weights = pt_attn_layer.out_proj.weight - out_proj_bias = pt_attn_layer.out_proj.bias - - hf_attn_layer.q_proj.weight.data = q_proj - hf_attn_layer.q_proj.bias.data = q_proj_bias - - hf_attn_layer.k_proj.weight.data = k_proj - hf_attn_layer.k_proj.bias.data = k_proj_bias - - hf_attn_layer.v_proj.weight.data = v_proj - hf_attn_layer.v_proj.bias.data = v_proj_bias - - hf_attn_layer.out_proj.weight = out_proj_weights - hf_attn_layer.out_proj.bias = out_proj_bias - - -def copy_mlp(hf_mlp, pt_mlp): - copy_linear(hf_mlp.fc1, pt_mlp.c_fc) - copy_linear(hf_mlp.fc2, pt_mlp.c_proj) - - -def copy_linear(hf_linear, pt_linear): - hf_linear.weight = pt_linear.weight - hf_linear.bias = pt_linear.bias - - -def copy_layer(hf_layer, pt_layer): - # copy layer norms - copy_linear(hf_layer.layer_norm1, pt_layer.ln_1) - copy_linear(hf_layer.layer_norm2, pt_layer.ln_2) - - # copy MLP - copy_mlp(hf_layer.mlp, pt_layer.mlp) - - # copy attn - copy_attn_layer(hf_layer.self_attn, pt_layer.attn) - - -def copy_layers(hf_layers, pt_layers): - for hf_layer, pt_layer in zip(hf_layers, pt_layers): - copy_layer(hf_layer, pt_layer) - - -def copy_encoder(hf_encoder, pt_model): - # copy embeds - hf_encoder.embeddings.token_embedding.weight = pt_model.token_embedding.weight - hf_encoder.embeddings.position_embedding.weight.data = pt_model.positional_embedding - - # copy layer norm - copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final) - - # copy hidden layers - copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks) - - -def copy_text_model_and_projection(hf_model, pt_model): - # copy projection - hf_model.text_projection.weight.data = pt_model.text_projection.data.T.contiguous() - - # copy text encoder - copy_encoder(hf_model.text_model, pt_model) - - -def copy_vison_model_and_projection(hf_model, pt_model): - # copy projection - hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T.contiguous() - - # copy layer norms - copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre) - copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post) - - # copy embeds - hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data - hf_model.vision_model.embeddings.class_embedding = pt_model.visual.class_embedding - hf_model.vision_model.embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data - - # copy encoder - copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks) - - -@torch.no_grad() -def convert_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. 
- """ - if config_path is not None: - config = CLIPConfig.from_pretrained(config_path) - else: - config = CLIPConfig(projection_dim=512, text_config={}, vision_config={}) - - hf_model = CLIPModel(config).eval() - - pt_model, _ = load(checkpoint_path, device="cpu", jit=False) - pt_model = pt_model.eval() - - copy_text_model_and_projection(hf_model, pt_model) - copy_vison_model_and_projection(hf_model, pt_model) - hf_model.logit_scale = pt_model.logit_scale - - # Use `eos_token` so the example is more meaningful - input_ids = torch.tensor( - [ - [config.text_config.bos_token_id] - + list(range(3, 77)) - + [config.text_config.eos_token_id] - + [config.text_config.pad_token_id] - ] - ) - pixel_values = torch.randn(1, 3, 224, 224) - - hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values, return_dict=True) - hf_logits_per_image = hf_outputs.logits_per_image - hf_logits_per_text = hf_outputs.logits_per_text - pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids) - - assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3) - assert torch.allclose(hf_logits_per_text, pt_logits_per_text, atol=1e-3) - - hf_model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to OpenAI checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_clip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py deleted file mode 100644 index c614d61e5b3d..000000000000 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ /dev/null @@ -1,264 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert CLIPSeg checkpoints from the original repository. 
URL: https://github.com/timojl/clipseg.""" - -import argparse - -import requests -import torch -from PIL import Image - -from transformers import ( - CLIPSegConfig, - CLIPSegForImageSegmentation, - CLIPSegProcessor, - CLIPSegTextConfig, - CLIPSegVisionConfig, - CLIPTokenizer, - ViTImageProcessor, -) - - -def get_clipseg_config(model_name): - text_config = CLIPSegTextConfig() - vision_config = CLIPSegVisionConfig(patch_size=16) - - use_complex_transposed_convolution = True if "refined" in model_name else False - reduce_dim = 16 if "rd16" in model_name else 64 - - config = CLIPSegConfig.from_text_vision_configs( - text_config, - vision_config, - use_complex_transposed_convolution=use_complex_transposed_convolution, - reduce_dim=reduce_dim, - ) - return config - - -def rename_key(name): - # update prefixes - if "clip_model" in name: - name = name.replace("clip_model", "clip") - if "transformer" in name: - if "visual" in name: - name = name.replace("visual.transformer", "vision_model") - else: - name = name.replace("transformer", "text_model") - if "resblocks" in name: - name = name.replace("resblocks", "encoder.layers") - if "ln_1" in name: - name = name.replace("ln_1", "layer_norm1") - if "ln_2" in name: - name = name.replace("ln_2", "layer_norm2") - if "c_fc" in name: - name = name.replace("c_fc", "fc1") - if "c_proj" in name: - name = name.replace("c_proj", "fc2") - if "attn" in name and "self" not in name: - name = name.replace("attn", "self_attn") - # text encoder - if "token_embedding" in name: - name = name.replace("token_embedding", "text_model.embeddings.token_embedding") - if "positional_embedding" in name and "visual" not in name: - name = name.replace("positional_embedding", "text_model.embeddings.position_embedding.weight") - if "ln_final" in name: - name = name.replace("ln_final", "text_model.final_layer_norm") - # vision encoder - if "visual.class_embedding" in name: - name = name.replace("visual.class_embedding", "vision_model.embeddings.class_embedding") - if "visual.conv1" in name: - name = name.replace("visual.conv1", "vision_model.embeddings.patch_embedding") - if "visual.positional_embedding" in name: - name = name.replace("visual.positional_embedding", "vision_model.embeddings.position_embedding.weight") - if "visual.ln_pre" in name: - name = name.replace("visual.ln_pre", "vision_model.pre_layrnorm") - if "visual.ln_post" in name: - name = name.replace("visual.ln_post", "vision_model.post_layernorm") - # projection layers - if "visual.proj" in name: - name = name.replace("visual.proj", "visual_projection.weight") - if "text_projection" in name: - name = name.replace("text_projection", "text_projection.weight") - # decoder - if "trans_conv" in name: - name = name.replace("trans_conv", "transposed_convolution") - if "film_mul" in name or "film_add" in name or "reduce" in name or "transposed_convolution" in name: - name = "decoder." 
+ name - if "blocks" in name: - name = name.replace("blocks", "decoder.layers") - if "linear1" in name: - name = name.replace("linear1", "mlp.fc1") - if "linear2" in name: - name = name.replace("linear2", "mlp.fc2") - if "norm1" in name and "layer_" not in name: - name = name.replace("norm1", "layer_norm1") - if "norm2" in name and "layer_" not in name: - name = name.replace("norm2", "layer_norm2") - - return name - - -def convert_state_dict(orig_state_dict, config): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if key.startswith("clip_model") and "attn.in_proj" in key: - key_split = key.split(".") - if "visual" in key: - layer_num = int(key_split[4]) - dim = config.vision_config.hidden_size - prefix = "vision_model" - else: - layer_num = int(key_split[3]) - dim = config.text_config.hidden_size - prefix = "text_model" - - if "weight" in key: - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ - dim : dim * 2, : - ] - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] - else: - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] - elif "self_attn" in key and "out_proj" not in key: - key_split = key.split(".") - layer_num = int(key_split[1]) - dim = config.reduce_dim - if "weight" in key: - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[dim : dim * 2, :] - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] - else: - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] - else: - new_name = rename_key(key) - if "visual_projection" in new_name or "text_projection" in new_name: - val = val.T - orig_state_dict[new_name] = val - - return orig_state_dict - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - return image - - -def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path, push_to_hub): - config = get_clipseg_config(model_name) - model = CLIPSegForImageSegmentation(config) - model.eval() - - state_dict = torch.load(checkpoint_path, map_location="cpu") - - # remove some keys - for key in state_dict.copy().keys(): - if key.startswith("model"): - state_dict.pop(key, None) - - # rename some keys - state_dict = convert_state_dict(state_dict, config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - - if missing_keys != ["clip.text_model.embeddings.position_ids", "clip.vision_model.embeddings.position_ids"]: - raise ValueError("Missing keys that are not expected: {}".format(missing_keys)) - if unexpected_keys != ["decoder.reduce.weight", "decoder.reduce.bias"]: - raise ValueError(f"Unexpected keys: {unexpected_keys}") - - image_processor = ViTImageProcessor(size=352) - 
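The CLIPSeg converter above relies on `load_state_dict(..., strict=False)` returning the lists of missing and unexpected keys, which it then compares against the exact names it expects. A tiny self-contained example of that return value on a toy module:

```python
import torch
from torch import nn

model = nn.Sequential(nn.Linear(4, 4))

# one key is absent ("0.bias") and one stray key is present ("extra.weight")
state_dict = {"0.weight": torch.randn(4, 4), "extra.weight": torch.randn(4, 4)}
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

print(missing_keys)     # ['0.bias']
print(unexpected_keys)  # ['extra.weight']
```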
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") - processor = CLIPSegProcessor(image_processor=image_processor, tokenizer=tokenizer) - - image = prepare_img() - text = ["a glass", "something to fill", "wood", "a jar"] - - inputs = processor(text=text, images=[image] * len(text), padding="max_length", return_tensors="pt") - - with torch.no_grad(): - outputs = model(**inputs) - - # verify values - expected_conditional = torch.tensor([0.1110, -0.1882, 0.1645]) - expected_pooled_output = torch.tensor([0.2692, -0.7197, -0.1328]) - if model_name == "clipseg-rd64-refined": - expected_masks_slice = torch.tensor( - [[-10.0407, -9.9431, -10.2646], [-9.9751, -9.7064, -9.9586], [-9.6891, -9.5645, -9.9618]] - ) - elif model_name == "clipseg-rd64": - expected_masks_slice = torch.tensor( - [[-7.2877, -7.2711, -7.2463], [-7.2652, -7.2780, -7.2520], [-7.2239, -7.2204, -7.2001]] - ) - elif model_name == "clipseg-rd16": - expected_masks_slice = torch.tensor( - [[-6.3955, -6.4055, -6.4151], [-6.3911, -6.4033, -6.4100], [-6.3474, -6.3702, -6.3762]] - ) - else: - raise ValueError(f"Model name {model_name} not supported.") - - assert torch.allclose(outputs.logits[0, :3, :3], expected_masks_slice, atol=1e-3) - assert torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3) - assert torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and processor for {model_name} to the hub") - model.push_to_hub(f"CIDAS/{model_name}") - processor.push_to_hub(f"CIDAS/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="clipseg-rd64", - type=str, - choices=["clipseg-rd16", "clipseg-rd64", "clipseg-rd64-refined"], - help=( - "Name of the model. Supported models are: clipseg-rd64, clipseg-rd16 and clipseg-rd64-refined (rd meaning" - " reduce dimension)" - ), - ) - parser.add_argument( - "--checkpoint_path", - default="/Users/nielsrogge/Documents/CLIPSeg/clip_plus_rd64-uni.pth", - type=str, - help=( - "Path to the original checkpoint. Note that the script assumes that the checkpoint includes both CLIP and" - " the decoder weights." - ), - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - - args = parser.parse_args() - convert_clipseg_checkpoint(args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/clvp/convert_clvp_to_hf.py b/src/transformers/models/clvp/convert_clvp_to_hf.py deleted file mode 100644 index 4ae6fd425497..000000000000 --- a/src/transformers/models/clvp/convert_clvp_to_hf.py +++ /dev/null @@ -1,234 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Weights conversion script for CLVP -""" - -import argparse -import os - -import torch -from huggingface_hub import hf_hub_download - -from transformers import ClvpConfig, ClvpModelForConditionalGeneration - - -_MODELS = { - "clvp": "https://huggingface.co/jbetker/tortoise-tts-v2/blob/main/.models/clvp2.pth", - "decoder": "https://huggingface.co/jbetker/tortoise-tts-v2/blob/main/.models/autoregressive.pth", -} - -dim = 1024 -sub_dim = dim // 16 - -CLVP_ENCODERS_MAPPING = { - "text_transformer.transformer.attn_layers": "text_encoder_model", - "speech_transformer.transformer.attn_layers": "speech_encoder_model", - "text_transformer.transformer.norm": "text_encoder_model.final_layer_norm", - "speech_transformer.transformer.norm": "speech_encoder_model.final_layer_norm", - "to_text_latent": "text_encoder_model.projection", - "to_speech_latent": "speech_encoder_model.projection", - "text_emb": "text_encoder_model.token_embedding", - "speech_emb": "speech_encoder_model.token_embedding", - "1.wrap.net.0": "mlp.fc1", - "1.wrap.net.3": "mlp.fc2", - "1.wrap": "self_attn", - "to_out": "out_proj", - "to_q": "q_proj", - "to_k": "k_proj", - "to_v": "v_proj", - "temperature": "logit_scale", -} - -CLVP_DECODER_MAPPING = { - "conditioning_encoder.init": "conditioning_encoder.mel_conv", - "conditioning_encoder.attn": "conditioning_encoder.mel_attn_blocks", - "mel_attn_blocks": "group_norms", - ".norm.weight": ".weight", - ".norm.bias": ".bias", - "text_embedding": "conditioning_encoder.text_token_embedding", - "text_pos_embedding.emb": "conditioning_encoder.text_position_embedding", - "final_norm": "speech_decoder_model.final_norm", - "mel_head": "speech_decoder_model.lm_head", - "gpt.ln_f": "speech_decoder_model.model.decoder.layer_norm", - "mel_embedding": "speech_decoder_model.model.decoder.input_embeds_layer", - "mel_pos_embedding.emb": "speech_decoder_model.model.decoder.position_embeds_layer", - "gpt.h": "speech_decoder_model.model.decoder.layers", - "ln_1": "input_layernorm", - "ln_2": "post_attention_layernorm", -} - - -def update_index(present_index): - if present_index % 2 == 0: - return int(present_index / 2) - else: - return int((present_index - 1) / 2) - - -def convert_encoder_weights(original_weights): - converted_weights = {} - original_weights_keys = sorted(original_weights.keys()) - for original_key in original_weights_keys: - updated_key = original_key - # for input_rmsnorm.weight and post_attention_rmsnorm.weight - if "0.0.g" in updated_key: - present_index = updated_key.split(".")[4] - if int(present_index) % 2 == 0: - updated_key = updated_key.replace("0.0.g", "input_rmsnorm.weight") - else: - updated_key = updated_key.replace("0.0.g", "post_attention_rmsnorm.weight") - - if "transformer.attn_layers.layers" in updated_key: - present_index = updated_key.split(".")[4] - updated_index = update_index(int(present_index)) - updated_key = updated_key.replace( - f"transformer.attn_layers.layers.{present_index}", f"transformer.attn_layers.layers.{updated_index}" - ) - - for k, v in CLVP_ENCODERS_MAPPING.items(): - if k in updated_key: - updated_key = updated_key.replace(k, v) - - 
converted_weights[updated_key] = original_weights.pop(original_key) - - return converted_weights - - -def convert_decoder_weights(original_weights): - converted_weights = {} - original_weights_keys = sorted(original_weights.keys()) - for original_key in original_weights_keys: - updated_key = original_key - if len(updated_key.split(".")) > 3: - index, attr = updated_key.split(".")[2], updated_key.split(".")[-1] - - # for decoder attention - if "attn.c_attn" in updated_key: - if attr == "weight": - slice1, slice2, slice3 = original_weights[updated_key].squeeze(-1).T.split(split_size=dim, dim=0) - else: - slice1, slice2, slice3 = original_weights[updated_key].split(split_size=dim, dim=0) - converted_weights[f"speech_decoder_model.model.decoder.layers.{index}.attn.q_proj.{attr}"] = slice1 - converted_weights[f"speech_decoder_model.model.decoder.layers.{index}.attn.k_proj.{attr}"] = slice2 - converted_weights[f"speech_decoder_model.model.decoder.layers.{index}.attn.v_proj.{attr}"] = slice3 - continue - - if "attn.c_proj" in updated_key: - converted_weights[f"speech_decoder_model.model.decoder.layers.{index}.attn.out_proj.{attr}"] = ( - original_weights[updated_key].squeeze(-1).T - ) - continue - - if "attn.bias" in updated_key or "attn.masked_bias" in updated_key or "text_head" in updated_key: - original_weights.pop(updated_key) - continue - - # conditional encoder attention - if "qkv" in updated_key: - if attr == "weight": - slice1, slice2, slice3 = original_weights[updated_key].squeeze(-1).split(split_size=dim, dim=0) - else: - slice1, slice2, slice3 = original_weights[updated_key].split(split_size=dim, dim=0) - - indices = torch.arange(dim) - index1, index2, index3 = ( - indices.unfold(0, sub_dim, sub_dim * 3).flatten(), - indices[sub_dim:].unfold(0, sub_dim, sub_dim * 3).flatten(), - indices[2 * sub_dim :].unfold(0, sub_dim, sub_dim * 3).flatten(), - ) - - converted_weights[f"conditioning_encoder.mel_attn_blocks.{index}.q_proj.{attr}"] = torch.concatenate( - [slice1[index1], slice2[index3], slice3[index2]], - axis=0, - ) - converted_weights[f"conditioning_encoder.mel_attn_blocks.{index}.k_proj.{attr}"] = torch.concatenate( - [slice1[index2], slice2[index1], slice3[index3]], - axis=0, - ) - converted_weights[f"conditioning_encoder.mel_attn_blocks.{index}.v_proj.{attr}"] = torch.concatenate( - [slice1[index3], slice2[index2], slice3[index1]], - axis=0, - ) - continue - - if "proj_out" in updated_key: - converted_weights[f"conditioning_encoder.mel_attn_blocks.{index}.out_proj.{attr}"] = original_weights[ - updated_key - ].squeeze(-1) - continue - - for k, v in CLVP_DECODER_MAPPING.items(): - if k in updated_key: - updated_key = updated_key.replace(k, v) - - converted_weights[updated_key] = original_weights.pop(original_key) - - return converted_weights - - -def _download(url: str, root: str): - repo_id = f"{url.split('/')[3]}/{url.split('/')[4]}" - filename = f"{url.split('/')[-2]}/{url.split('/')[-1]}" - hf_hub_download( - repo_id=repo_id, - filename=filename, - force_filename=root, - local_dir_use_symlinks=False, - ) - - -def convert_clvp_weights(checkpoint_path, pytorch_dump_folder_path): - converted_checkpoint = {} - - for each_model_name, each_model_url in _MODELS.items(): - each_model_path = os.path.join(checkpoint_path, each_model_url.split("/")[-1]) - if not os.path.exists(each_model_path): - print(f"\n{each_model_name} was not found! 
Downloading it to {each_model_path}") - _download(url=each_model_url, root=each_model_path) - - if each_model_name == "clvp": - clvp_checkpoint = torch.load(each_model_path, map_location="cpu") - else: - decoder_checkpoint = torch.load(each_model_path, map_location="cpu") - - # Converting the weights - converted_checkpoint.update(**convert_encoder_weights(clvp_checkpoint)) - converted_checkpoint.update(**convert_decoder_weights(decoder_checkpoint)) - - config = ClvpConfig.from_pretrained("susnato/clvp_dev") - model = ClvpModelForConditionalGeneration(config) - - model.load_state_dict(converted_checkpoint, strict=True) - model.save_pretrained(pytorch_dump_folder_path) - print(f"Model saved at {pytorch_dump_folder_path}!") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # # Required parameters - parser.add_argument( - "--checkpoint_path", type=str, help="Path to the folder of downloaded checkpoints. (Please enter full path)" - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model. (Please enter full path)", - ) - args = parser.parse_args() - - convert_clvp_weights(args.checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/colpali/convert_colpali_weights_to_hf.py b/src/transformers/models/colpali/convert_colpali_weights_to_hf.py deleted file mode 100644 index 1b30f3f97acd..000000000000 --- a/src/transformers/models/colpali/convert_colpali_weights_to_hf.py +++ /dev/null @@ -1,214 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert ColPali weights from the original repository to the HF model format. - -Original repository: https://github.com/illuin-tech/colpali. 
- -NOTE: This script was originally run using `torch==2.5.1` and with: - -```bash -python src/transformers/models/colpali/convert_colpali_weights_to_hf.py \ - --model_id vidore/colpali-v1.2-merged \ - --revision 89fd9736194236a1ecb7a9ec9b04f537f6f896af \ - --original_vlm_name_or_path google/paligemma-3b-mix-448 \ - --output_dir vidore/colpali-v1.2-hf-internal \ - --push_to_hub - -python src/transformers/models/colpali/convert_colpali_weights_to_hf.py \ - --model_id vidore/colpali-v1.3-merged \ - --revision 5b955e3415a7c5468ab33119d98d6d45c3a5b2c3 \ - --original_vlm_name_or_path google/paligemma-3b-mix-448 \ - --output_dir vidore/colpali-v1.3-hf \ - --push_to_hub -``` -""" - -import argparse -import glob -from pathlib import Path -from typing import Any, Dict, Optional - -import torch -from huggingface_hub import snapshot_download -from safetensors import safe_open - -from transformers import AutoConfig -from transformers.models.colpali import ColPaliForRetrieval -from transformers.models.colpali.configuration_colpali import ColPaliConfig -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -ORIGINAL_DTYPE = torch.bfloat16 - - -def rename_state_dict_keys(state_dict: Dict[str, Any]) -> Dict[str, Any]: - new_state_dict = {} - for key, value in state_dict.items(): - new_key = key - if key.startswith("custom_text_proj"): - new_key = key.replace("custom_text_proj", "embedding_proj_layer") - if key.startswith("model."): - new_key = key.replace("model.", "vlm.", 1) - new_state_dict[new_key] = value - return new_state_dict - - -def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> Dict[str, torch.Tensor]: - directory_path = snapshot_download( - repo_id=model_id, - revision=revision, - allow_patterns=["*.safetensors"], - ) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - # Some weights are tied, so `lm.head`` is not saved. Let's clone to load state dict. - if "lm_head.weight" not in original_state_dict: - original_state_dict["vlm.language_model.lm_head.weight"] = original_state_dict[ - "model.language_model.model.embed_tokens.weight" - ].clone() - - return original_state_dict - - -@torch.no_grad() -def convert_colpali_weights_to_hf( - model_id: str, - output_dir: str, - push_to_hub: bool, - revision: Optional[str] = None, - original_vlm_name_or_path: Optional[str] = None, -): - # Load the original model data - original_config = AutoConfig.from_pretrained( - model_id, - revision=revision, - ) - if original_vlm_name_or_path is not None: - original_config._name_or_path = original_vlm_name_or_path - if hasattr(original_config, "architectures"): - delattr(original_config, "architectures") - - original_state_dict = load_original_state_dict(model_id, revision=revision) - - # Format the state_dict keys - original_state_dict = rename_state_dict_keys(original_state_dict) - - # Create the new config - config = ColPaliConfig( - vlm_config=original_config, - embedding_dim=128, # hardcoded in the original model - ) - config.model_type = "colpali" - config.is_composition = False - - # Load the untrained model - model = ColPaliForRetrieval(config=config).to("cpu").eval() - print("Created model with new config and randomly initialized weights") - - # NOTE: The model was initialized with float32 weights. 
We need to convert it to the desired precision. - # There are two ways to set the model's dtype: - # - Using `model.from_pretrained(..., torch_dtype=dtype_precision)` doesn't convert the hyperparameters to the desired precision. - # - Using `model.to(dtype_precision)` converts all values - including the hyperparameters - to the desired precision. - # The following snippet allows a fine-grained control over the model's dtype, making sure that all - # the new weights' dtypes match the original model. - for param in model.parameters(): - param.data = param.data.to(ORIGINAL_DTYPE) - print(f"Converted the new model weights to `{ORIGINAL_DTYPE}`") - - # Load the original weights - model.load_state_dict(original_state_dict) - print("Loaded original model weights") - - # Tie the weights (following ColPali's `__init__`` step) - if model.vlm.language_model._tied_weights_keys is not None: - model._tied_weights_keys = [f"vlm.language_model.{k}" for k in model.vlm.language_model._tied_weights_keys] - - # Sanity check: ensure all keys are the same - state_dict_keys_old = set(original_state_dict.keys()) - state_dict_keys_new = set(model.state_dict().keys()) - disjoint_keys = state_dict_keys_old.symmetric_difference(state_dict_keys_new) - if disjoint_keys: - raise ValueError(f"Incompatible keys: {disjoint_keys}") - - # Save the model - if push_to_hub: - model.push_to_hub(output_dir, private=True) - print(f"Model pushed to the hub at `{output_dir}`") - else: - Path(output_dir).mkdir(exist_ok=True, parents=True) - model.save_pretrained(output_dir) - print(f"Model saved to `{output_dir}`") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=""" - This script converts the original ColPali model to the HF model format. - - Example usage: - ```bash - python src/transformers/models/colpali/convert_colpali_weights_to_hf.py \ - --model_id vidore/colpali-v1.2-merged \ - --revision 89fd9736194236a1ecb7a9ec9b04f537f6f896af \ - --original_vlm_name_or_path google/paligemma-3b-mix-448 \ - --output_dir vidore/colpali-v1.2-hf \ - --push_to_hub - ``` - """ - ) - parser.add_argument( - "--model_id", - help="Model ID of the original model to convert", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally", - action="store_true", - default=False, - ) - parser.add_argument( - "--revision", - help="Revision of the model to download", - default=None, - ) - parser.add_argument( - "--original_vlm_name_or_path", - help="Name or path of the original VLM backbone model", - default=None, - ) - args = parser.parse_args() - - convert_colpali_weights_to_hf( - model_id=args.model_id, - output_dir=args.output_dir, - push_to_hub=args.push_to_hub, - revision=args.revision, - original_vlm_name_or_path=args.original_vlm_name_or_path, - ) diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 91f00668be69..000000000000 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,324 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Conditional DETR checkpoints.""" - -import argparse -import json -from collections import OrderedDict -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import ( - ConditionalDetrConfig, - ConditionalDetrForObjectDetection, - ConditionalDetrForSegmentation, - ConditionalDetrImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -# here we list all keys to be renamed (original name on the left, our name on the right) -rename_keys = [] -for i in range(6): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight") - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) - # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.cross_attn.out_proj.weight", - f"decoder.layers.{i}.encoder_attn.out_proj.weight", - ) - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.cross_attn.out_proj.bias", - f"decoder.layers.{i}.encoder_attn.out_proj.bias", - ) - ) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) - 
rename_keys.append( - (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) - - # q, k, v projections in self/cross-attention in decoder for conditional DETR - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.weight", f"decoder.layers.{i}.sa_v_proj.weight")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight") - ) - # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.weight", f"decoder.layers.{i}.ca_qpos_proj.weight")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.weight", f"decoder.layers.{i}.ca_v_proj.weight")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight") - ) - - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.bias", f"decoder.layers.{i}.sa_qpos_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.bias", f"decoder.layers.{i}.sa_kpos_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.bias", f"decoder.layers.{i}.sa_v_proj.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", f"decoder.layers.{i}.ca_qcontent_proj.bias") - ) - # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.bias", f"decoder.layers.{i}.ca_qpos_proj.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.bias", f"decoder.layers.{i}.ca_kpos_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.bias", f"decoder.layers.{i}.ca_v_proj.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", 
f"decoder.layers.{i}.ca_qpos_sine_proj.bias") - ) - -# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads -# for conditional DETR, also convert reference point head and query scale MLP -rename_keys.extend( - [ - ("input_proj.weight", "input_projection.weight"), - ("input_proj.bias", "input_projection.bias"), - ("query_embed.weight", "query_position_embeddings.weight"), - ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), - ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), - ("class_embed.weight", "class_labels_classifier.weight"), - ("class_embed.bias", "class_labels_classifier.bias"), - ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), - ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), - ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), - ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), - ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), - ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), - ("transformer.decoder.ref_point_head.layers.0.weight", "decoder.ref_point_head.layers.0.weight"), - ("transformer.decoder.ref_point_head.layers.0.bias", "decoder.ref_point_head.layers.0.bias"), - ("transformer.decoder.ref_point_head.layers.1.weight", "decoder.ref_point_head.layers.1.weight"), - ("transformer.decoder.ref_point_head.layers.1.bias", "decoder.ref_point_head.layers.1.bias"), - ("transformer.decoder.query_scale.layers.0.weight", "decoder.query_scale.layers.0.weight"), - ("transformer.decoder.query_scale.layers.0.bias", "decoder.query_scale.layers.0.bias"), - ("transformer.decoder.query_scale.layers.1.weight", "decoder.query_scale.layers.1.weight"), - ("transformer.decoder.query_scale.layers.1.bias", "decoder.query_scale.layers.1.bias"), - ("transformer.decoder.layers.0.ca_qpos_proj.weight", "decoder.layers.0.ca_qpos_proj.weight"), - ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias"), - ] -) - - -def rename_key(state_dict, old, new): - val = state_dict.pop(old) - state_dict[new] = val - - -def rename_backbone_keys(state_dict): - new_state_dict = OrderedDict() - for key, value in state_dict.items(): - if "backbone.0.body" in key: - new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model") - new_state_dict[new_key] = value - else: - new_state_dict[key] = value - - return new_state_dict - - -def read_in_q_k_v(state_dict, is_panoptic=False): - prefix = "" - if is_panoptic: - prefix = "conditional_detr." 
- - # first: transformer encoder - for i in range(6): - # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our CONDITIONAL_DETR structure. - """ - - # load default config - config = ConditionalDetrConfig() - # set backbone and dilation attributes - if "resnet101" in model_name: - config.backbone = "resnet101" - if "dc5" in model_name: - config.dilation = True - is_panoptic = "panoptic" in model_name - if is_panoptic: - config.num_labels = 250 - else: - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load image processor - format = "coco_panoptic" if is_panoptic else "coco_detection" - image_processor = ConditionalDetrImageProcessor(format=format) - - # prepare image - img = prepare_img() - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info(f"Converting model {model_name}...") - - # load original model from torch hub - conditional_detr = torch.hub.load("DeppMeng/ConditionalDETR", model_name, pretrained=True).eval() - state_dict = conditional_detr.state_dict() - # rename keys - for src, dest in rename_keys: - if is_panoptic: - src = "conditional_detr." + src - rename_key(state_dict, src, dest) - state_dict = rename_backbone_keys(state_dict) - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict, is_panoptic=is_panoptic) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "conditional_detr.model." if is_panoptic else "model." - for key in state_dict.copy().keys(): - if is_panoptic: - if ( - key.startswith("conditional_detr") - and not key.startswith("class_labels_classifier") - and not key.startswith("bbox_predictor") - ): - val = state_dict.pop(key) - state_dict["conditional_detr.model" + key[4:]] = val - elif "class_labels_classifier" in key or "bbox_predictor" in key: - val = state_dict.pop(key) - state_dict["conditional_detr." 
+ key] = val - elif key.startswith("bbox_attention") or key.startswith("mask_head"): - continue - else: - val = state_dict.pop(key) - state_dict[prefix + key] = val - else: - if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = ConditionalDetrForSegmentation(config) if is_panoptic else ConditionalDetrForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - model.push_to_hub(repo_id=model_name, organization="DepuMeng", commit_message="Add model") - # verify our conversion - original_outputs = conditional_detr(pixel_values) - outputs = model(pixel_values) - assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4) - assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4) - if is_panoptic: - assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) - - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", - default="conditional_detr_resnet50", - type=str, - help="Name of the CONDITIONAL_DETR model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - args = parser.parse_args() - convert_conditional_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py b/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py deleted file mode 100644 index 3d4ff779874b..000000000000 --- a/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py +++ /dev/null @@ -1,57 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
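The Conditional DETR converter above ends with a sanity check: it re-runs the original torch.hub model and the converted model on the same image and compares logits, boxes and (for panoptic checkpoints) masks with `torch.allclose(..., atol=1e-4)`. A generic version of that check, shown purely as an illustration (the function name is a placeholder; raising is slightly more robust than `assert`, which `python -O` strips):

```python
import torch

def check_outputs(converted: torch.Tensor, reference: torch.Tensor, atol: float = 1e-4) -> None:
    """Fail loudly if the converted model drifts from the original checkpoint."""
    if not torch.allclose(converted, reference, atol=atol):
        max_diff = (converted - reference).abs().max().item()
        raise ValueError(f"Converted outputs differ from the reference (max abs diff {max_diff:.2e})")
```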
-"""Convert ConvBERT checkpoint.""" - -import argparse - -from transformers import ConvBertConfig, ConvBertModel, TFConvBertModel, load_tf_weights_in_convbert -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_orig_tf1_checkpoint_to_pytorch(tf_checkpoint_path, convbert_config_file, pytorch_dump_path): - conf = ConvBertConfig.from_json_file(convbert_config_file) - model = ConvBertModel(conf) - - model = load_tf_weights_in_convbert(model, conf, tf_checkpoint_path) - model.save_pretrained(pytorch_dump_path) - - tf_model = TFConvBertModel.from_pretrained(pytorch_dump_path, from_pt=True) - tf_model.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--convbert_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained ConvBERT model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_orig_tf1_checkpoint_to_pytorch(args.tf_checkpoint_path, args.convbert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/convnext/convert_convnext_to_pytorch.py b/src/transformers/models/convnext/convert_convnext_to_pytorch.py deleted file mode 100644 index 27315ed73f91..000000000000 --- a/src/transformers/models/convnext/convert_convnext_to_pytorch.py +++ /dev/null @@ -1,242 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ConvNext checkpoints from the original repository. 
- -URL: https://github.com/facebookresearch/ConvNeXt""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import ConvNextConfig, ConvNextForImageClassification, ConvNextImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_convnext_config(checkpoint_url): - config = ConvNextConfig() - - if "tiny" in checkpoint_url: - depths = [3, 3, 9, 3] - hidden_sizes = [96, 192, 384, 768] - if "small" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [96, 192, 384, 768] - if "base" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [128, 256, 512, 1024] - if "large" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [192, 384, 768, 1536] - if "xlarge" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [256, 512, 1024, 2048] - - if "1k" in checkpoint_url: - num_labels = 1000 - filename = "imagenet-1k-id2label.json" - expected_shape = (1, 1000) - else: - num_labels = 21841 - filename = "imagenet-22k-id2label.json" - expected_shape = (1, 21841) - - repo_id = "huggingface/label-files" - config.num_labels = num_labels - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - if "1k" not in checkpoint_url: - # this dataset contains 21843 labels but the model only has 21841 - # we delete the classes as mentioned in https://github.com/google-research/big_transfer/issues/18 - del id2label[9205] - del id2label[15027] - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.hidden_sizes = hidden_sizes - config.depths = depths - - return config, expected_shape - - -def rename_key(name): - if "downsample_layers.0.0" in name: - name = name.replace("downsample_layers.0.0", "embeddings.patch_embeddings") - if "downsample_layers.0.1" in name: - name = name.replace("downsample_layers.0.1", "embeddings.norm") # we rename to layernorm later on - if "downsample_layers.1.0" in name: - name = name.replace("downsample_layers.1.0", "stages.1.downsampling_layer.0") - if "downsample_layers.1.1" in name: - name = name.replace("downsample_layers.1.1", "stages.1.downsampling_layer.1") - if "downsample_layers.2.0" in name: - name = name.replace("downsample_layers.2.0", "stages.2.downsampling_layer.0") - if "downsample_layers.2.1" in name: - name = name.replace("downsample_layers.2.1", "stages.2.downsampling_layer.1") - if "downsample_layers.3.0" in name: - name = name.replace("downsample_layers.3.0", "stages.3.downsampling_layer.0") - if "downsample_layers.3.1" in name: - name = name.replace("downsample_layers.3.1", "stages.3.downsampling_layer.1") - if "stages" in name and "downsampling_layer" not in name: - # stages.0.0. for instance should be renamed to stages.0.layers.0. 
- name = name[: len("stages.0")] + ".layers" + name[len("stages.0") :] - if "stages" in name: - name = name.replace("stages", "encoder.stages") - if "norm" in name: - name = name.replace("norm", "layernorm") - if "gamma" in name: - name = name.replace("gamma", "layer_scale_parameter") - if "head" in name: - name = name.replace("head", "classifier") - - return name - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_convnext_checkpoint(checkpoint_url, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our ConvNext structure. - """ - - # define ConvNext configuration based on URL - config, expected_shape = get_convnext_config(checkpoint_url) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"] - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # add prefix to all keys expect classifier head - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - if not key.startswith("classifier"): - key = "convnext." + key - state_dict[key] = val - - # load HuggingFace model - model = ConvNextForImageClassification(config) - model.load_state_dict(state_dict) - model.eval() - - # Check outputs on an image, prepared by ConvNextImageProcessor - size = 224 if "224" in checkpoint_url else 384 - image_processor = ConvNextImageProcessor(size=size) - pixel_values = image_processor(images=prepare_img(), return_tensors="pt").pixel_values - - logits = model(pixel_values).logits - - # note: the logits below were obtained without center cropping - if checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth": - expected_logits = torch.tensor([-0.1210, -0.6605, 0.1918]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth": - expected_logits = torch.tensor([-0.4473, -0.1847, -0.6365]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth": - expected_logits = torch.tensor([0.4525, 0.7539, 0.0308]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_384.pth": - expected_logits = torch.tensor([0.3561, 0.6350, -0.0384]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth": - expected_logits = torch.tensor([0.4174, -0.0989, 0.1489]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_384.pth": - expected_logits = torch.tensor([0.2513, -0.1349, -0.1613]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth": - expected_logits = torch.tensor([1.2980, 0.3631, -0.1198]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth": - expected_logits = torch.tensor([1.2963, 0.1227, 0.1723]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth": - expected_logits = torch.tensor([1.7956, 0.8390, 0.2820]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth": - expected_logits = torch.tensor([-0.2822, -0.0502, -0.0878]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_384.pth": - expected_logits = torch.tensor([-0.5672, -0.0730, -0.4348]) - elif checkpoint_url == 
"https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_224.pth": - expected_logits = torch.tensor([0.2681, 0.2365, 0.6246]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth": - expected_logits = torch.tensor([-0.2642, 0.3931, 0.5116]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth": - expected_logits = torch.tensor([-0.6677, -0.1873, -0.8379]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_384_ema.pth": - expected_logits = torch.tensor([-0.7749, -0.2967, -0.6444]) - else: - raise ValueError(f"Unknown URL: {checkpoint_url}") - - assert torch.allclose(logits[0, :3], expected_logits, atol=1e-3) - assert logits.shape == expected_shape - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - print("Pushing model to the hub...") - model_name = "convnext" - if "tiny" in checkpoint_url: - model_name += "-tiny" - elif "small" in checkpoint_url: - model_name += "-small" - elif "base" in checkpoint_url: - model_name += "-base" - elif "xlarge" in checkpoint_url: - model_name += "-xlarge" - elif "large" in checkpoint_url: - model_name += "-large" - if "224" in checkpoint_url: - model_name += "-224" - elif "384" in checkpoint_url: - model_name += "-384" - if "22k" in checkpoint_url and "1k" not in checkpoint_url: - model_name += "-22k" - if "22k" in checkpoint_url and "1k" in checkpoint_url: - model_name += "-22k-1k" - - model.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add model", - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth", - type=str, - help="URL of the original ConvNeXT checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the output PyTorch model directory.", - ) - - args = parser.parse_args() - convert_convnext_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py b/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py deleted file mode 100644 index 8094ecf0d615..000000000000 --- a/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py +++ /dev/null @@ -1,286 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ConvNeXTV2 checkpoints from the original repository. 
- -URL: https://github.com/facebookresearch/ConvNeXt""" - -import argparse -import json -import os - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import ConvNextImageProcessor, ConvNextV2Config, ConvNextV2ForImageClassification -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_convnextv2_config(checkpoint_url): - config = ConvNextV2Config() - - if "atto" in checkpoint_url: - depths = [2, 2, 6, 2] - hidden_sizes = [40, 80, 160, 320] - if "femto" in checkpoint_url: - depths = [2, 2, 6, 2] - hidden_sizes = [48, 96, 192, 384] - if "pico" in checkpoint_url: - depths = [2, 2, 6, 2] - hidden_sizes = [64, 128, 256, 512] - if "nano" in checkpoint_url: - depths = [2, 2, 8, 2] - hidden_sizes = [80, 160, 320, 640] - if "tiny" in checkpoint_url: - depths = [3, 3, 9, 3] - hidden_sizes = [96, 192, 384, 768] - if "base" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [128, 256, 512, 1024] - if "large" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [192, 384, 768, 1536] - if "huge" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [352, 704, 1408, 2816] - - num_labels = 1000 - filename = "imagenet-1k-id2label.json" - expected_shape = (1, 1000) - - repo_id = "huggingface/label-files" - config.num_labels = num_labels - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.hidden_sizes = hidden_sizes - config.depths = depths - - return config, expected_shape - - -def rename_key(name): - if "downsample_layers.0.0" in name: - name = name.replace("downsample_layers.0.0", "embeddings.patch_embeddings") - if "downsample_layers.0.1" in name: - name = name.replace("downsample_layers.0.1", "embeddings.norm") # we rename to layernorm later on - if "downsample_layers.1.0" in name: - name = name.replace("downsample_layers.1.0", "stages.1.downsampling_layer.0") - if "downsample_layers.1.1" in name: - name = name.replace("downsample_layers.1.1", "stages.1.downsampling_layer.1") - if "downsample_layers.2.0" in name: - name = name.replace("downsample_layers.2.0", "stages.2.downsampling_layer.0") - if "downsample_layers.2.1" in name: - name = name.replace("downsample_layers.2.1", "stages.2.downsampling_layer.1") - if "downsample_layers.3.0" in name: - name = name.replace("downsample_layers.3.0", "stages.3.downsampling_layer.0") - if "downsample_layers.3.1" in name: - name = name.replace("downsample_layers.3.1", "stages.3.downsampling_layer.1") - if "stages" in name and "downsampling_layer" not in name: - # stages.0.0. for instance should be renamed to stages.0.layers.0. 
- name = name[: len("stages.0")] + ".layers" + name[len("stages.0") :] - if "gamma" in name: - name = name.replace("gamma", "weight") - if "beta" in name: - name = name.replace("beta", "bias") - if "stages" in name: - name = name.replace("stages", "encoder.stages") - if "norm" in name: - name = name.replace("norm", "layernorm") - if "head" in name: - name = name.replace("head", "classifier") - - return name - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def convert_preprocessor(checkpoint_url): - if "224" in checkpoint_url: - size = 224 - crop_pct = 224 / 256 - elif "384" in checkpoint_url: - size = 384 - crop_pct = None - else: - size = 512 - crop_pct = None - - return ConvNextImageProcessor( - size=size, - crop_pct=crop_pct, - image_mean=[0.485, 0.456, 0.406], - image_std=[0.229, 0.224, 0.225], - resample=PILImageResampling.BICUBIC, - ) - - -@torch.no_grad() -def convert_convnextv2_checkpoint(checkpoint_url, pytorch_dump_folder_path, save_model, push_to_hub): - """ - Copy/paste/tweak model's weights to our ConvNeXTV2 structure. - """ - print("Downloading original model from checkpoint...") - # define ConvNeXTV2 configuration based on URL - config, expected_shape = get_convnextv2_config(checkpoint_url) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"] - - print("Converting model parameters...") - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # add prefix to all keys expect classifier head - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - if not key.startswith("classifier"): - key = "convnextv2." 
+ key - state_dict[key] = val - - # load HuggingFace model - model = ConvNextV2ForImageClassification(config) - model.load_state_dict(state_dict) - model.eval() - - # Check outputs on an image, prepared by ConvNextImageProcessor - preprocessor = convert_preprocessor(checkpoint_url) - inputs = preprocessor(images=prepare_img(), return_tensors="pt") - logits = model(**inputs).logits - - # note: the logits below were obtained without center cropping - if checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_atto_1k_224_ema.pt": - expected_logits = torch.tensor([-0.3930, 0.1747, -0.5246, 0.4177, 0.4295]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_femto_1k_224_ema.pt": - expected_logits = torch.tensor([-0.1727, -0.5341, -0.7818, -0.4745, -0.6566]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_pico_1k_224_ema.pt": - expected_logits = torch.tensor([-0.0333, 0.1563, -0.9137, 0.1054, 0.0381]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_nano_1k_224_ema.pt": - expected_logits = torch.tensor([-0.1744, -0.1555, -0.0713, 0.0950, -0.1431]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_tiny_1k_224_ema.pt": - expected_logits = torch.tensor([0.9996, 0.1966, -0.4386, -0.3472, 0.6661]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_base_1k_224_ema.pt": - expected_logits = torch.tensor([-0.2553, -0.6708, -0.1359, 0.2518, -0.2488]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_large_1k_224_ema.pt": - expected_logits = torch.tensor([-0.0673, -0.5627, -0.3753, -0.2722, 0.0178]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_huge_1k_224_ema.pt": - expected_logits = torch.tensor([-0.6377, -0.7458, -0.2150, 0.1184, -0.0597]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_nano_22k_224_ema.pt": - expected_logits = torch.tensor([1.0799, 0.2322, -0.8860, 1.0219, 0.6231]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_nano_22k_384_ema.pt": - expected_logits = torch.tensor([0.3766, 0.4917, -1.1426, 0.9942, 0.6024]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_tiny_22k_224_ema.pt": - expected_logits = torch.tensor([0.4220, -0.6919, -0.4317, -0.2881, -0.6609]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_tiny_22k_384_ema.pt": - expected_logits = torch.tensor([0.1082, -0.8286, -0.5095, 0.4681, -0.8085]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_base_22k_224_ema.pt": - expected_logits = torch.tensor([-0.2419, -0.6221, 0.2176, -0.0980, -0.7527]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_base_22k_384_ema.pt": - expected_logits = torch.tensor([0.0391, -0.4371, 0.3786, 0.1251, -0.2784]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_large_22k_224_ema.pt": - expected_logits = torch.tensor([-0.0504, 0.5636, -0.1729, -0.6507, -0.3949]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_large_22k_384_ema.pt": - expected_logits = torch.tensor([0.3560, 0.9486, 0.3149, -0.2667, -0.5138]) - elif 
checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_huge_22k_384_ema.pt": - expected_logits = torch.tensor([-0.2469, -0.4550, -0.5853, -0.0810, 0.0309]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_huge_22k_512_ema.pt": - expected_logits = torch.tensor([-0.3090, 0.0802, -0.0682, -0.1979, -0.2826]) - else: - raise ValueError(f"Unknown URL: {checkpoint_url}") - - assert torch.allclose(logits[0, :5], expected_logits, atol=1e-3) - assert logits.shape == expected_shape - print("Model outputs match the original results!") - - if save_model: - print("Saving model to local...") - # Create folder to save model - if not os.path.isdir(pytorch_dump_folder_path): - os.mkdir(pytorch_dump_folder_path) - - model.save_pretrained(pytorch_dump_folder_path) - preprocessor.save_pretrained(pytorch_dump_folder_path) - - model_name = "convnextv2" - if "atto" in checkpoint_url: - model_name += "-atto" - if "femto" in checkpoint_url: - model_name += "-femto" - if "pico" in checkpoint_url: - model_name += "-pico" - if "nano" in checkpoint_url: - model_name += "-nano" - elif "tiny" in checkpoint_url: - model_name += "-tiny" - elif "base" in checkpoint_url: - model_name += "-base" - elif "large" in checkpoint_url: - model_name += "-large" - elif "huge" in checkpoint_url: - model_name += "-huge" - if "22k" in checkpoint_url and "1k" not in checkpoint_url: - model_name += "-22k" - elif "22k" in checkpoint_url and "1k" in checkpoint_url: - model_name += "-22k-1k" - elif "1k" in checkpoint_url: - model_name += "-1k" - if "224" in checkpoint_url: - model_name += "-224" - elif "384" in checkpoint_url: - model_name += "-384" - elif "512" in checkpoint_url: - model_name += "-512" - - if push_to_hub: - print(f"Pushing {model_name} to the hub...") - model.push_to_hub(model_name) - preprocessor.push_to_hub(model_name) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_atto_1k_224_ema.pt", - type=str, - help="URL of the original ConvNeXTV2 checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="model", - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--save_model", action="store_true", help="Save model to local") - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image preprocessor to the hub") - - args = parser.parse_args() - convert_convnextv2_checkpoint( - args.checkpoint_url, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub - ) diff --git a/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 9f76c92887f4..000000000000 --- a/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,362 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert CvT checkpoints from the original repository. - -URL: https://github.com/microsoft/CvT""" - -import argparse -import json -from collections import OrderedDict -from pathlib import Path - -import torch -from huggingface_hub import hf_hub_download - -from transformers import AutoImageProcessor, CvtConfig, CvtForImageClassification - - -def embeddings(idx): - """ - The function helps in renaming embedding layer weights. - - Args: - idx: stage number in original model - """ - embed = [] - embed.append( - ( - f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.projection.weight", - f"stage{idx}.patch_embed.proj.weight", - ) - ) - embed.append( - ( - f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.projection.bias", - f"stage{idx}.patch_embed.proj.bias", - ) - ) - embed.append( - ( - f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.normalization.weight", - f"stage{idx}.patch_embed.norm.weight", - ) - ) - embed.append( - ( - f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.normalization.bias", - f"stage{idx}.patch_embed.norm.bias", - ) - ) - return embed - - -def attention(idx, cnt): - """ - The function helps in renaming attention block layers weights. - - Args: - idx: stage number in original model - cnt: count of blocks in each stage - """ - attention_weights = [] - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.convolution.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.conv.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.bias", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.running_mean", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.running_mean", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.running_var", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.running_var", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.num_batches_tracked", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.num_batches_tracked", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.convolution.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.conv.weight", - ) - ) - attention_weights.append( - ( - 
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.bias", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.running_mean", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.running_mean", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.running_var", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.running_var", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.num_batches_tracked", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.num_batches_tracked", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.convolution.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.conv.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.bias", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.running_mean", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.running_mean", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.running_var", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.running_var", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.num_batches_tracked", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.num_batches_tracked", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_query.weight", - f"stage{idx}.blocks.{cnt}.attn.proj_q.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_query.bias", - f"stage{idx}.blocks.{cnt}.attn.proj_q.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_key.weight", - f"stage{idx}.blocks.{cnt}.attn.proj_k.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_key.bias", - f"stage{idx}.blocks.{cnt}.attn.proj_k.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_value.weight", - f"stage{idx}.blocks.{cnt}.attn.proj_v.weight", - ) - ) - attention_weights.append( - ( - 
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_value.bias", - f"stage{idx}.blocks.{cnt}.attn.proj_v.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.output.dense.weight", - f"stage{idx}.blocks.{cnt}.attn.proj.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.output.dense.bias", - f"stage{idx}.blocks.{cnt}.attn.proj.bias", - ) - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.intermediate.dense.weight", f"stage{idx}.blocks.{cnt}.mlp.fc1.weight") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.intermediate.dense.bias", f"stage{idx}.blocks.{cnt}.mlp.fc1.bias") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.output.dense.weight", f"stage{idx}.blocks.{cnt}.mlp.fc2.weight") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.output.dense.bias", f"stage{idx}.blocks.{cnt}.mlp.fc2.bias") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_before.weight", f"stage{idx}.blocks.{cnt}.norm1.weight") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_before.bias", f"stage{idx}.blocks.{cnt}.norm1.bias") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_after.weight", f"stage{idx}.blocks.{cnt}.norm2.weight") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_after.bias", f"stage{idx}.blocks.{cnt}.norm2.bias") - ) - return attention_weights - - -def cls_token(idx): - """ - Function helps in renaming cls_token weights - """ - token = [] - token.append((f"cvt.encoder.stages.{idx}.cls_token", "stage2.cls_token")) - return token - - -def final(): - """ - Function helps in renaming final classification layer - """ - head = [] - head.append(("layernorm.weight", "norm.weight")) - head.append(("layernorm.bias", "norm.bias")) - head.append(("classifier.weight", "head.weight")) - head.append(("classifier.bias", "head.bias")) - return head - - -def convert_cvt_checkpoint(cvt_model, image_size, cvt_file_name, pytorch_dump_folder): - """ - Fucntion to convert the microsoft cvt checkpoint to huggingface checkpoint - """ - img_labels_file = "imagenet-1k-id2label.json" - num_labels = 1000 - - repo_id = "huggingface/label-files" - num_labels = num_labels - id2label = json.loads(Path(hf_hub_download(repo_id, img_labels_file, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - - id2label = id2label - label2id = {v: k for k, v in id2label.items()} - - config = config = CvtConfig(num_labels=num_labels, id2label=id2label, label2id=label2id) - - # For depth size 13 (13 = 1+2+10) - if cvt_model.rsplit("/", 1)[-1][4:6] == "13": - config.depth = [1, 2, 10] - - # For depth size 21 (21 = 1+4+16) - elif cvt_model.rsplit("/", 1)[-1][4:6] == "21": - config.depth = [1, 4, 16] - - # For wide cvt (similar to wide-resnet) depth size 24 (w24 = 2 + 2 20) - else: - config.depth = [2, 2, 20] - config.num_heads = [3, 12, 16] - config.embed_dim = [192, 768, 1024] - - model = CvtForImageClassification(config) - image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k") - image_processor.size["shortest_edge"] = image_size - original_weights = torch.load(cvt_file_name, map_location=torch.device("cpu")) - - huggingface_weights = OrderedDict() - list_of_state_dict = [] - - for idx in range(len(config.depth)): - if 
config.cls_token[idx]: - list_of_state_dict = list_of_state_dict + cls_token(idx) - list_of_state_dict = list_of_state_dict + embeddings(idx) - for cnt in range(config.depth[idx]): - list_of_state_dict = list_of_state_dict + attention(idx, cnt) - - list_of_state_dict = list_of_state_dict + final() - for gg in list_of_state_dict: - print(gg) - for i in range(len(list_of_state_dict)): - huggingface_weights[list_of_state_dict[i][0]] = original_weights[list_of_state_dict[i][1]] - - model.load_state_dict(huggingface_weights) - model.save_pretrained(pytorch_dump_folder) - image_processor.save_pretrained(pytorch_dump_folder) - - -# Download the weights from zoo: https://1drv.ms/u/s!AhIXJn_J-blW9RzF3rMW7SsLHa8h?e=blQ0Al - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--cvt_model", - default="cvt-w24", - type=str, - help="Name of the cvt model you'd like to convert.", - ) - parser.add_argument( - "--image_size", - default=384, - type=int, - help="Input Image Size", - ) - parser.add_argument( - "--cvt_file_name", - default=r"cvtmodels\CvT-w24-384x384-IN-22k.pth", - type=str, - help="Input Image Size", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - - args = parser.parse_args() - convert_cvt_checkpoint(args.cvt_model, args.image_size, args.cvt_file_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/dac/convert_dac_checkpoint.py b/src/transformers/models/dac/convert_dac_checkpoint.py deleted file mode 100644 index bfeb96fbdd4e..000000000000 --- a/src/transformers/models/dac/convert_dac_checkpoint.py +++ /dev/null @@ -1,261 +0,0 @@ -# coding=utf-8 -# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
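The DAC converter that follows has one step worth calling out: the original Descript checkpoints store weight-normalised parameters (`weight_g`/`weight_v` pairs rather than a single `weight`), so the script calls `model.apply_weight_norm()` before copying weights and `model.remove_weight_norm()` afterwards to fold everything back into plain `weight` tensors. A self-contained illustration of the mechanism (a sketch, not taken from the removed file):

```python
from torch import nn

# A weight-normalised layer exposes weight_g / weight_v instead of weight.
conv = nn.utils.weight_norm(nn.Conv1d(4, 8, kernel_size=3))
print(sorted(name for name, _ in conv.named_parameters()))  # ['bias', 'weight_g', 'weight_v']

# Folds g * v / ||v|| back into a single weight tensor, as the converter does
# once the original parameters have been copied in.
nn.utils.remove_weight_norm(conv)
print(sorted(name for name, _ in conv.named_parameters()))  # ['bias', 'weight']
```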
-import argparse -import fnmatch -import re - -import torch - -from transformers import ( - DacConfig, - DacFeatureExtractor, - DacModel, - logging, -) - - -# checkpoints downloaded using: -# pip install descript-audio-codec -# python3 -m dac download # downloads the default 44kHz variant -# python3 -m dac download --model_type 44khz # downloads the 44kHz variant -# python3 -m dac download --model_type 24khz # downloads the 24kHz variant -# python3 -m dac download --model_type 16khz # downloads the 16kHz variant -# More informations: https://github.com/descriptinc/descript-audio-codec/tree/main - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.dac") - - -def match_pattern(string, pattern): - # Split the pattern into parts - pattern_parts = pattern.split(".") - string_parts = string.split(".") - - pattern_block_count = string_block_count = 0 - - for part in pattern_parts: - if part.startswith("block"): - pattern_block_count += 1 - - for part in string_parts: - if part.startswith("block"): - string_block_count += 1 - - return fnmatch.fnmatch(string, pattern) and string_block_count == pattern_block_count - - -TOP_LEVEL_KEYS = [] -IGNORE_KEYS = [] - - -MAPPING_ENCODER = { - "encoder.block.0": ["encoder.conv1"], - "encoder.block.5": ["encoder.snake1"], - "encoder.block.6": ["encoder.conv2"], - "encoder.block.*.block.*.block.0".replace("*", r"\d+"): ["encoder.block", "res_unit", "snake1"], - "encoder.block.*.block.*.block.1".replace("*", r"\d+"): ["encoder.block", "res_unit", "conv1"], - "encoder.block.*.block.*.block.2".replace("*", r"\d+"): ["encoder.block", "res_unit", "snake2"], - "encoder.block.*.block.*.block.3".replace("*", r"\d+"): ["encoder.block", "res_unit", "conv2"], - "encoder.block.*.block.3".replace("*", r"\d+"): ["encoder.block", "snake1"], - "encoder.block.*.block.4".replace("*", r"\d+"): ["encoder.block", "conv1"], -} - -MAPPING_QUANTIZER = { - "quantizer.quantizers.*": ["quantizer.quantizers.*"], -} - -MAPPING_DECODER = { - "decoder.model.0": ["decoder.conv1"], - "decoder.model.5": ["decoder.snake1"], - "decoder.model.6": ["decoder.conv2"], - "decoder.model.*.block.0".replace("*", r"\d+"): ["decoder.block", "snake1"], - "decoder.model.*.block.1".replace("*", r"\d+"): ["decoder.block", "conv_t1"], - "decoder.model.*.block.*.block.0".replace("*", r"\d+"): ["decoder.block", "res_unit", "snake1"], - "decoder.model.*.block.*.block.1".replace("*", r"\d+"): ["decoder.block", "res_unit", "conv1"], - "decoder.model.*.block.*.block.2".replace("*", r"\d+"): ["decoder.block", "res_unit", "snake2"], - "decoder.model.*.block.*.block.3".replace("*", r"\d+"): ["decoder.block", "res_unit", "conv2"], -} - - -MAPPING = { - **MAPPING_ENCODER, - **MAPPING_QUANTIZER, - **MAPPING_DECODER, -} - - -def set_recursively(hf_pointer, key, value, full_name, weight_type): - for attribute in key.split("."): - hf_pointer = getattr(hf_pointer, attribute) - - if weight_type is not None: - hf_shape = getattr(hf_pointer, weight_type).shape - else: - hf_shape = hf_pointer.shape - - if hf_shape != value.shape: - raise ValueError( - f"Shape of hf {key + '.' 
+ weight_type if weight_type is not None else ''} is {hf_shape}, but should be" - f" {value.shape} for {full_name}" - ) - - if weight_type == "weight": - hf_pointer.weight.data = value - elif weight_type == "weight_g": - hf_pointer.weight_g.data = value - elif weight_type == "weight_v": - hf_pointer.weight_v.data = value - elif weight_type == "bias": - hf_pointer.bias.data = value - elif weight_type == "alpha": - hf_pointer.alpha.data = value - logger.info(f"{key + ('.' + weight_type if weight_type is not None else '')} was initialized from {full_name}.") - - -def should_ignore(name, ignore_keys): - for key in ignore_keys: - if key.endswith(".*"): - if name.startswith(key[:-1]): - return True - elif ".*." in key: - prefix, suffix = key.split(".*.") - if prefix in name and suffix in name: - return True - elif key in name: - return True - return False - - -def recursively_load_weights(orig_dict, hf_model, model_name): - unused_weights = [] - - if model_name not in ["dac_16khz", "dac_24khz", "dac_44khz"]: - raise ValueError(f"Unsupported model: {model_name}") - - for name, value in orig_dict.items(): - is_used = False - for key, mapped_key in MAPPING.items(): - regex = re.compile(key) - if regex.search(name): - if len(mapped_key) == 1: - if mapped_key[0][0] == "q": - mapped_key = ".".join(name.split(".")[:-1]) - else: - mapped_key = mapped_key[0] - elif len(mapped_key) == 3: - integers = re.findall(r"\b\d+\b", name) - if mapped_key[0][0] == "d": - mapped_key = "{}.{}.{}{}.{}".format( - mapped_key[0], - str(int(integers[0]) - 1), - mapped_key[1], - str(int(integers[1]) - 1), - mapped_key[2], - ) - else: - mapped_key = "{}.{}.{}{}.{}".format( - mapped_key[0], - str(int(integers[0]) - 1), - mapped_key[1], - str(int(integers[1]) + 1), - mapped_key[2], - ) - elif len(mapped_key) == 2: - integers = re.findall(r"\b\d+\b", name) - mapped_key = "{}.{}.{}".format(mapped_key[0], str(int(integers[0]) - 1), mapped_key[1]) - - is_used = True - if "weight_g" in name: - weight_type = "weight_g" - elif "weight_v" in name: - weight_type = "weight_v" - elif "bias" in name: - weight_type = "bias" - elif "alpha" in name: - weight_type = "alpha" - elif "weight" in name: - weight_type = "weight" - set_recursively(hf_model, mapped_key, value, name, weight_type) - - if not is_used: - unused_weights.append(name) - - print(list(set(unused_weights))) - - logger.warning(f"Unused weights: {unused_weights}") - - -@torch.no_grad() -def convert_checkpoint( - model_name, - checkpoint_path, - pytorch_dump_folder_path, - sample_rate=16000, - repo_id=None, -): - model_dict = torch.load(checkpoint_path, "cpu") - - config = DacConfig() - - metadata = model_dict["metadata"]["kwargs"] - config.encoder_hidden_size = metadata["encoder_dim"] - config.downsampling_ratios = metadata["encoder_rates"] - config.codebook_size = metadata["codebook_size"] - config.n_codebooks = metadata["n_codebooks"] - config.codebook_dim = metadata["codebook_dim"] - config.decoder_hidden_size = metadata["decoder_dim"] - config.upsampling_ratios = metadata["decoder_rates"] - config.quantizer_dropout = float(metadata["quantizer_dropout"]) - config.sampling_rate = sample_rate - - model = DacModel(config) - feature_extractor = DacFeatureExtractor() - feature_extractor.sampling_rate = sample_rate - - original_checkpoint = model_dict["state_dict"] - - model.apply_weight_norm() - recursively_load_weights(original_checkpoint, model, model_name) - model.remove_weight_norm() - - model.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the 
hub...") - feature_extractor.push_to_hub(repo_id) - model.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model", - default="dac_44khz", - type=str, - help="The model to convert. Should be one of 'dac_16khz', 'dac_24khz', 'dac_44khz'.", - ) - parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") - parser.add_argument( - "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the đŸ€— hub." - ) - parser.add_argument("--sample_rate", default=None, type=str, help="Sample rate used by DacFeatureExtractor") - args = parser.parse_args() - - convert_checkpoint( - args.model, args.checkpoint_path, args.pytorch_dump_folder_path, args.sample_rate, args.push_to_hub - ) diff --git a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 5339f1671b07..000000000000 --- a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,285 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert Wav2Vec2 checkpoint.""" - -import argparse -import os -from functools import reduce - -import fairseq -import torch -from datasets import load_dataset - -from transformers import Wav2Vec2Processor, logging -from transformers.models.data2vec.configuration_data2vec_audio import Data2VecAudioConfig - -# Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_audio.py -from transformers.models.data2vec.data2vec_audio import Data2VecAudioModel as Dummy # noqa: F401 -from transformers.models.data2vec.modeling_data2vec_audio import Data2VecAudioForCTC, Data2VecAudioModel - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -MAPPING = { - "post_extract_proj": "feature_projection.projection", - "models.0.layer_norm": "feature_projection.layer_norm", - "self_attn.k_proj": "encoder.layers.*.attention.k_proj", - "self_attn.v_proj": "encoder.layers.*.attention.v_proj", - "self_attn.q_proj": "encoder.layers.*.attention.q_proj", - "self_attn.out_proj": "encoder.layers.*.attention.out_proj", - "self_attn_layer_norm": "encoder.layers.*.layer_norm", - "fc1": "encoder.layers.*.feed_forward.intermediate_dense", - "fc2": "encoder.layers.*.feed_forward.output_dense", - "final_layer_norm": "encoder.layers.*.final_layer_norm", - "encoder.layer_norm": "encoder.layer_norm", - "w2v_model.layer_norm": "feature_projection.layer_norm", - "w2v_encoder.proj": "lm_head", - "mask_emb": "masked_spec_embed", -} -TOP_LEVEL_KEYS = [ - "lm_head", -] - - -def set_recursively(hf_pointer, key, value, full_name, weight_type): - for attribute in key.split("."): - hf_pointer = getattr(hf_pointer, attribute) - - if weight_type is not None: - hf_shape = getattr(hf_pointer, weight_type).shape - else: - hf_shape = hf_pointer.shape - - if hf_shape != value.shape: - raise ValueError( - f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be" - f" {value.shape} for {full_name}" - ) - - if weight_type == "weight": - hf_pointer.weight.data = value - elif weight_type == "weight_g": - hf_pointer.weight_g.data = value - elif weight_type == "weight_v": - hf_pointer.weight_v.data = value - elif weight_type == "bias": - hf_pointer.bias.data = value - else: - hf_pointer.data = value - - logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.") - - -def recursively_load_weights(fairseq_model, hf_model, is_headless): - unused_weights = [] - fairseq_dict = fairseq_model.state_dict() - - if not is_headless: - feature_extractor = hf_model.data2vec_audio.feature_extractor - pos_conv_embedding = hf_model.data2vec_audio.encoder.pos_conv_embed - - else: - feature_extractor = hf_model.feature_extractor - pos_conv_embedding = hf_model.encoder.pos_conv_embed - - for name, value in fairseq_dict.items(): - is_used = False - if "conv_layers" in name: - load_conv_layer( - name, - value, - feature_extractor, - unused_weights, - ) - is_used = True - elif "pos_conv" in name: - load_pos_conv_layer( - name, - value, - pos_conv_embedding, - unused_weights, - ) - is_used = True - else: - for key, mapped_key in MAPPING.items(): - if not is_headless: - mapped_key = "data2vec_audio." 
+ mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key - if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]: - is_used = True - if "*" in mapped_key: - layer_index = name.split(key)[0].split(".")[-2] - mapped_key = mapped_key.replace("*", layer_index) - if "weight_g" in name: - weight_type = "weight_g" - elif "weight_v" in name: - weight_type = "weight_v" - elif "bias" in name: - weight_type = "bias" - elif "weight" in name: - # TODO: don't match quantizer.weight_proj - weight_type = "weight" - else: - weight_type = None - set_recursively(hf_model, mapped_key, value, name, weight_type) - continue - if not is_used: - unused_weights.append(name) - - logger.warning(f"Unused weights: {unused_weights}") - - -def access_by_string(module, path): - names = path.split(".") - return reduce(getattr, names, module) - - -def set_weights(full_name, module, fsq_value, hf_weight_path): - hf_weight = access_by_string(module, hf_weight_path) - hf_value = hf_weight.data - - if fsq_value.shape != hf_value.shape: - raise ValueError(f"{full_name} has size {fsq_value.shape}, but {hf_value.shape} was found.") - hf_weight.data = fsq_value - logger.info(f"{full_name} was correctly initialized from {hf_weight_path}.") - - -def load_conv_layer(full_name, value, feature_extractor, unused_weights): - name = full_name.split("conv_layers.")[-1] - items = name.split(".") - layer_id = int(items[0]) - type_id = int(items[1]) - - weight_type = name.split(".")[-1] - if type_id == 0: - layer_type = "conv" - elif type_id == 2: - layer_type = "layer_norm" - else: - unused_weights.append(full_name) - return - - set_weights(full_name, feature_extractor, value, f"conv_layers.{layer_id}.{layer_type}.{weight_type}") - - -def load_pos_conv_layer(full_name, value, pos_conv_embeddings, unused_weights): - name = full_name.split("pos_conv.")[-1] - items = name.split(".") - layer_id = int(items[0]) - type_id = int(items[1]) - - weight_type = name.split(".")[-1] - if type_id != 0: - unused_weights.append(full_name) - return - else: - layer_type = "conv" - - set_weights(full_name, pos_conv_embeddings, value, f"layers.{layer_id}.{layer_type}.{weight_type}") - - -@torch.no_grad() -def convert_wav2vec2_checkpoint( - checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True -): - """ - Copy/paste/tweak model's weights to transformers design. 
- """ - if config_path is not None: - config = Data2VecAudioConfig.from_pretrained(config_path) - else: - config = Data2VecAudioConfig() - - if not is_finetuned: - # Modify final_proj layer name - hf_wav2vec = Data2VecAudioModel(config) - data2vec_checkpoint_dir = os.path.dirname(checkpoint_path) - - state_dict = torch.load(checkpoint_path) - state_dict["model"]["final_proj.weight"] = state_dict["model"].pop("final_proj.0.weight") - state_dict["model"]["final_proj.bias"] = state_dict["model"].pop("final_proj.0.bias") - converted_ckpt = os.path.join(data2vec_checkpoint_dir, "converted.pt") - torch.save(state_dict, converted_ckpt) - else: - hf_wav2vec = Data2VecAudioForCTC(config) - converted_ckpt = checkpoint_path - - def load_data2vec(path): - model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([path]) - return model[0].eval() - - model = load_data2vec(converted_ckpt) - - recursively_load_weights(model, hf_wav2vec, not is_finetuned) - - processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60") - - ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) - input_audio = [x["array"] for x in ds[:4]["audio"]] - - inputs = processor(input_audio, return_tensors="pt", padding=True) - - input_values = inputs.input_values - attention_mask = inputs.attention_mask - # input_values = inputs.input_values[:, :-1] - # attention_mask = inputs.attention_mask[:, :-1] - - hf_wav2vec.eval() - model.eval() - if is_finetuned: - their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)[ - "encoder_out" - ].transpose(0, 1) - our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["logits"] - - pred_ids = torch.argmax(our_output, dim=-1) - output_string = processor.batch_decode(pred_ids) - - print(f"Expected Output: {ds[:4]['text']}, Pred: {output_string}") - else: - their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)[ - "layer_results" - ][-1][0].transpose(0, 1) - our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["last_hidden_state"] - - print(our_output.shape, their_output.shape) - max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 - success = torch.allclose(our_output, their_output, atol=1e-3) - print("Do both models output the same tensors?", "đŸ”„" if success else "đŸ’©") - if not success: - raise Exception("Something went wRoNg") - - hf_wav2vec.save_pretrained(pytorch_dump_folder_path) - - if is_finetuned: - processor.save_pretrained(pytorch_dump_folder_path) - else: - processor.feature_extractor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint") - parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - parser.add_argument( - "--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not" - ) - args = parser.parse_args() - convert_wav2vec2_checkpoint( - args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, 
args.dict_path, not args.not_finetuned - ) diff --git a/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 10b97dc93d0a..000000000000 --- a/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,207 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert data2vec checkpoint.""" - -import argparse -import os -import pathlib - -import fairseq -import torch -from fairseq.modules import TransformerSentenceEncoderLayer -from packaging import version - -from transformers import ( - Data2VecTextConfig, - Data2VecTextForMaskedLM, - Data2VecTextForSequenceClassification, - Data2VecTextModel, -) -from transformers.models.bert.modeling_bert import ( - BertIntermediate, - BertLayer, - BertOutput, - BertSelfAttention, - BertSelfOutput, -) - -# IMPORTANT: In order for this script to run, please make sure to download the dictionary: `dict.txt` from wget https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz -# File copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_text.py -from transformers.utils import logging - - -if version.parse(fairseq.__version__) < version.parse("0.9.0"): - raise Exception("requires fairseq >= 0.9.0") - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_TEXT = "Hello world! cĂ©cĂ© herlolip" - - -def convert_data2vec_checkpoint_to_pytorch( - data2vec_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool -): - """ - Copy/paste/tweak data2vec's weights to our BERT structure. - """ - data2vec_checkpoint_dir, data2vec_checkpoint_file_name = os.path.split(data2vec_checkpoint_path) - data2vec = Data2VecTextModel.from_pretrained( - data2vec_checkpoint_dir, checkpoint_file=data2vec_checkpoint_file_name - ) - data2vec.eval() # disable dropout - data2vec_model = data2vec.models[0] - data2vec_sent_encoder = data2vec_model.encoder.sentence_encoder - config = Data2VecTextConfig( - vocab_size=data2vec_sent_encoder.embed_tokens.num_embeddings, - hidden_size=data2vec_model.args.encoder_embed_dim, - num_hidden_layers=data2vec_model.args.encoder_layers, - num_attention_heads=data2vec_model.args.encoder_attention_heads, - intermediate_size=data2vec_model.args.encoder_ffn_embed_dim, - max_position_embeddings=514, - type_vocab_size=1, - layer_norm_eps=1e-5, # PyTorch default used in fairseq - ) - if classification_head: - config.num_labels = data2vec.model.classification_heads["mnli"].out_proj.weight.shape[0] - print("Our BERT config:", config) - - model = Data2VecTextForSequenceClassification(config) if classification_head else Data2VecTextForMaskedLM(config) - model.eval() - - # Now let's copy all the weights. 
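# Reading aid for the layer loop further below: it copies one fairseq TransformerSentenceEncoderLayer
# into one HF BertLayer. The correspondence, inferred from the assignments that follow, is:
#   self_attn.{q,k,v}_proj   -> attention.self.{query,key,value}
#   self_attn.out_proj       -> attention.output.dense
#   self_attn_layer_norm     -> attention.output.LayerNorm
#   fc1 / fc2                -> intermediate.dense / output.dense
#   final_layer_norm         -> output.LayerNorm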
- # Embeddings - model.data2vec_text.embeddings.word_embeddings.weight = data2vec_sent_encoder.embed_tokens.weight - model.data2vec_text.embeddings.position_embeddings.weight = data2vec_sent_encoder.embed_positions.weight - model.data2vec_text.embeddings.token_type_embeddings.weight.data = torch.zeros_like( - model.data2vec_text.embeddings.token_type_embeddings.weight - ) # just zero them out b/c data2vec doesn't use them. - model.data2vec_text.embeddings.LayerNorm.weight = data2vec_sent_encoder.layernorm_embedding.weight - model.data2vec_text.embeddings.LayerNorm.bias = data2vec_sent_encoder.layernorm_embedding.bias - - for i in range(config.num_hidden_layers): - # Encoder: start of layer - layer: BertLayer = model.data2vec_text.encoder.layer[i] - data2vec_layer: TransformerSentenceEncoderLayer = data2vec_sent_encoder.layers[i] - - # self attention - self_attn: BertSelfAttention = layer.attention.self - assert data2vec_layer.self_attn.k_proj.weight.data.shape == torch.Size( - (config.hidden_size, config.hidden_size) - ), ( - "Shape for data2vec_layer.self_attn.k_proj.weight.data should be" - f" {torch.Size((config.hidden_size, config.hidden_size))}" - ) - assert data2vec_layer.self_attn.q_proj.weight.data.shape == torch.Size( - (config.hidden_size, config.hidden_size) - ), ( - "Shape for data2vec_layer.self_attn.q_proj.weight.data should be" - f" {torch.Size((config.hidden_size, config.hidden_size))}" - ) - assert data2vec_layer.self_attn.v_proj.weight.data.shape == torch.Size( - (config.hidden_size, config.hidden_size) - ), ( - "Shape for data2vec_layer.self_attn.v_proj.weight.data should be" - f" {torch.Size((config.hidden_size, config.hidden_size))}" - ) - - self_attn.query.weight.data = data2vec_layer.self_attn.q_proj.weight - self_attn.query.bias.data = data2vec_layer.self_attn.q_proj.bias - self_attn.key.weight.data = data2vec_layer.self_attn.k_proj.weight - self_attn.key.bias.data = data2vec_layer.self_attn.k_proj.bias - self_attn.value.weight.data = data2vec_layer.self_attn.v_proj.weight - self_attn.value.bias.data = data2vec_layer.self_attn.v_proj.bias - - # self-attention output - self_output: BertSelfOutput = layer.attention.output - assert ( - self_output.dense.weight.shape == data2vec_layer.self_attn.out_proj.weight.shape - ), f"Shape for self_output.dense.weight should be {data2vec_layer.self_attn.out_proj.weight.shape}" - self_output.dense.weight = data2vec_layer.self_attn.out_proj.weight - self_output.dense.bias = data2vec_layer.self_attn.out_proj.bias - self_output.LayerNorm.weight = data2vec_layer.self_attn_layer_norm.weight - self_output.LayerNorm.bias = data2vec_layer.self_attn_layer_norm.bias - - # intermediate - intermediate: BertIntermediate = layer.intermediate - assert ( - intermediate.dense.weight.shape == data2vec_layer.fc1.weight.shape - ), f"Shape for intermediate.dense.weight should be {data2vec_layer.fc1.weight.shape}" - intermediate.dense.weight = data2vec_layer.fc1.weight - intermediate.dense.bias = data2vec_layer.fc1.bias - - # output - bert_output: BertOutput = layer.output - assert ( - bert_output.dense.weight.shape == data2vec_layer.fc2.weight.shape - ), f"Shape for bert_output.dense.weight should be {data2vec_layer.fc2.weight.shape}" - bert_output.dense.weight = data2vec_layer.fc2.weight - bert_output.dense.bias = data2vec_layer.fc2.bias - bert_output.LayerNorm.weight = data2vec_layer.final_layer_norm.weight - bert_output.LayerNorm.bias = data2vec_layer.final_layer_norm.bias - # end of layer - - if classification_head: - model.classifier.dense.weight 
= data2vec.model.classification_heads["mnli"].dense.weight - model.classifier.dense.bias = data2vec.model.classification_heads["mnli"].dense.bias - model.classifier.out_proj.weight = data2vec.model.classification_heads["mnli"].out_proj.weight - model.classifier.out_proj.bias = data2vec.model.classification_heads["mnli"].out_proj.bias - else: - # LM Head - model.lm_head.dense.weight = data2vec_model.encoder.lm_head.dense.weight - model.lm_head.dense.bias = data2vec_model.encoder.lm_head.dense.bias - model.lm_head.layer_norm.weight = data2vec_model.encoder.lm_head.layer_norm.weight - model.lm_head.layer_norm.bias = data2vec_model.encoder.lm_head.layer_norm.bias - model.lm_head.decoder.weight = data2vec_model.encoder.lm_head.weight - model.lm_head.decoder.bias = data2vec_model.encoder.lm_head.bias - - # Let's check that we get the same results. - input_ids: torch.Tensor = data2vec.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 - - our_output = model(input_ids)[0] - if classification_head: - their_output = data2vec.model.classification_heads["mnli"](data2vec.extract_features(input_ids)) - else: - their_output = data2vec_model(input_ids)[0] - print(our_output.shape, their_output.shape) - max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 - success = torch.allclose(our_output, their_output, atol=1e-3) - print("Do both models output the same tensors?", "đŸ”„" if success else "đŸ’©") - if not success: - raise Exception("Something went wRoNg") - - pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--classification_head", action="store_true", help="Whether to convert a final classification head." - ) - args = parser.parse_args() - convert_data2vec_checkpoint_to_pytorch( - args.checkpoint_path, args.pytorch_dump_folder_path, args.classification_head - ) diff --git a/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100755 index 0c6f42f4ba7f..000000000000 --- a/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,374 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import json - -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from timm.models import create_model - -from transformers import ( - BeitImageProcessor, - Data2VecVisionConfig, - Data2VecVisionForImageClassification, - Data2VecVisionModel, -) - - -def create_rename_keys(config, has_lm_head=False, is_semantic=False, hf_prefix="data2vec."): - prefix = "backbone." 
if is_semantic else "" - - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append( - (f"{prefix}blocks.{i}.norm1.weight", f"{hf_prefix}encoder.layer.{i}.layernorm_before.weight") - ) - rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"{hf_prefix}encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.weight", f"{hf_prefix}encoder.layer.{i}.attention.output.dense.weight") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.bias", f"{hf_prefix}encoder.layer.{i}.attention.output.dense.bias") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.norm2.weight", f"{hf_prefix}encoder.layer.{i}.layernorm_after.weight") - ) - rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"{hf_prefix}encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append( - (f"{prefix}blocks.{i}.mlp.fc1.weight", f"{hf_prefix}encoder.layer.{i}.intermediate.dense.weight") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.mlp.fc1.bias", f"{hf_prefix}encoder.layer.{i}.intermediate.dense.bias") - ) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"{hf_prefix}encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"{hf_prefix}encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - (f"{prefix}cls_token", f"{hf_prefix}embeddings.cls_token"), - (f"{prefix}patch_embed.proj.weight", f"{hf_prefix}embeddings.patch_embeddings.projection.weight"), - (f"{prefix}patch_embed.proj.bias", f"{hf_prefix}embeddings.patch_embeddings.projection.bias"), - ] - ) - - if has_lm_head: - # mask token + shared relative position bias + layernorm - rename_keys.extend( - [ - ("mask_token", f"{hf_prefix}embeddings.mask_token"), - ( - "rel_pos_bias.relative_position_bias_table", - f"{hf_prefix}encoder.relative_position_bias.relative_position_bias_table", - ), - ( - "rel_pos_bias.relative_position_index", - f"{hf_prefix}encoder.relative_position_bias.relative_position_index", - ), - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ] - ) - elif is_semantic: - # semantic segmentation classification heads - rename_keys.extend( - [ - ("decode_head.conv_seg.weight", "decode_head.classifier.weight"), - ("decode_head.conv_seg.bias", "decode_head.classifier.bias"), - ("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"), - ("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"), - ] - ) - else: - # layernorm + classification head - rename_keys.extend( - [ - ("fc_norm.weight", f"{hf_prefix}pooler.layernorm.weight"), - ("fc_norm.bias", f"{hf_prefix}pooler.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - - return rename_keys - - -def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False, hf_prefix="data2vec_vision."): - for i in range(config.num_hidden_layers): - prefix = "backbone." 
if is_semantic else "" - # queries, keys and values - in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight") - q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias") - - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.query.bias"] = q_bias - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.value.bias"] = v_bias - - # gamma_1 and gamma_2 - # we call them lambda because otherwise they are renamed when using .from_pretrained - gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1") - gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2") - - state_dict[f"{hf_prefix}encoder.layer.{i}.lambda_1"] = gamma_1 - state_dict[f"{hf_prefix}encoder.layer.{i}.lambda_2"] = gamma_2 - - # relative_position bias table + index - if not has_lm_head: - # each layer has its own relative position bias - table = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_bias_table") - index = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_index") - - state_dict[ - f"{hf_prefix}encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table" - ] = table - state_dict[ - f"{hf_prefix}encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index" - ] = index - - -def get_args(): - parser = argparse.ArgumentParser( - "Convert Data2VecVision to HF for image classification and pretraining", add_help=False - ) - parser.add_argument("--hf_checkpoint_name", type=str) - parser.add_argument("--input_size", default=224, type=int, help="images input size") - parser.add_argument("--beit_checkpoint", default="", help="beit checkpoint") - - return parser.parse_args() - - -def load_beit_model(args, is_finetuned, is_large): - def load_state_dict(model, state_dict, prefix="", ignore_missing="relative_position_index"): - missing_keys = [] - unexpected_keys = [] - error_msgs = [] - # copy state_dict so _load_from_state_dict can modify it - metadata = getattr(state_dict, "_metadata", None) - state_dict = state_dict.copy() - if metadata is not None: - state_dict._metadata = metadata - - def load(module, prefix=""): - local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - module._load_from_state_dict( - state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs - ) - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + ".") - - load(model, prefix=prefix) - - warn_missing_keys = [] - ignore_missing_keys = [] - for key in missing_keys: - keep_flag = True - for ignore_key in ignore_missing.split("|"): - if ignore_key in key: - keep_flag = False - break - if keep_flag: - warn_missing_keys.append(key) - else: - ignore_missing_keys.append(key) - - missing_keys = warn_missing_keys - - if len(missing_keys) > 0: - print( - "Weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, missing_keys - ) - ) - if len(unexpected_keys) > 0: - print("Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)) - if 
len(ignore_missing_keys) > 0: - print( - "Ignored weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, ignore_missing_keys - ) - ) - if len(error_msgs) > 0: - print("\n".join(error_msgs)) - - model_kwargs = { - "pretrained": False, - "use_shared_rel_pos_bias": True, - "use_abs_pos_emb": False, - "init_values": 0.1, - } - - if is_finetuned: - model_kwargs.update( - { - "num_classes": 1000, - "use_mean_pooling": True, - "init_scale": 0.001, - "use_rel_pos_bias": True, - } - ) - - model = create_model( - "beit_large_patch16_224" if is_large else "beit_base_patch16_224", - **model_kwargs, - ) - patch_size = model.patch_embed.patch_size - args.window_size = (args.input_size // patch_size[0], args.input_size // patch_size[1]) - checkpoint = torch.load(args.beit_checkpoint, map_location="cpu") - - print(f"Load ckpt from {args.beit_checkpoint}") - checkpoint_model = None - for model_key in ("model", "module"): - if model_key in checkpoint: - checkpoint_model = checkpoint[model_key] - print(f"Load state_dict by model_key = {model_key}") - break - - all_keys = list(checkpoint_model.keys()) - for key in all_keys: - if "relative_position_index" in key: - checkpoint_model.pop(key) - - if "relative_position_bias_table" in key: - rel_pos_bias = checkpoint_model[key] - src_num_pos, num_attn_heads = rel_pos_bias.size() - dst_num_pos, _ = model.state_dict()[key].size() - dst_patch_shape = model.patch_embed.patch_shape - if dst_patch_shape[0] != dst_patch_shape[1]: - raise NotImplementedError() - - load_state_dict(model, checkpoint_model, prefix="") - - return model - - -def main(): - args = get_args() - - is_finetuned = "ft1k" in args.hf_checkpoint_name - is_large = "large" in args.hf_checkpoint_name - - if is_finetuned: - # To convert Beit's data2vec_vision to HF you need to copy - # https://github.com/facebookresearch/data2vec_vision/blob/main/beit/modeling_finetune.py - # into this folder. - import modeling_finetune # noqa: F401 - else: - # To convert Beit's data2vec_vision to HF you need to copy - # https://github.com/facebookresearch/data2vec_vision/blob/main/beit/modeling_cyclical.py - # into this folder - # IMPORTANT: Note that for now we've only converted the down-stream - # model and not the full pretrained model. This means for the integration - # test you need to add a `return x` after the following line: - # https://github.com/facebookresearch/data2vec_vision/blob/af9a36349aaed59ae66e69b5dabeef2d62fdc5da/beit/modeling_cyclical.py#L197 - # to make the integration test pass. - import modeling_cyclical # noqa: F401 - - # 1. Create model config - config = Data2VecVisionConfig() - if is_finetuned: - config.use_relative_position_bias = True - config.use_shared_relative_position_bias = False - config.use_mean_pooling = True - config.num_labels = 1000 - - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - else: - config.use_relative_position_bias = False - config.use_shared_relative_position_bias = True - config.use_mean_pooling = False - - if is_large: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - - # 2. Load Beit model - orig_model = load_beit_model(args, is_finetuned, is_large) - orig_model.eval() - - # 3. 
Forward Beit model - image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False) - image = Image.open("../../../../tests/fixtures/tests_samples/COCO/000000039769.png") - encoding = image_processor(images=image, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - orig_args = (pixel_values,) if is_finetuned else (pixel_values, None) - with torch.no_grad(): - orig_model_output = orig_model(*orig_args) - - # 4. Load HF Data2VecVision model - if is_finetuned: - hf_model = Data2VecVisionForImageClassification(config) - hf_model.eval() - has_lm_head = False - hf_prefix = "data2vec_vision." - else: - hf_model = Data2VecVisionModel(config) - hf_model.eval() - has_lm_head = True - hf_prefix = "" - - rename_keys = create_rename_keys(config, hf_prefix=hf_prefix, has_lm_head=has_lm_head) - state_dict = orig_model.state_dict() - for src, dest in rename_keys: - val = state_dict.pop(src) - state_dict[dest] = val - - read_in_q_k_v(state_dict, config, hf_prefix=hf_prefix, has_lm_head=has_lm_head) - missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False) - print("HF missing", missing_keys) - print("HF unexpected_keys", unexpected_keys) - - # 5. Forward HF Data2VecVision model - with torch.no_grad(): - hf_model_output = hf_model(pixel_values) - - hf_output = hf_model_output.logits if is_finetuned else hf_model_output.last_hidden_state - - # 6. Compare - max_absolute_diff = torch.max(torch.abs(hf_output - orig_model_output)).item() - - print(f"max_absolute_diff = {max_absolute_diff}") - success = torch.allclose(hf_output, orig_model_output, atol=1e-3) - print("Do both models output the same tensors?", "đŸ”„" if success else "đŸ’©") - if not success: - raise Exception("Something went wRoNg") - - # 7. Save - print(f"Saving to {args.hf_checkpoint_name}") - hf_model.save_pretrained(args.hf_checkpoint_name) - image_processor.save_pretrained(args.hf_checkpoint_name) - - -if __name__ == "__main__": - main() - # Run the following to convert checkpoints - # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \ - # --beit_checkpoint ./pretrained_base.pt \ - # --hf_checkpoint_name "./data2vec-vision-base" - # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \ - # --beit_checkpoint ./finetuned_base.pt \ - # --hf_checkpoint_name "./data2vec-vision-base-ft1k" - # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \ - # --beit_checkpoint ./pretrained_large.pt \ - # --hf_checkpoint_name "./data2vec-vision-large" - # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \ - # --beit_checkpoint ./finetuned_large.pt \ - # --hf_checkpoint_name "./data2vec-vision-large-ft1k" diff --git a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py b/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py deleted file mode 100644 index 781b823e96f3..000000000000 --- a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py +++ /dev/null @@ -1,236 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Deformable DETR checkpoints.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DeformableDetrConfig, DeformableDetrForObjectDetection, DeformableDetrImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def rename_key(orig_key): - if "backbone.0.body" in orig_key: - orig_key = orig_key.replace("backbone.0.body", "backbone.conv_encoder.model") - if "transformer" in orig_key: - orig_key = orig_key.replace("transformer.", "") - if "norm1" in orig_key: - if "encoder" in orig_key: - orig_key = orig_key.replace("norm1", "self_attn_layer_norm") - else: - orig_key = orig_key.replace("norm1", "encoder_attn_layer_norm") - if "norm2" in orig_key: - if "encoder" in orig_key: - orig_key = orig_key.replace("norm2", "final_layer_norm") - else: - orig_key = orig_key.replace("norm2", "self_attn_layer_norm") - if "norm3" in orig_key: - orig_key = orig_key.replace("norm3", "final_layer_norm") - if "linear1" in orig_key: - orig_key = orig_key.replace("linear1", "fc1") - if "linear2" in orig_key: - orig_key = orig_key.replace("linear2", "fc2") - if "query_embed" in orig_key: - orig_key = orig_key.replace("query_embed", "query_position_embeddings") - if "cross_attn" in orig_key: - orig_key = orig_key.replace("cross_attn", "encoder_attn") - - return orig_key - - -def read_in_q_k_v(state_dict): - # transformer decoder self-attention layers - for i in range(6): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_deformable_detr_checkpoint( - checkpoint_path, - single_scale, - dilation, - with_box_refine, - two_stage, - pytorch_dump_folder_path, - push_to_hub, -): - """ - Copy/paste/tweak model's weights to our Deformable DETR structure. 
- """ - - # load default config - config = DeformableDetrConfig() - # set config attributes - if single_scale: - config.num_feature_levels = 1 - config.dilation = dilation - config.with_box_refine = with_box_refine - config.two_stage = two_stage - # set labels - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load image processor - image_processor = DeformableDetrImageProcessor(format="coco_detection") - - # prepare image - img = prepare_img() - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info("Converting model...") - - # load original state dict - state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "model." - for key in state_dict.copy().keys(): - if not key.startswith("class_embed") and not key.startswith("bbox_embed"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = DeformableDetrForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - # verify our conversion - outputs = model(pixel_values.to(device)) - - expected_logits = torch.tensor( - [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] - ) - expected_boxes = torch.tensor([[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]]) - - if single_scale: - expected_logits = torch.tensor( - [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] - ) - expected_boxes = torch.tensor([[0.7292, 0.4991, 0.5532], [0.7959, 0.2426, 0.4236], [0.7582, 0.3518, 0.4451]]) - - if single_scale and dilation: - expected_logits = torch.tensor( - [[-8.9652, -4.1074, -5.6635], [-9.0596, -4.9447, -6.6075], [-10.1178, -4.5275, -6.2671]] - ) - expected_boxes = torch.tensor([[0.7665, 0.4130, 0.4769], [0.8364, 0.1841, 0.3391], [0.6261, 0.3895, 0.7978]]) - - if with_box_refine: - expected_logits = torch.tensor( - [[-8.8895, -5.4187, -6.8153], [-8.4706, -6.1668, -7.6184], [-9.0042, -5.5359, -6.9141]] - ) - expected_boxes = torch.tensor([[0.7828, 0.2208, 0.4323], [0.0892, 0.5996, 0.1319], [0.5524, 0.6389, 0.8914]]) - - if with_box_refine and two_stage: - expected_logits = torch.tensor( - [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] - ) - expected_boxes = torch.tensor([[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]]) - - print("Logits:", outputs.logits[0, :3, :3]) - - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) - - print("Everything ok!") - - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - 
Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - # Push to hub - if push_to_hub: - model_name = "deformable-detr" - model_name += "-single-scale" if single_scale else "" - model_name += "-dc5" if dilation else "" - model_name += "-with-box-refine" if with_box_refine else "" - model_name += "-two-stage" if two_stage else "" - print("Pushing model to hub...") - model.push_to_hub(repo_path_or_name=model_name, organization="nielsr", commit_message="Add model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", - type=str, - default="/home/niels/checkpoints/deformable_detr/r50_deformable_detr-checkpoint.pth", - help="Path to Pytorch checkpoint (.pth file) you'd like to convert.", - ) - parser.add_argument("--single_scale", action="store_true", help="Whether to set config.num_features_levels = 1.") - parser.add_argument("--dilation", action="store_true", help="Whether to set config.dilation=True.") - parser.add_argument("--with_box_refine", action="store_true", help="Whether to set config.with_box_refine=True.") - parser.add_argument("--two_stage", action="store_true", help="Whether to set config.two_stage=True.") - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the folder to output PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - args = parser.parse_args() - convert_deformable_detr_checkpoint( - args.checkpoint_path, - args.single_scale, - args.dilation, - args.with_box_refine, - args.two_stage, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/deit/convert_deit_timm_to_pytorch.py b/src/transformers/models/deit/convert_deit_timm_to_pytorch.py deleted file mode 100644 index e7bf3e7a12e8..000000000000 --- a/src/transformers/models/deit/convert_deit_timm_to_pytorch.py +++ /dev/null @@ -1,218 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
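# A minimal usage sketch for a checkpoint produced by the removed script below, assuming the
# "facebook/deit-base-distilled-patch16-224" hub repo id (an assumption, not taken from this file):

import requests
import torch
from PIL import Image

from transformers import DeiTForImageClassificationWithTeacher, DeiTImageProcessor

processor = DeiTImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
model = DeiTForImageClassificationWithTeacher.from_pretrained("facebook/deit-base-distilled-patch16-224")

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # average of the class and distillation head logits
print(model.config.id2label[logits.argmax(-1).item()])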
-"""Convert DeiT distilled checkpoints from the timm library.""" - -import argparse -import json -from pathlib import Path - -import requests -import timm -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DeiTConfig, DeiTForImageClassificationWithTeacher, DeiTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, base_model=False): - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"deit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"deit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"deit.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"deit.encoder.layer.{i}.attention.output.dense.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"deit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"deit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"deit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"deit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"deit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"deit.encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - ("cls_token", "deit.embeddings.cls_token"), - ("dist_token", "deit.embeddings.distillation_token"), - ("patch_embed.proj.weight", "deit.embeddings.patch_embeddings.projection.weight"), - ("patch_embed.proj.bias", "deit.embeddings.patch_embeddings.projection.bias"), - ("pos_embed", "deit.embeddings.position_embeddings"), - ] - ) - - if base_model: - # layernorm + pooler - rename_keys.extend( - [ - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ("pre_logits.fc.weight", "pooler.dense.weight"), - ("pre_logits.fc.bias", "pooler.dense.bias"), - ] - ) - - # if just the base model, we should remove "deit" from all keys that start with "deit" - rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("deit") else pair for pair in rename_keys] - else: - # layernorm + classification heads - rename_keys.extend( - [ - ("norm.weight", "deit.layernorm.weight"), - ("norm.bias", "deit.layernorm.bias"), - ("head.weight", "cls_classifier.weight"), - ("head.bias", "cls_classifier.bias"), - ("head_dist.weight", "distillation_classifier.weight"), - ("head_dist.bias", "distillation_classifier.bias"), - ] - ) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, base_model=False): - for i in range(config.num_hidden_layers): - if base_model: - prefix = "" - else: - prefix = "deit." 
- # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_deit_checkpoint(deit_name, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our DeiT structure. - """ - - # define default DeiT configuration - config = DeiTConfig() - # all deit models have fine-tuned heads - base_model = False - # dataset (fine-tuned on ImageNet 2012), patch_size and image_size - config.num_labels = 1000 - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.patch_size = int(deit_name[-6:-4]) - config.image_size = int(deit_name[-3:]) - # size of the architecture - if deit_name[9:].startswith("tiny"): - config.hidden_size = 192 - config.intermediate_size = 768 - config.num_hidden_layers = 12 - config.num_attention_heads = 3 - elif deit_name[9:].startswith("small"): - config.hidden_size = 384 - config.intermediate_size = 1536 - config.num_hidden_layers = 12 - config.num_attention_heads = 6 - if deit_name[9:].startswith("base"): - pass - elif deit_name[4:].startswith("large"): - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - - # load original model from timm - timm_model = timm.create_model(deit_name, pretrained=True) - timm_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = timm_model.state_dict() - rename_keys = create_rename_keys(config, base_model) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, base_model) - - # load HuggingFace model - model = DeiTForImageClassificationWithTeacher(config).eval() - model.load_state_dict(state_dict) - - # Check outputs on an image, prepared by DeiTImageProcessor - size = int( - (256 / 224) * config.image_size - ) # to maintain same ratio w.r.t. 
224 images, see https://github.com/facebookresearch/deit/blob/ab5715372db8c6cad5740714b2216d55aeae052e/datasets.py#L103 - image_processor = DeiTImageProcessor(size=size, crop_size=config.image_size) - encoding = image_processor(images=prepare_img(), return_tensors="pt") - pixel_values = encoding["pixel_values"] - outputs = model(pixel_values) - - timm_logits = timm_model(pixel_values) - assert timm_logits.shape == outputs.logits.shape - assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {deit_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--deit_name", - default="vit_deit_base_distilled_patch16_224", - type=str, - help="Name of the DeiT timm model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - - args = parser.parse_args() - convert_deit_checkpoint(args.deit_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py deleted file mode 100644 index e2f64e9c3cd1..000000000000 --- a/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py +++ /dev/null @@ -1,318 +0,0 @@ -# coding=utf-8 -# Copyright 2020, The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Bort checkpoint.""" - -import argparse -import os - -import gluonnlp as nlp -import mxnet as mx -import numpy as np -import torch -from gluonnlp.base import get_home_dir -from gluonnlp.model.bert import BERTEncoder -from gluonnlp.model.utils import _load_vocab -from gluonnlp.vocab import Vocab -from packaging import version -from torch import nn - -from transformers import BertConfig, BertForMaskedLM, BertModel, RobertaTokenizer -from transformers.models.bert.modeling_bert import ( - BertIntermediate, - BertLayer, - BertOutput, - BertSelfAttention, - BertSelfOutput, -) -from transformers.utils import logging - - -if version.parse(nlp.__version__) != version.parse("0.8.3"): - raise Exception("requires gluonnlp == 0.8.3") - -if version.parse(mx.__version__) != version.parse("1.5.0"): - raise Exception("requires mxnet == 1.5.0") - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_TEXT = "The Nymphenburg Palace is a beautiful palace in Munich!" 
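# Naming note, derived from the hyperparameter mapping in the function below: gluonnlp's
# "units"/"embed_size" (1024) become the HF hidden_size, while gluonnlp's "hidden_size" (768)
# is the feed-forward width and becomes the HF intermediate_size, so the two names are swapped
# relative to their usual BertConfig meaning.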
- - -def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_folder_path: str): - """ - Convert the original Bort checkpoint (based on MXNET and Gluonnlp) to our BERT structure- - """ - - # Original Bort configuration - bort_4_8_768_1024_hparams = { - "attention_cell": "multi_head", - "num_layers": 4, - "units": 1024, - "hidden_size": 768, - "max_length": 512, - "num_heads": 8, - "scaled": True, - "dropout": 0.1, - "use_residual": True, - "embed_size": 1024, - "embed_dropout": 0.1, - "word_embed": None, - "layer_norm_eps": 1e-5, - "token_type_vocab_size": 2, - } - - predefined_args = bort_4_8_768_1024_hparams - - # Let's construct the original Bort model here - # Taken from official BERT implementation, see: - # https://github.com/alexa/bort/blob/master/bort/bort.py - encoder = BERTEncoder( - attention_cell=predefined_args["attention_cell"], - num_layers=predefined_args["num_layers"], - units=predefined_args["units"], - hidden_size=predefined_args["hidden_size"], - max_length=predefined_args["max_length"], - num_heads=predefined_args["num_heads"], - scaled=predefined_args["scaled"], - dropout=predefined_args["dropout"], - output_attention=False, - output_all_encodings=False, - use_residual=predefined_args["use_residual"], - activation=predefined_args.get("activation", "gelu"), - layer_norm_eps=predefined_args.get("layer_norm_eps", None), - ) - - # Vocab information needs to be fetched first - # It's the same as RoBERTa, so RobertaTokenizer can be used later - vocab_name = "openwebtext_ccnews_stories_books_cased" - - # Specify download folder to Gluonnlp's vocab - gluon_cache_dir = os.path.join(get_home_dir(), "models") - bort_vocab = _load_vocab(vocab_name, None, gluon_cache_dir, cls=Vocab) - - original_bort = nlp.model.BERTModel( - encoder, - len(bort_vocab), - units=predefined_args["units"], - embed_size=predefined_args["embed_size"], - embed_dropout=predefined_args["embed_dropout"], - word_embed=predefined_args["word_embed"], - use_pooler=False, - use_token_type_embed=False, - token_type_vocab_size=predefined_args["token_type_vocab_size"], - use_classifier=False, - use_decoder=False, - ) - - original_bort.load_parameters(bort_checkpoint_path, cast_dtype=True, ignore_extra=True) - params = original_bort._collect_params_with_prefix() - - # Build our config đŸ€— - hf_bort_config_json = { - "architectures": ["BertForMaskedLM"], - "attention_probs_dropout_prob": predefined_args["dropout"], - "hidden_act": "gelu", - "hidden_dropout_prob": predefined_args["dropout"], - "hidden_size": predefined_args["embed_size"], - "initializer_range": 0.02, - "intermediate_size": predefined_args["hidden_size"], - "layer_norm_eps": predefined_args["layer_norm_eps"], - "max_position_embeddings": predefined_args["max_length"], - "model_type": "bort", - "num_attention_heads": predefined_args["num_heads"], - "num_hidden_layers": predefined_args["num_layers"], - "pad_token_id": 1, # 2 = BERT, 1 = RoBERTa - "type_vocab_size": 1, # 2 = BERT, 1 = RoBERTa - "vocab_size": len(bort_vocab), - } - - hf_bort_config = BertConfig.from_dict(hf_bort_config_json) - hf_bort_model = BertForMaskedLM(hf_bort_config) - hf_bort_model.eval() - - # Parameter mapping table (Gluonnlp to Transformers) - # * denotes layer index - # - # | Gluon Parameter | Transformers Parameter - # | -------------------------------------------------------------- | ---------------------- - # | `encoder.layer_norm.beta` | `bert.embeddings.LayerNorm.bias` - # | `encoder.layer_norm.gamma` | `bert.embeddings.LayerNorm.weight` - # | 
`encoder.position_weight` | `bert.embeddings.position_embeddings.weight` - # | `word_embed.0.weight` | `bert.embeddings.word_embeddings.weight` - # | `encoder.transformer_cells.*.attention_cell.proj_key.bias` | `bert.encoder.layer.*.attention.self.key.bias` - # | `encoder.transformer_cells.*.attention_cell.proj_key.weight` | `bert.encoder.layer.*.attention.self.key.weight` - # | `encoder.transformer_cells.*.attention_cell.proj_query.bias` | `bert.encoder.layer.*.attention.self.query.bias` - # | `encoder.transformer_cells.*.attention_cell.proj_query.weight` | `bert.encoder.layer.*.attention.self.query.weight` - # | `encoder.transformer_cells.*.attention_cell.proj_value.bias` | `bert.encoder.layer.*.attention.self.value.bias` - # | `encoder.transformer_cells.*.attention_cell.proj_value.weight` | `bert.encoder.layer.*.attention.self.value.weight` - # | `encoder.transformer_cells.*.ffn.ffn_2.bias` | `bert.encoder.layer.*.attention.output.dense.bias` - # | `encoder.transformer_cells.*.ffn.ffn_2.weight` | `bert.encoder.layer.*.attention.output.dense.weight` - # | `encoder.transformer_cells.*.layer_norm.beta` | `bert.encoder.layer.*.attention.output.LayerNorm.bias` - # | `encoder.transformer_cells.*.layer_norm.gamma` | `bert.encoder.layer.*.attention.output.LayerNorm.weight` - # | `encoder.transformer_cells.*.ffn.ffn_1.bias` | `bert.encoder.layer.*.intermediate.dense.bias` - # | `encoder.transformer_cells.*.ffn.ffn_1.weight` | `bert.encoder.layer.*.intermediate.dense.weight` - # | `encoder.transformer_cells.*.ffn.layer_norm.beta` | `bert.encoder.layer.*.output.LayerNorm.bias` - # | `encoder.transformer_cells.*.ffn.layer_norm.gamma` | `bert.encoder.layer.*.output.LayerNorm.weight` - # | `encoder.transformer_cells.*.proj.bias` | `bert.encoder.layer.*.output.dense.bias` - # | `encoder.transformer_cells.*.proj.weight` | `bert.encoder.layer.*.output.dense.weight` - - # Helper function to convert MXNET Arrays to PyTorch - def to_torch(mx_array) -> nn.Parameter: - return nn.Parameter(torch.FloatTensor(mx_array.data().asnumpy())) - - # Check param shapes and map new HF param back - def check_and_map_params(hf_param, gluon_param): - shape_hf = hf_param.shape - - gluon_param = to_torch(params[gluon_param]) - shape_gluon = gluon_param.shape - - assert ( - shape_hf == shape_gluon - ), f"The gluon parameter {gluon_param} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers" - - return gluon_param - - hf_bort_model.bert.embeddings.word_embeddings.weight = check_and_map_params( - hf_bort_model.bert.embeddings.word_embeddings.weight, "word_embed.0.weight" - ) - hf_bort_model.bert.embeddings.position_embeddings.weight = check_and_map_params( - hf_bort_model.bert.embeddings.position_embeddings.weight, "encoder.position_weight" - ) - hf_bort_model.bert.embeddings.LayerNorm.bias = check_and_map_params( - hf_bort_model.bert.embeddings.LayerNorm.bias, "encoder.layer_norm.beta" - ) - hf_bort_model.bert.embeddings.LayerNorm.weight = check_and_map_params( - hf_bort_model.bert.embeddings.LayerNorm.weight, "encoder.layer_norm.gamma" - ) - - # Inspired by RoBERTa conversion script, we just zero them out (Bort does not use them) - hf_bort_model.bert.embeddings.token_type_embeddings.weight.data = torch.zeros_like( - hf_bort_model.bert.embeddings.token_type_embeddings.weight.data - ) - - for i in range(hf_bort_config.num_hidden_layers): - layer: BertLayer = hf_bort_model.bert.encoder.layer[i] - - # self attention - self_attn: BertSelfAttention = layer.attention.self - - self_attn.key.bias.data = 
check_and_map_params( - self_attn.key.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.bias" - ) - - self_attn.key.weight.data = check_and_map_params( - self_attn.key.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.weight" - ) - self_attn.query.bias.data = check_and_map_params( - self_attn.query.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.bias" - ) - self_attn.query.weight.data = check_and_map_params( - self_attn.query.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.weight" - ) - self_attn.value.bias.data = check_and_map_params( - self_attn.value.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.bias" - ) - self_attn.value.weight.data = check_and_map_params( - self_attn.value.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.weight" - ) - - # self attention output - self_output: BertSelfOutput = layer.attention.output - - self_output.dense.bias = check_and_map_params( - self_output.dense.bias, f"encoder.transformer_cells.{i}.proj.bias" - ) - self_output.dense.weight = check_and_map_params( - self_output.dense.weight, f"encoder.transformer_cells.{i}.proj.weight" - ) - self_output.LayerNorm.bias = check_and_map_params( - self_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.layer_norm.beta" - ) - self_output.LayerNorm.weight = check_and_map_params( - self_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.layer_norm.gamma" - ) - - # intermediate - intermediate: BertIntermediate = layer.intermediate - - intermediate.dense.bias = check_and_map_params( - intermediate.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_1.bias" - ) - intermediate.dense.weight = check_and_map_params( - intermediate.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_1.weight" - ) - - # output - bert_output: BertOutput = layer.output - - bert_output.dense.bias = check_and_map_params( - bert_output.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_2.bias" - ) - bert_output.dense.weight = check_and_map_params( - bert_output.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_2.weight" - ) - bert_output.LayerNorm.bias = check_and_map_params( - bert_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.ffn.layer_norm.beta" - ) - bert_output.LayerNorm.weight = check_and_map_params( - bert_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.ffn.layer_norm.gamma" - ) - - # Save space and energy 🎄 - hf_bort_model.half() - - # Compare output of both models - tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base") - - input_ids = tokenizer.encode_plus(SAMPLE_TEXT)["input_ids"] - - # Get gluon output - gluon_input_ids = mx.nd.array([input_ids]) - output_gluon = original_bort(inputs=gluon_input_ids, token_types=[]) - - # Get Transformer output (save and reload model again) - hf_bort_model.save_pretrained(pytorch_dump_folder_path) - hf_bort_model = BertModel.from_pretrained(pytorch_dump_folder_path) - hf_bort_model.eval() - - input_ids = tokenizer.encode_plus(SAMPLE_TEXT, return_tensors="pt") - output_hf = hf_bort_model(**input_ids)[0] - - gluon_layer = output_gluon[0].asnumpy() - hf_layer = output_hf[0].detach().numpy() - - max_absolute_diff = np.max(np.abs(hf_layer - gluon_layer)).item() - success = np.allclose(gluon_layer, hf_layer, atol=1e-3) - - if success: - print("✔ Both model do output the same tensors") - else: - print("❌ Both model do **NOT** output the same tensors") - print("Absolute difference is:", max_absolute_diff) - - -if __name__ == "__main__": 
- parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--bort_checkpoint_path", default=None, type=str, required=True, help="Path the official Bort params file." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_bort_checkpoint_to_pytorch(args.bort_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py b/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py deleted file mode 100644 index 60e93efe7c60..000000000000 --- a/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py +++ /dev/null @@ -1,319 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DETA checkpoints from the original repository. - -URL: https://github.com/jozhang97/DETA/tree/master""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_deta_config(): - config = DetaConfig( - num_queries=900, - encoder_ffn_dim=2048, - decoder_ffn_dim=2048, - num_feature_levels=5, - assign_first_stage=True, - with_box_refine=True, - two_stage=True, - ) - - # set labels - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - - # stem - # fmt: off - rename_keys.append(("backbone.0.body.conv1.weight", "model.backbone.model.embedder.embedder.convolution.weight")) - rename_keys.append(("backbone.0.body.bn1.weight", "model.backbone.model.embedder.embedder.normalization.weight")) - rename_keys.append(("backbone.0.body.bn1.bias", "model.backbone.model.embedder.embedder.normalization.bias")) - rename_keys.append(("backbone.0.body.bn1.running_mean", "model.backbone.model.embedder.embedder.normalization.running_mean")) - rename_keys.append(("backbone.0.body.bn1.running_var", "model.backbone.model.embedder.embedder.normalization.running_var")) - # stages - for stage_idx in range(len(config.backbone_config.depths)): - for layer_idx in range(config.backbone_config.depths[stage_idx]): - # shortcut - if layer_idx == 0: - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.0.weight", - 
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.weight", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.bias", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_mean", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_var", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_var", - ) - ) - # 3 convs - for i in range(3): - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.conv{i+1}.weight", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.weight", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.bias", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_mean", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_var", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_var", - ) - ) - # transformer encoder - for i in range(config.encoder_layers): - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", 
f"model.encoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias")) - - # transformer decoder - for i in range(config.decoder_layers): - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias")) - - # fmt: on - - return rename_keys - - -def 
rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def read_in_decoder_q_k_v(state_dict, config): - # transformer decoder self-attention layers - hidden_size = config.d_model - for i in range(config.decoder_layers): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): - """ - Copy/paste/tweak model's weights to our DETA structure. - """ - - # load config - config = get_deta_config() - - # load original state dict - if model_name == "deta-resnet-50": - filename = "adet_checkpoint0011.pth" - elif model_name == "deta-resnet-50-24-epochs": - filename = "adet_2x_checkpoint0023.pth" - else: - raise ValueError(f"Model name {model_name} not supported") - checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename=filename) - state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] - - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_decoder_q_k_v(state_dict, config) - - # fix some prefixes - for key in state_dict.copy().keys(): - if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key: - val = state_dict.pop(key) - state_dict[key.replace("transformer.decoder", "model.decoder")] = val - if "input_proj" in key: - val = state_dict.pop(key) - state_dict["model." 
+ key] = val - if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key: - val = state_dict.pop(key) - state_dict[key.replace("transformer", "model")] = val - - # finally, create HuggingFace model and load state dict - model = DetaForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - - # load image processor - processor = DetaImageProcessor(format="coco_detection") - - # verify our conversion on image - img = prepare_img() - encoding = processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - outputs = model(pixel_values.to(device)) - - # verify logits - if model_name == "deta-resnet-50": - expected_logits = torch.tensor( - [[-7.3978, -2.5406, -4.1668], [-8.2684, -3.9933, -3.8096], [-7.0515, -3.7973, -5.8516]] - ) - expected_boxes = torch.tensor([[0.5043, 0.4973, 0.9998], [0.2542, 0.5489, 0.4748], [0.5490, 0.2765, 0.0570]]) - elif model_name == "deta-resnet-50-24-epochs": - expected_logits = torch.tensor( - [[-7.1688, -2.4857, -4.8669], [-7.8630, -3.8154, -4.2674], [-7.2730, -4.1865, -5.5323]] - ) - expected_boxes = torch.tensor([[0.5021, 0.4971, 0.9994], [0.2546, 0.5486, 0.4731], [0.1686, 0.1986, 0.2142]]) - - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) - print("Everything ok!") - - if pytorch_dump_folder_path: - # Save model and processor - logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - # Push to hub - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(f"jozhang97/{model_name}") - processor.push_to_hub(f"jozhang97/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", - type=str, - default="deta-resnet-50", - choices=["deta-resnet-50", "deta-resnet-50-24-epochs"], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the folder to output PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - args = parser.parse_args() - convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py b/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py deleted file mode 100644 index 392750fa67a1..000000000000 --- a/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py +++ /dev/null @@ -1,326 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DETA checkpoints from the original repository. - -URL: https://github.com/jozhang97/DETA/tree/master""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor, SwinConfig -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_deta_config(model_name): - backbone_config = SwinConfig( - embed_dim=192, - depths=(2, 2, 18, 2), - num_heads=(6, 12, 24, 48), - window_size=12, - out_features=["stage2", "stage3", "stage4"], - ) - - config = DetaConfig( - backbone_config=backbone_config, - num_queries=900, - encoder_ffn_dim=2048, - decoder_ffn_dim=2048, - num_feature_levels=5, - assign_first_stage=True, - with_box_refine=True, - two_stage=True, - ) - - # set labels - repo_id = "huggingface/label-files" - if "o365" in model_name: - num_labels = 366 - filename = "object365-id2label.json" - else: - num_labels = 91 - filename = "coco-detection-id2label.json" - - config.num_labels = num_labels - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - - # stem - # fmt: off - rename_keys.append(("backbone.0.body.patch_embed.proj.weight", "model.backbone.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("backbone.0.body.patch_embed.proj.bias", "model.backbone.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("backbone.0.body.patch_embed.norm.weight", "model.backbone.model.embeddings.norm.weight")) - rename_keys.append(("backbone.0.body.patch_embed.norm.bias", "model.backbone.model.embeddings.norm.bias")) - # stages - for i in range(len(config.backbone_config.depths)): - for j in range(config.backbone_config.depths[i]): - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.bias")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_bias_table", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_bias_table")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_index", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_index")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.bias")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.weight")) - 
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.bias")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.bias")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.bias")) - - if i < 3: - rename_keys.append((f"backbone.0.body.layers.{i}.downsample.reduction.weight", f"model.backbone.model.encoder.layers.{i}.downsample.reduction.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.weight", f"model.backbone.model.encoder.layers.{i}.downsample.norm.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.bias", f"model.backbone.model.encoder.layers.{i}.downsample.norm.bias")) - - rename_keys.append(("backbone.0.body.norm1.weight", "model.backbone.model.hidden_states_norms.stage2.weight")) - rename_keys.append(("backbone.0.body.norm1.bias", "model.backbone.model.hidden_states_norms.stage2.bias")) - rename_keys.append(("backbone.0.body.norm2.weight", "model.backbone.model.hidden_states_norms.stage3.weight")) - rename_keys.append(("backbone.0.body.norm2.bias", "model.backbone.model.hidden_states_norms.stage3.bias")) - rename_keys.append(("backbone.0.body.norm3.weight", "model.backbone.model.hidden_states_norms.stage4.weight")) - rename_keys.append(("backbone.0.body.norm3.bias", "model.backbone.model.hidden_states_norms.stage4.bias")) - - # transformer encoder - for i in range(config.encoder_layers): - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"model.encoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight")) - 
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias")) - - # transformer decoder - for i in range(config.decoder_layers): - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias")) - - # fmt: on - - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def 
read_in_swin_q_k_v(state_dict, backbone_config): - num_features = [int(backbone_config.embed_dim * 2**i) for i in range(len(backbone_config.depths))] - for i in range(len(backbone_config.depths)): - dim = num_features[i] - for j in range(backbone_config.depths[i]): - # fmt: off - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.bias"] = in_proj_bias[: dim] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[ - dim : dim * 2, : - ] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.bias"] = in_proj_bias[ - dim : dim * 2 - ] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[ - -dim :, : - ] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.bias"] = in_proj_bias[-dim :] - # fmt: on - - -def read_in_decoder_q_k_v(state_dict, config): - # transformer decoder self-attention layers - hidden_size = config.d_model - for i in range(config.decoder_layers): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): - """ - Copy/paste/tweak model's weights to our DETA structure. 
- """ - - # load config - config = get_deta_config(model_name) - - # load original state dict - if model_name == "deta-swin-large": - checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename="adet_swin_ft.pth") - elif model_name == "deta-swin-large-o365": - checkpoint_path = hf_hub_download(repo_id="jozhang97/deta-swin-l-o365", filename="deta_swin_pt_o365.pth") - else: - raise ValueError(f"Model name {model_name} not supported") - - state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] - - # original state dict - for name, param in state_dict.items(): - print(name, param.shape) - - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_swin_q_k_v(state_dict, config.backbone_config) - read_in_decoder_q_k_v(state_dict, config) - - # fix some prefixes - for key in state_dict.copy().keys(): - if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key: - val = state_dict.pop(key) - state_dict[key.replace("transformer.decoder", "model.decoder")] = val - if "input_proj" in key: - val = state_dict.pop(key) - state_dict["model." + key] = val - if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key: - val = state_dict.pop(key) - state_dict[key.replace("transformer", "model")] = val - - # finally, create HuggingFace model and load state dict - model = DetaForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - - # load image processor - processor = DetaImageProcessor(format="coco_detection") - - # verify our conversion on image - img = prepare_img() - encoding = processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - outputs = model(pixel_values.to(device)) - - # verify logits - print("Logits:", outputs.logits[0, :3, :3]) - print("Boxes:", outputs.pred_boxes[0, :3, :3]) - if model_name == "deta-swin-large": - expected_logits = torch.tensor( - [[-7.6308, -2.8485, -5.3737], [-7.2037, -4.5505, -4.8027], [-7.2943, -4.2611, -4.6617]] - ) - expected_boxes = torch.tensor([[0.4987, 0.4969, 0.9999], [0.2549, 0.5498, 0.4805], [0.5498, 0.2757, 0.0569]]) - elif model_name == "deta-swin-large-o365": - expected_logits = torch.tensor( - [[-8.0122, -3.5720, -4.9717], [-8.1547, -3.6886, -4.6389], [-7.6610, -3.6194, -5.0134]] - ) - expected_boxes = torch.tensor([[0.2523, 0.5549, 0.4881], [0.7715, 0.4149, 0.4601], [0.5503, 0.2753, 0.0575]]) - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) - print("Everything ok!") - - if pytorch_dump_folder_path: - # Save model and processor - logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - # Push to hub - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(f"jozhang97/{model_name}") - processor.push_to_hub(f"jozhang97/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", - type=str, - default="deta-swin-large", - choices=["deta-swin-large", "deta-swin-large-o365"], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - 
"--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the folder to output PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - args = parser.parse_args() - convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 7431cd6136a5..000000000000 --- a/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,252 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert EfficientFormer checkpoints from the original repository. - -URL: https://github.com/snap-research/EfficientFormer -""" - -import argparse -import re -from pathlib import Path - -import requests -import torch -from PIL import Image -from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor - -from transformers import ( - EfficientFormerConfig, - EfficientFormerForImageClassificationWithTeacher, - EfficientFormerImageProcessor, -) -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling - - -def rename_key(old_name, num_meta4D_last_stage): - new_name = old_name - - if "patch_embed" in old_name: - _, layer, param = old_name.split(".") - - if layer == "0": - new_name = old_name.replace("0", "convolution1") - elif layer == "1": - new_name = old_name.replace("1", "batchnorm_before") - elif layer == "3": - new_name = old_name.replace("3", "convolution2") - else: - new_name = old_name.replace("4", "batchnorm_after") - - if "network" in old_name and re.search(r"\d\.\d", old_name): - two_digit_num = r"\b\d{2}\b" - if bool(re.search(two_digit_num, old_name)): - match = re.search(r"\d\.\d\d.", old_name).group() - else: - match = re.search(r"\d\.\d.", old_name).group() - if int(match[0]) < 6: - trimmed_name = old_name.replace(match, "") - trimmed_name = trimmed_name.replace("network", match[0] + ".meta4D_layers.blocks." + match[2:-1]) - new_name = "intermediate_stages." + trimmed_name - else: - trimmed_name = old_name.replace(match, "") - if int(match[2]) < num_meta4D_last_stage: - trimmed_name = trimmed_name.replace("network", "meta4D_layers.blocks." + match[2]) - else: - layer_index = str(int(match[2]) - num_meta4D_last_stage) - trimmed_name = trimmed_name.replace("network", "meta3D_layers.blocks." 
+ layer_index) - if "norm1" in old_name: - trimmed_name = trimmed_name.replace("norm1", "layernorm1") - elif "norm2" in old_name: - trimmed_name = trimmed_name.replace("norm2", "layernorm2") - elif "fc1" in old_name: - trimmed_name = trimmed_name.replace("fc1", "linear_in") - elif "fc2" in old_name: - trimmed_name = trimmed_name.replace("fc2", "linear_out") - - new_name = "last_stage." + trimmed_name - - elif "network" in old_name and re.search(r".\d.", old_name): - new_name = old_name.replace("network", "intermediate_stages") - - if "fc" in new_name: - new_name = new_name.replace("fc", "convolution") - elif ("norm1" in new_name) and ("layernorm1" not in new_name): - new_name = new_name.replace("norm1", "batchnorm_before") - elif ("norm2" in new_name) and ("layernorm2" not in new_name): - new_name = new_name.replace("norm2", "batchnorm_after") - if "proj" in new_name: - new_name = new_name.replace("proj", "projection") - if "dist_head" in new_name: - new_name = new_name.replace("dist_head", "distillation_classifier") - elif "head" in new_name: - new_name = new_name.replace("head", "classifier") - elif "patch_embed" in new_name: - new_name = "efficientformer." + new_name - elif new_name == "norm.weight" or new_name == "norm.bias": - new_name = new_name.replace("norm", "layernorm") - new_name = "efficientformer." + new_name - else: - new_name = "efficientformer.encoder." + new_name - - return new_name - - -def convert_torch_checkpoint(checkpoint, num_meta4D_last_stage): - for key in checkpoint.copy().keys(): - val = checkpoint.pop(key) - checkpoint[rename_key(key, num_meta4D_last_stage)] = val - - return checkpoint - - -# We will verify our results on a COCO image -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - return image - - -def convert_efficientformer_checkpoint( - checkpoint_path: Path, efficientformer_config_file: Path, pytorch_dump_path: Path, push_to_hub: bool -): - orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] - config = EfficientFormerConfig.from_json_file(efficientformer_config_file) - model = EfficientFormerForImageClassificationWithTeacher(config) - model_name = "_".join(checkpoint_path.split("/")[-1].split(".")[0].split("_")[:-1]) - - num_meta4D_last_stage = config.depths[-1] - config.num_meta3d_blocks + 1 - new_state_dict = convert_torch_checkpoint(orig_state_dict, num_meta4D_last_stage) - - model.load_state_dict(new_state_dict) - model.eval() - - pillow_resamplings = { - "bilinear": PILImageResampling.BILINEAR, - "bicubic": PILImageResampling.BICUBIC, - "nearest": PILImageResampling.NEAREST, - } - - # prepare image - image = prepare_img() - image_size = 256 - crop_size = 224 - processor = EfficientFormerImageProcessor( - size={"shortest_edge": image_size}, - crop_size={"height": crop_size, "width": crop_size}, - resample=pillow_resamplings["bicubic"], - ) - pixel_values = processor(images=image, return_tensors="pt").pixel_values - - # original processing pipeline - image_transforms = Compose( - [ - Resize(image_size, interpolation=pillow_resamplings["bicubic"]), - CenterCrop(crop_size), - ToTensor(), - Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ] - ) - original_pixel_values = image_transforms(image).unsqueeze(0) - - assert torch.allclose(original_pixel_values, pixel_values) - - outputs = model(pixel_values) - logits = outputs.logits - - expected_shape = (1, 1000) - - if "l1" in model_name: - expected_logits = torch.Tensor( - [-0.1312, 
0.4353, -1.0499, -0.5124, 0.4183, -0.6793, -1.3777, -0.0893, -0.7358, -2.4328] - ) - assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3) - assert logits.shape == expected_shape - elif "l3" in model_name: - expected_logits = torch.Tensor( - [-1.3150, -1.5456, -1.2556, -0.8496, -0.7127, -0.7897, -0.9728, -0.3052, 0.3751, -0.3127] - ) - assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3) - assert logits.shape == expected_shape - elif "l7" in model_name: - expected_logits = torch.Tensor( - [-1.0283, -1.4131, -0.5644, -1.3115, -0.5785, -1.2049, -0.7528, 0.1992, -0.3822, -0.0878] - ) - assert logits.shape == expected_shape - else: - raise ValueError( - f"Unknown model checkpoint: {checkpoint_path}. Supported version of efficientformer are l1, l3 and l7" - ) - - # Save Checkpoints - Path(pytorch_dump_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_path) - print(f"Checkpoint successfuly converted. Model saved at {pytorch_dump_path}") - processor.save_pretrained(pytorch_dump_path) - print(f"Processor successfuly saved at {pytorch_dump_path}") - - if push_to_hub: - print("Pushing model to the hub...") - - model.push_to_hub( - repo_id=f"Bearnardd/{pytorch_dump_path}", - commit_message="Add model", - use_temp_dir=True, - ) - processor.push_to_hub( - repo_id=f"Bearnardd/{pytorch_dump_path}", - commit_message="Add image processor", - use_temp_dir=True, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--pytorch_model_path", - default=None, - type=str, - required=True, - help="Path to EfficientFormer pytorch checkpoint.", - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help="The json file for EfficientFormer model config.", - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub") - parser.add_argument( - "--no-push_to_hub", - dest="push_to_hub", - action="store_false", - help="Do not push model and image processor to the hub", - ) - parser.set_defaults(push_to_hub=True) - - args = parser.parse_args() - convert_efficientformer_checkpoint( - checkpoint_path=args.pytorch_model_path, - efficientformer_config_file=args.config_file, - pytorch_dump_path=args.pytorch_dump_path, - push_to_hub=args.push_to_hub, - ) diff --git a/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py deleted file mode 100644 index a84d000d4439..000000000000 --- a/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,181 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
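The scripts above typically close with the same sanity check: run the converted model on a fixed input and compare a slice of its logits against reference values taken from the original implementation, using torch.allclose with a small tolerance. A generic sketch of that check, with a hypothetical model object and made-up reference numbers:

import torch

@torch.no_grad()
def verify_conversion(model, pixel_values, expected_slice, atol=1e-3):
    # expected_slice holds reference logits copied from the original implementation.
    logits = model(pixel_values).logits
    actual = logits[0, : expected_slice.numel()]
    if not torch.allclose(actual, expected_slice, atol=atol):
        raise ValueError(f"Converted logits differ from the reference by more than {atol}: {actual} vs {expected_slice}")
    print("Converted logits match the reference values.")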
- -"""Convert GPTSANJapanese checkpoints from the original repository to pytorch model.""" - -import argparse -import json -import os -from collections import OrderedDict - -import numpy as np -import tensorflow as tf -import torch - - -def convert_tf_gptsan_to_pt(args): - parameter_file = os.path.join(args.tf_model_dir, "parameters.json") - params = json.loads(open(parameter_file).read()) - if not params: - raise ValueError( - f"It seems that the json file at {parameter_file} is empty. Make sure you have a correct json file." - ) - if not args.output.endswith(".pt"): - args.output = args.output + ".pt" - new_state = OrderedDict() - with tf.device("/CPU:0"): - reader = tf.train.load_checkpoint(args.tf_model_dir) - shapes = reader.get_variable_to_shape_map() - for key_name in shapes.keys(): - vnp = reader.get_tensor(key_name).astype(np.float16) - if key_name.endswith("/adam_m") or key_name.endswith("/adam_v"): - continue - if key_name.startswith("pasts/"): - if key_name.startswith("pasts/mlp"): - player = int(key_name[9]) - elif key_name.startswith("pasts/out"): - player = 8 - name = "model.sqout.%d.weight" % (player * 2) # enter to nn.Sequencial with Tanh, so 2 at a time - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/moe"): - player = int(key_name[9:].split("/")[0]) - if key_name.endswith("/switch_gating/kernel"): - name = "model.blocks.%d.feed_forward.mlp.router.classifier.weight" % player - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.endswith("/softmlp/kernel"): - name = "model.blocks.%d.feed_forward.soft_bypass_mlp.weight" % player - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.endswith("/wo/kernel") or key_name.endswith("/wi/kernel"): - nlayer = key_name[-9:-7] - for i in range(16): - name = "model.blocks.%d.feed_forward.mlp.experts.expert_%d.%s.weight" % (player, i, nlayer) - state = ( - vnp[i].transpose([1, 0]).copy() - ) # In Mesh-Tensorflow, it is one array, so it is divided - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/mlp"): - player = int(key_name[9:].split("/")[0]) - if key_name.endswith("/p1/kernel"): - name = "model.blocks.%d.feed_forward.mlp.wi.weight" % player - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.endswith("/p1/bias"): - name = "model.blocks.%d.feed_forward.mlp.wi.bias" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.endswith("/p2/kernel"): - name = "model.blocks.%d.feed_forward.mlp.wo.weight" % player - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.endswith("/p2/bias"): - name = "model.blocks.%d.feed_forward.mlp.wo.bias" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/ln"): - player = int(key_name[8:].split("/")[0]) - if key_name.endswith("/b"): - name = "model.blocks.%d.feed_forward.norm.bias" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.endswith("/g"): - name = "model.blocks.%d.feed_forward.norm.weight" % player - state = vnp.copy() # same because 
it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/att"): - player = int(key_name[9:].split("/")[0]) - if key_name.endswith("/qkv/kernel"): - state = vnp.copy() # Compute same dimension as Mesh-tensorflow using einsum - state_q = state[:, 0, :, :] - state_k = state[:, 1, :, :] - state_v = state[:, 2, :, :] - state_q = ( - state_q.reshape([state_q.shape[0], state_q.shape[1] * state_q.shape[2]]) - .transpose([1, 0]) - .copy() - ) # Mesh-Tensorflow is a diagonal matrix - state_k = ( - state_k.reshape([state_k.shape[0], state_k.shape[1] * state_k.shape[2]]) - .transpose([1, 0]) - .copy() - ) # Mesh-Tensorflow is a diagonal matrix - state_v = ( - state_v.reshape([state_v.shape[0], state_v.shape[1] * state_v.shape[2]]) - .transpose([1, 0]) - .copy() - ) # Mesh-Tensorflow is a diagonal matrix - name = "model.blocks.%d.self_attn.self_attn.q_proj.weight" % player - new_state[name] = torch.tensor(state_q) - name = "model.blocks.%d.self_attn.self_attn.k_proj.weight" % player - new_state[name] = torch.tensor(state_k) - name = "model.blocks.%d.self_attn.self_attn.v_proj.weight" % player - new_state[name] = torch.tensor(state_v) - elif key_name.endswith("/o/kernel"): - name = "model.blocks.%d.self_attn.self_attn.out_proj.weight" % player - state = ( - vnp.reshape([vnp.shape[0] * vnp.shape[1], vnp.shape[2]]).transpose([1, 0]).copy() - ) # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/an"): - player = int(key_name[8:].split("/")[0]) - if key_name.endswith("/b"): - name = "model.blocks.%d.self_attn.norm.bias" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.endswith("/g"): - name = "model.blocks.%d.self_attn.norm.weight" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif ( - key_name.startswith("model/wte") - or key_name.startswith("model/wpe") - or key_name.startswith("model/ete") - ): - nlayer = {"wte": "embed_tokens", "wpe": "position_embeddings", "ete": "extra_position_embeddings"}[ - key_name[-3:] - ] - name = "model.%s.weight" % nlayer - state = vnp.copy() # same in embedded - new_state[name] = torch.tensor(state) - if key_name.startswith("model/wte"): - name = "lm_head.weight" - state = vnp.copy() # same in embedded - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/wob"): - name = "final_logits_bias" - state = vnp.copy() # same in embedded - state = state.reshape((1, -1)) - new_state[name] = torch.tensor(state) - elif key_name == "model/dense/kernel": - name = "model.last_project.weight" - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name == "model/dense_1/bias": - name = "model.last_project.bias" - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - torch.save(new_state, args.output) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="model converter.", formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--tf_model_dir", metavar="PATH", type=str, required=True, help="import model") - parser.add_argument("--output", metavar="PATH", type=str, required=True, help="output model") - args = parser.parse_args() - convert_tf_gptsan_to_pt(args) diff --git a/src/transformers/models/deprecated/jukebox/convert_jukebox.py 
b/src/transformers/models/deprecated/jukebox/convert_jukebox.py deleted file mode 100644 index b56a25c57c70..000000000000 --- a/src/transformers/models/deprecated/jukebox/convert_jukebox.py +++ /dev/null @@ -1,279 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Jukebox checkpoints""" - -import argparse -import json -import os -from pathlib import Path - -import requests -import torch - -from transformers import JukeboxConfig, JukeboxModel -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -PREFIX = "https://openaipublic.azureedge.net/jukebox/models/" -MODEL_MAPPING = { - "jukebox-1b-lyrics": [ - "5b/vqvae.pth.tar", - "5b/prior_level_0.pth.tar", - "5b/prior_level_1.pth.tar", - "1b_lyrics/prior_level_2.pth.tar", - ], - "jukebox-5b-lyrics": [ - "5b/vqvae.pth.tar", - "5b/prior_level_0.pth.tar", - "5b/prior_level_1.pth.tar", - "5b_lyrics/prior_level_2.pth.tar", - ], -} - - -def replace_key(key): - if key.endswith(".model.1.bias") and len(key.split(".")) > 10: - key = key.replace(".model.1.bias", ".conv1d_1.bias") - elif key.endswith(".model.1.weight") and len(key.split(".")) > 10: - key = key.replace(".model.1.weight", ".conv1d_1.weight") - elif key.endswith(".model.3.bias") and len(key.split(".")) > 10: - key = key.replace(".model.3.bias", ".conv1d_2.bias") - elif key.endswith(".model.3.weight") and len(key.split(".")) > 10: - key = key.replace(".model.3.weight", ".conv1d_2.weight") - - if "conditioner_blocks.0." in key: - key = key.replace("conditioner_blocks.0", "conditioner_blocks") - - if "prime_prior" in key: - key = key.replace("prime_prior", "encoder") - - if ".emb." in key and "total" not in key and "absolute" not in key and "relative" not in key: - key = key.replace(".emb.", ".") - - if key.endswith("k"): # replace vqvae.X.k with vqvae.X.codebook - return key.replace(".k", ".codebook") - if "y_emb." in key: - return key.replace("y_emb.", "metadata_embedding.") - - if "x_emb.emb." 
in key: - key = key.replace("0.x_emb.emb", "embed_tokens") - - if "prime_state_ln" in key: - return key.replace("prime_state_ln", "encoder.final_layer_norm") - if ".ln" in key: - return key.replace(".ln", ".layer_norm") - if "_ln" in key: - return key.replace("_ln", "_layer_norm") - - if "prime_state_proj" in key: - return key.replace("prime_state_proj", "encoder.proj_in") - if "prime_x_out" in key: - return key.replace("prime_x_out", "encoder.lm_head") - if "prior.x_out" in key: - return key.replace("x_out", "fc_proj_out") - if "x_emb" in key: - return key.replace("x_emb", "embed_tokens") - - return key - - -def fix_jukebox_keys(state_dict, model_state_dict, key_prefix, mapping): - new_dict = {} - import re - - re_encoder_block_conv_in = re.compile(r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)") - re_encoder_block_resnet = re.compile( - r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)" - ) - re_encoder_block_proj_out = re.compile(r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(bias|weight)") - - re_decoder_block_conv_out = re.compile(r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)") - re_decoder_block_resnet = re.compile( - r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)" - ) - re_decoder_block_proj_in = re.compile(r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(bias|weight)") - - re_prior_cond_conv_out = re.compile(r"conditioner_blocks.(\d*).cond.model.(\d*).(\d).(bias|weight)") - re_prior_cond_resnet = re.compile( - r"conditioner_blocks.(\d*).cond.model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)" - ) - re_prior_cond_proj_in = re.compile(r"conditioner_blocks.(\d*).cond.model.(\d*).(bias|weight)") - - for original_key, value in state_dict.items(): - # rename vqvae.encoder keys - if re_encoder_block_conv_in.fullmatch(original_key): - regex_match = re_encoder_block_conv_in.match(original_key) - groups = regex_match.groups() - block_index = int(groups[2]) * 2 + int(groups[3]) - re_new_key = f"encoders.{groups[0]}.level_blocks.{groups[1]}.downsample_block.{block_index}.{groups[-1]}" - key = re_encoder_block_conv_in.sub(re_new_key, original_key) - - elif re_encoder_block_resnet.fullmatch(original_key): - regex_match = re_encoder_block_resnet.match(original_key) - groups = regex_match.groups() - block_index = int(groups[2]) * 2 + int(groups[3]) - conv_index = {"1": 1, "3": 2}[groups[-2]] - prefix = f"encoders.{groups[0]}.level_blocks.{groups[1]}.downsample_block.{block_index}." 
- resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}" - re_new_key = prefix + resnet_block - key = re_encoder_block_resnet.sub(re_new_key, original_key) - - elif re_encoder_block_proj_out.fullmatch(original_key): - regex_match = re_encoder_block_proj_out.match(original_key) - groups = regex_match.groups() - re_new_key = f"encoders.{groups[0]}.level_blocks.{groups[1]}.proj_out.{groups[-1]}" - key = re_encoder_block_proj_out.sub(re_new_key, original_key) - - # rename vqvae.decoder keys - elif re_decoder_block_conv_out.fullmatch(original_key): - regex_match = re_decoder_block_conv_out.match(original_key) - groups = regex_match.groups() - block_index = int(groups[2]) * 2 + int(groups[3]) - 2 - re_new_key = f"decoders.{groups[0]}.level_blocks.{groups[1]}.upsample_block.{block_index}.{groups[-1]}" - key = re_decoder_block_conv_out.sub(re_new_key, original_key) - - elif re_decoder_block_resnet.fullmatch(original_key): - regex_match = re_decoder_block_resnet.match(original_key) - groups = regex_match.groups() - block_index = int(groups[2]) * 2 + int(groups[3]) - 2 - conv_index = {"1": 1, "3": 2}[groups[-2]] - prefix = f"decoders.{groups[0]}.level_blocks.{groups[1]}.upsample_block.{block_index}." - resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}" - re_new_key = prefix + resnet_block - key = re_decoder_block_resnet.sub(re_new_key, original_key) - - elif re_decoder_block_proj_in.fullmatch(original_key): - regex_match = re_decoder_block_proj_in.match(original_key) - groups = regex_match.groups() - re_new_key = f"decoders.{groups[0]}.level_blocks.{groups[1]}.proj_in.{groups[-1]}" - key = re_decoder_block_proj_in.sub(re_new_key, original_key) - - # rename prior cond.model to upsampler.upsample_block and resnet - elif re_prior_cond_conv_out.fullmatch(original_key): - regex_match = re_prior_cond_conv_out.match(original_key) - groups = regex_match.groups() - block_index = int(groups[1]) * 2 + int(groups[2]) - 2 - re_new_key = f"conditioner_blocks.upsampler.upsample_block.{block_index}.{groups[-1]}" - key = re_prior_cond_conv_out.sub(re_new_key, original_key) - - elif re_prior_cond_resnet.fullmatch(original_key): - regex_match = re_prior_cond_resnet.match(original_key) - groups = regex_match.groups() - block_index = int(groups[1]) * 2 + int(groups[2]) - 2 - conv_index = {"1": 1, "3": 2}[groups[-2]] - prefix = f"conditioner_blocks.upsampler.upsample_block.{block_index}." 
- resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}" - re_new_key = prefix + resnet_block - key = re_prior_cond_resnet.sub(re_new_key, original_key) - - elif re_prior_cond_proj_in.fullmatch(original_key): - regex_match = re_prior_cond_proj_in.match(original_key) - groups = regex_match.groups() - re_new_key = f"conditioner_blocks.upsampler.proj_in.{groups[-1]}" - key = re_prior_cond_proj_in.sub(re_new_key, original_key) - - # keep original key - else: - key = original_key - - key = replace_key(key) - - if f"{key_prefix}.{key}" not in model_state_dict or key is None: - print(f"failed converting {original_key} to {key}, does not match") - - # handle missmatched shape - elif value.shape != model_state_dict[f"{key_prefix}.{key}"].shape: - val = model_state_dict[f"{key_prefix}.{key}"] - print(f"{original_key}-> {key} : \nshape {val.shape} and { value.shape}, do not match") - key = original_key - - mapping[key] = original_key - new_dict[key] = value - - return new_dict - - -@torch.no_grad() -def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None): - """ - Copy/paste/tweak model's weights to our Jukebox structure. - """ - for file in MODEL_MAPPING[model_name]: - if not os.path.isfile(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}"): - r = requests.get(f"{PREFIX}{file}", allow_redirects=True) - os.makedirs(f"{pytorch_dump_folder_path}/", exist_ok=True) - open(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}", "wb").write(r.content) - - model_to_convert = MODEL_MAPPING[model_name.split("/")[-1]] - - config = JukeboxConfig.from_pretrained(model_name) - model = JukeboxModel(config) - - weight_dict = [] - mapping = {} - for i, dict_name in enumerate(model_to_convert): - old_dic = torch.load(f"{pytorch_dump_folder_path}/{dict_name.split('/')[-1]}")["model"] - - new_dic = {} - for k in old_dic.keys(): - if k.endswith(".b"): - new_dic[k.replace("b", "bias")] = old_dic[k] - elif k.endswith(".w"): - new_dic[k.replace("w", "weight")] = old_dic[k] - elif "level_2" not in dict_name and "cond.model." 
in k: - new_dic[k.replace(".blocks.", ".model.")] = old_dic[k] - else: - new_dic[k] = old_dic[k] - - key_prefix = "vqvae" if i == 0 else f"priors.{3 - i}" - new_dic = fix_jukebox_keys(new_dic, model.state_dict(), key_prefix, mapping) - weight_dict.append(new_dic) - - vqvae_state_dict = weight_dict.pop(0) - model.vqvae.load_state_dict(vqvae_state_dict) - for i in range(len(weight_dict)): - model.priors[i].load_state_dict(weight_dict[2 - i]) - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - with open(f"{pytorch_dump_folder_path}/mapping.json", "w") as txtfile: - json.dump(mapping, txtfile) - - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - - return weight_dict - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="jukebox-5b-lyrics", - type=str, - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="jukebox-5b-lyrics-converted", - type=str, - help="Path to the output PyTorch model directory.", - ) - args = parser.parse_args() - convert_openai_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 1f791dab2404..000000000000 --- a/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,292 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Convert Mega pretrained checkpoint. Built to convert the Masked LM checkpoint located at -https://huggingface.co/mnaylor/mega-wikitext-103 - -Requirements: - - clone the Mega repo and install fairseq from there - 1. git clone https://github.com/facebookresearch/mega.git - 2. cd mega && pip install -e - - clone the pretrained weights for the original implementation from the hugging face repo - * use this location as the path for pretrained weights -""" - -import argparse - -# utilities to import the model weights and config file -import os -import pickle as pkl - -# PyTorch + new model classes -import torch -from torch import nn - -from transformers import AutoTokenizer, MegaConfig, MegaForMaskedLM - - -# import the EncoderLayer class used to pretrain -# !! NOTE !! 
this requires the version of fairseq that is built when you install the Mega source -try: - from fairseq.modules.mega_layer import MegaEncoderLayer -except ImportError: - raise ImportError("You need to install the version of fairseq from the Mega repo!") - - -# define the wrapper classes used to train the MLM (see colab notebook below) -# https://colab.research.google.com/drive/1qfUO6o5HRdxBblWlw058HVyvaEPhPpH8?usp=sharing -# MegaLM outputs hidden states -class MegaLM(nn.Module): - "The base class for our Mega encoder - given input IDs, embed text and return encoder output" - - def __init__(self, mega_args, depth, vocab_size): - super().__init__() - self.mega_args = mega_args - self.embedding_layer = nn.Embedding(vocab_size, self.mega_args.encoder_embed_dim) - self.encoders = nn.ModuleList([MegaEncoderLayer(self.mega_args) for _ in range(depth)]) - self.depth = depth - - def forward(self, input_ids, attention_mask, batch_first=True, ignore_mask_value=0): - """ - Code for a forward pass - expects input_ids and attention_mask to come from a Hugging Face tokenizer as PyTorch - tensors, and returns a tensor of size (batch, n_classes) containing classification logits - - Other options: - - batch_first: boolean indicating whether the batch dimension is first in input_ids (default: True, which - aligns with the HF tokenizer behavior) - - ignore_mask_value: the value in attention_mask that identifies tokens that should be ignored (default: 0, - which aligns with HF tokenizer) - """ - - # Mega expects embeddings to be (time, batch, embedding size), but - # Hugging Face returns tokens as (batch, time) - if batch_first: - input_ids = input_ids.T - - # to make things more confusing, Mega expects the attention mask to - # be (batch, time), but with values of 0 (normal token) and 1 (ignore token) - # which is the opposite of what HF returns - if ignore_mask_value == 0: - attention_mask = 1 - attention_mask - - # get token embeddings from IDs - embeds = self.embedding_layer(input_ids) - - # pass through the Mega layers - # input is (time, batch, encoder dim) and output is the same - for encoder in self.encoders: - embeds = encoder(embeds, attention_mask) - - # return according to the shape specified - if batch_first: - # (T, B, H) --> (B, T, H) - return torch.transpose(embeds, 0, 1) - else: - return embeds - - -# renamed from MegaForMaskedLM to avoid confusion with new module -class OriginalMegaForMaskedLM(nn.Module): - "A wrapper class for doing masked language modeling with Mega" - - def __init__(self, mega_args, depth, vocab_size): - super().__init__() - self.mega = MegaLM(mega_args, depth, vocab_size) - self.mlm_head = nn.Linear(mega_args.encoder_embed_dim, vocab_size) - self.dropout = nn.Dropout(p=0.1) - - def forward(self, input_ids, attention_mask, batch_first=True, ignore_mask_value=0): - """ - Perform a forward pass through the Mega encoder and the masked LM head. Returns logits for each vocabulary - entry. 
- - If `batch_first` (default to align with Hugging Face tokenizer behavior), output will have the shape (Batch - size, Sequence length, Vocab size); otherwise (S, B, V) - """ - encoder_output = self.mega(input_ids, attention_mask, batch_first, ignore_mask_value) - return self.mlm_head(self.dropout(encoder_output)) - - -# code to convert the checkpoint located in the user-specified location -def convert_checkpoint_to_huggingface(pretrained_checkpoint_path, output_path, includes_tokenizer): - with open(os.path.join(pretrained_checkpoint_path, "model_args.pkl"), "rb") as f: - mega_original_args = pkl.load(f) - - # load the original encoder - original_mlm = OriginalMegaForMaskedLM(**mega_original_args).eval() - - # load its weights - print( - "Original Mega encoder:", - original_mlm.mega.load_state_dict( - torch.load(os.path.join(pretrained_checkpoint_path, "encoder_weights.pt"), map_location="cpu") - ), - ) - print( - "Original Mega MLM layer:", - original_mlm.mlm_head.load_state_dict( - torch.load(os.path.join(pretrained_checkpoint_path, "mlm_head_weights.pt"), map_location="cpu") - ), - ) - - # create a new config from the old one - hf_config = MegaConfig( - num_hidden_layers=mega_original_args["depth"], - vocab_size=mega_original_args["vocab_size"], - hidden_size=mega_original_args["mega_args"].encoder_embed_dim, - shared_representation_size=mega_original_args["mega_args"].encoder_z_dim, - intermediate_size=mega_original_args["mega_args"].encoder_hidden_dim, - ema_projection_size=mega_original_args["mega_args"].encoder_n_dim, - dropout_prob=mega_original_args["mega_args"].dropout, - attention_probs_dropout_prob=mega_original_args["mega_args"].attention_dropout, - hidden_dropout_prob=mega_original_args["mega_args"].hidden_dropout, - activation=mega_original_args["mega_args"].activation_fn, - attention_activation=mega_original_args["mega_args"].attention_activation_fn, - bidirectional=mega_original_args["mega_args"].bidirectional, - use_chunking=mega_original_args["mega_args"].encoder_chunk_size > 0, - chunk_size=mega_original_args["mega_args"].encoder_chunk_size, - truncation=mega_original_args["mega_args"].truncation_length, - normalization_type=mega_original_args["mega_args"].normalization_type, - normalize_before_mega=True, - norm_affine=True, - use_feature_dropout=mega_original_args["mega_args"].feature_dropout, - relative_positional_bias=mega_original_args["mega_args"].rel_pos_bias, - max_positions=mega_original_args["mega_args"].max_source_positions, - nffn_hidden_size=mega_original_args["mega_args"].encoder_ffn_embed_dim, - normalize_before_ffn=mega_original_args["mega_args"].normalize_before, - # new arguments added for HF implementation - nffn_activation_dropout_prob=0.0, - add_token_type_embeddings=False, - add_lm_hidden_dense_layer=False, - ) - - hf_mlm = MegaForMaskedLM(hf_config).eval() - - # the originl checkpoint just uses nn.Embedding for the word embeddings - # we use a wrapper module for embeddings to add support for positional embeddings - hf_mlm.mega.embedding_layer.word_embeddings.weight = original_mlm.mega.embedding_layer.weight - - # modify the state dictionary of the original checkpoint to account for naming issues in the Hugging Face - # ecosystem -- any names containing "beta" or "gamma" aren't safe to use and are renamed upon _load_pretrained, - # also renaming previously confusing parameter names - original_state_dict = original_mlm.mega.encoders.state_dict() - updated_keys = {} - for module_name in original_state_dict.keys(): - new_module_name = None - # have 
to handle gamma, beta, and alpha differently due to their use - # in multiple modules within the original repository; - # beta is used in EMA, MovingAverageGatedAttention, and RotaryRelativePositionalBias, and must be renamed due to flax/tf weights - # the EMA sublayer was renamed from "move" to "ema_gate" for readability, so that is also done here - if "beta" in module_name: - # EMA sub-layers were always called "move" in the original repo - if "move.beta" in module_name: - new_module_name = module_name.replace("move.beta", "ema_gate.ema_expansion_matrix") - elif "mega_layer.beta" in module_name: - new_module_name = module_name.replace("beta", "qk_bias") - else: - new_module_name = module_name.replace("beta", "b_param") - # beta is used in EMA and MovingAverageGatedAttention, and must be renamed due to flax/tf weights - elif "gamma" in module_name: - if "move.gamma" in module_name: - new_module_name = module_name.replace("move.gamma", "ema_gate.kernel_projection_matrix") - elif "mega_layer.gamma" in module_name: - new_module_name = module_name.replace("gamma", "qk_weight") - else: - new_module_name = module_name.replace("gamma", "g_param") - # alpha is used in EMA and positional bias; renaming to improve readability - elif "move.alpha" in module_name: - new_module_name = module_name.replace("move.alpha", "ema_gate.decay_factor") - # delta is only used in EMA; renaming to improve readability - elif "move.delta" in module_name: - new_module_name = module_name.replace("move.delta", "ema_gate.damping_factor") - # omega is only used in EMA; renaming to improve readability - elif "omega" in module_name: - new_module_name = module_name.replace("move.omega", "ema_gate.residual_weight") - - if new_module_name: - updated_keys[module_name] = new_module_name - - if len(updated_keys) != 0: - print(f"Renaming these keys: {updated_keys.keys()}") - else: - print("No need to rename state dict entries") - for old, new in updated_keys.items(): - original_state_dict[new] = original_state_dict.pop(old) - - # now attempt to load the state dictionary with updated names - # note that we now call it `mega.layers` instead of `mega.encoders` due to hugging face style - print("HF Mega encoder:", hf_mlm.mega.layers.load_state_dict(original_state_dict)) - - # load the MLM head weights directly - print( - "HF Mega MLM layer:", - hf_mlm.mlm_head.load_state_dict( - torch.load(os.path.join(pretrained_checkpoint_path, "mlm_head_weights.pt"), map_location="cpu") - ), - ) - - # test on a randomly generated input sequence - input_ids = torch.randint(0, hf_config.vocab_size, size=(4, 256)) - input_mask = torch.ones_like(input_ids) - # mask a few tokens to make sure masking is applied appropriately :) - input_mask[:, -10:] = 0 - - # run forward passes - original_output = original_mlm(input_ids, input_mask, batch_first=True, ignore_mask_value=0) - hf_output = hf_mlm(input_ids, input_mask)[0] - - # print shapes and diff - print(f"original output {original_output.shape}") - print(f"hf output {hf_output.shape}") - print(f"max diff: {(original_output - hf_output).max()}") # 0.0 - success = torch.allclose(original_output, hf_output, atol=1e-3) - - if success: - print("Yay!") - hf_mlm.save_pretrained(output_path) - else: - raise RuntimeError(f"Something's broken :(\nOriginal:\n{original_output}\n\nHF\n{hf_output}\n{hf_mlm}") - - if includes_tokenizer: - print("Transferring tokenizer") - tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint_path) - tokenizer.save_pretrained(output_path) - - -if __name__ == "__main__": - 
parser = argparse.ArgumentParser() - - parser.add_argument( - "--pretrained_checkpoint_path", - default=None, - type=str, - required=True, - help="Point to the directory containing your model weights using the official Mega repo", - ) - - parser.add_argument( - "--output_path", default=None, type=str, required=True, help="Location to save the Hugging Face version" - ) - - parser.add_argument( - "--includes_tokenizer", - action="store_true", - help="Use this flag if there is a Hugging Face tokenizer in the original checkpoint repo", - ) - - args = parser.parse_args() - - convert_checkpoint_to_huggingface(args.pretrained_checkpoint_path, args.output_path, args.includes_tokenizer) diff --git a/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index da7f7806671d..000000000000 --- a/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,70 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""TrajectoryTransformer pytorch checkpoint conversion""" - -import torch -import trajectory.utils as utils - -from transformers import TrajectoryTransformerModel - - -class Parser(utils.Parser): - dataset: str = "halfcheetah-medium-expert-v2" - config: str = "config.offline" - - -def convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch(logbase, dataset, loadpath, epoch, device): - """Converting Sequential blocks to ModuleList""" - - gpt, gpt_epoch = utils.load_model(logbase, dataset, loadpath, epoch=epoch, device=device) - trajectory_transformer = TrajectoryTransformerModel(gpt.config) - - trajectory_transformer.tok_emb.load_state_dict(gpt.tok_emb.state_dict()) - trajectory_transformer.pos_emb = gpt.pos_emb - trajectory_transformer.drop.load_state_dict(gpt.drop.state_dict()) - trajectory_transformer.ln_f.load_state_dict(gpt.ln_f.state_dict()) - trajectory_transformer.head.load_state_dict(gpt.head.state_dict()) - - for i, block in enumerate(gpt.blocks): - trajectory_transformer.blocks[i].ln1.load_state_dict(gpt.blocks[i].ln1.state_dict()) - trajectory_transformer.blocks[i].ln2.load_state_dict(gpt.blocks[i].ln2.state_dict()) - trajectory_transformer.blocks[i].attn.load_state_dict(gpt.blocks[i].attn.state_dict()) - - trajectory_transformer.blocks[i].l1.load_state_dict(gpt.blocks[i].mlp[0].state_dict()) - trajectory_transformer.blocks[i].act.load_state_dict(gpt.blocks[i].mlp[1].state_dict()) - trajectory_transformer.blocks[i].l2.load_state_dict(gpt.blocks[i].mlp[2].state_dict()) - trajectory_transformer.blocks[i].drop.load_state_dict(gpt.blocks[i].mlp[3].state_dict()) - - torch.save(trajectory_transformer.state_dict(), "pytorch_model.bin") - - -if __name__ == "__main__": - """ - To run this script you will need to install the original repository to run the original model. You can find it - here: https://github.com/jannerm/trajectory-transformer From this repository code you can also download the - original pytorch checkpoints. - - Run with the command: - - ```sh - >>> python convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py --dataset - ... --gpt_loadpath - ``` - """ - - args = Parser().parse_args("plan") - convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch( - args.logbase, args.dataset, args.gpt_loadpath, args.gpt_epoch, args.device - ) diff --git a/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index 2c7b687c4d98..000000000000 --- a/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,121 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert Transformer XL checkpoint and datasets.""" - -import argparse -import os -import pickle -import sys - -import torch - -from transformers import TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl -from transformers.models.deprecated.transfo_xl import tokenization_transfo_xl as data_utils -from transformers.models.deprecated.transfo_xl.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES -from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging - - -logging.set_verbosity_info() - -# We do this to be able to load python 2 datasets pickles -# See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 -data_utils.Vocab = data_utils.TransfoXLTokenizer -data_utils.Corpus = data_utils.TransfoXLCorpus -sys.modules["data_utils"] = data_utils -sys.modules["vocabulary"] = data_utils - - -def convert_transfo_xl_checkpoint_to_pytorch( - tf_checkpoint_path, transfo_xl_config_file, pytorch_dump_folder_path, transfo_xl_dataset_file -): - if transfo_xl_dataset_file: - # Convert a pre-processed corpus (see original TensorFlow repo) - with open(transfo_xl_dataset_file, "rb") as fp: - corpus = pickle.load(fp, encoding="latin1") - # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) - pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"] - print(f"Save vocabulary to {pytorch_vocab_dump_path}") - corpus_vocab_dict = corpus.vocab.__dict__ - torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) - - corpus_dict_no_vocab = corpus.__dict__ - corpus_dict_no_vocab.pop("vocab", None) - pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME - print(f"Save dataset to {pytorch_dataset_dump_path}") - torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) - - if tf_checkpoint_path: - # Convert a pre-trained TensorFlow model - config_path = os.path.abspath(transfo_xl_config_file) - tf_path = os.path.abspath(tf_checkpoint_path) - - print(f"Converting Transformer XL checkpoint from {tf_path} with config at {config_path}.") - # Initialise PyTorch model - if transfo_xl_config_file == "": - config = TransfoXLConfig() - else: - config = TransfoXLConfig.from_json_file(transfo_xl_config_file) - print(f"Building PyTorch model from configuration: {config}") - model = TransfoXLLMHeadModel(config) - - model = load_tf_weights_in_transfo_xl(model, config, tf_path) - # Save pytorch-model - pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) - pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) - print(f"Save PyTorch model to {os.path.abspath(pytorch_weights_dump_path)}") - torch.save(model.state_dict(), pytorch_weights_dump_path) - print(f"Save configuration file to {os.path.abspath(pytorch_config_dump_path)}") - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the folder to store the PyTorch model or dataset/vocab.", - ) - parser.add_argument( - "--tf_checkpoint_path", - default="", - type=str, - help="An optional path to a TensorFlow checkpoint path to be converted.", - ) - parser.add_argument( - "--transfo_xl_config_file", - default="", - type=str, - help=( - "An optional config json file corresponding to the pre-trained BERT 
model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--transfo_xl_dataset_file", - default="", - type=str, - help="An optional dataset file to be converted in a vocabulary.\n" - "Given the files are in the pickle format, please be wary of passing it files you trust.", - ) - args = parser.parse_args() - convert_transfo_xl_checkpoint_to_pytorch( - args.tf_checkpoint_path, - args.transfo_xl_config_file, - args.pytorch_dump_folder_path, - args.transfo_xl_dataset_file, - ) diff --git a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py b/src/transformers/models/deprecated/van/convert_van_to_pytorch.py deleted file mode 100644 index 51466e77bae0..000000000000 --- a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py +++ /dev/null @@ -1,290 +0,0 @@ -# coding=utf-8 -# Copyright 2022 BNRist (Tsinghua University), TKLNDST (Nankai University) and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert VAN checkpoints from the original repository. - -URL: https://github.com/Visual-Attention-Network/VAN-Classification""" - -import argparse -import json -import sys -from dataclasses import dataclass, field -from functools import partial -from pathlib import Path -from typing import List - -import torch -import torch.nn as nn -from huggingface_hub import cached_download, hf_hub_download -from torch import Tensor - -from transformers import AutoImageProcessor, VanConfig, VanForImageClassification -from transformers.models.deprecated.van.modeling_van import VanLayerScaling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -@dataclass -class Tracker: - module: nn.Module - traced: List[nn.Module] = field(default_factory=list) - handles: list = field(default_factory=list) - - def _forward_hook(self, m, inputs: Tensor, outputs: Tensor): - has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, nn.Conv2d) or isinstance(m, nn.BatchNorm2d) - if has_not_submodules: - if not isinstance(m, VanLayerScaling): - self.traced.append(m) - - def __call__(self, x: Tensor): - for m in self.module.modules(): - self.handles.append(m.register_forward_hook(self._forward_hook)) - self.module(x) - [x.remove() for x in self.handles] - return self - - @property - def parametrized(self): - # check the len of the state_dict keys to see if we have learnable params - return list(filter(lambda x: len(list(x.state_dict().keys())) > 0, self.traced)) - - -@dataclass -class ModuleTransfer: - src: nn.Module - dest: nn.Module - verbose: int = 0 - src_skip: List = field(default_factory=list) - dest_skip: List = field(default_factory=list) - - def __call__(self, x: Tensor): - """ - Transfer the weights of `self.src` to `self.dest` by performing a forward pass using `x` as input. Under the - hood we tracked all the operations in both modules. 
- """ - dest_traced = Tracker(self.dest)(x).parametrized - src_traced = Tracker(self.src)(x).parametrized - - src_traced = list(filter(lambda x: type(x) not in self.src_skip, src_traced)) - dest_traced = list(filter(lambda x: type(x) not in self.dest_skip, dest_traced)) - - if len(dest_traced) != len(src_traced): - raise Exception( - f"Numbers of operations are different. Source module has {len(src_traced)} operations while" - f" destination module has {len(dest_traced)}." - ) - - for dest_m, src_m in zip(dest_traced, src_traced): - dest_m.load_state_dict(src_m.state_dict()) - if self.verbose == 1: - print(f"Transfered from={src_m} to={dest_m}") - - -def copy_parameters(from_model: nn.Module, our_model: nn.Module) -> nn.Module: - # nn.Parameter cannot be tracked by the Tracker, thus we need to manually convert them - from_state_dict = from_model.state_dict() - our_state_dict = our_model.state_dict() - config = our_model.config - all_keys = [] - for stage_idx in range(len(config.hidden_sizes)): - for block_id in range(config.depths[stage_idx]): - from_key = f"block{stage_idx + 1}.{block_id}.layer_scale_1" - to_key = f"van.encoder.stages.{stage_idx}.layers.{block_id}.attention_scaling.weight" - - all_keys.append((from_key, to_key)) - from_key = f"block{stage_idx + 1}.{block_id}.layer_scale_2" - to_key = f"van.encoder.stages.{stage_idx}.layers.{block_id}.mlp_scaling.weight" - - all_keys.append((from_key, to_key)) - - for from_key, to_key in all_keys: - our_state_dict[to_key] = from_state_dict.pop(from_key) - - our_model.load_state_dict(our_state_dict) - return our_model - - -def convert_weight_and_push( - name: str, - config: VanConfig, - checkpoint: str, - from_model: nn.Module, - save_directory: Path, - push_to_hub: bool = True, -): - print(f"Downloading weights for {name}...") - checkpoint_path = cached_download(checkpoint) - print(f"Converting {name}...") - from_state_dict = torch.load(checkpoint_path)["state_dict"] - from_model.load_state_dict(from_state_dict) - from_model.eval() - with torch.no_grad(): - our_model = VanForImageClassification(config).eval() - module_transfer = ModuleTransfer(src=from_model, dest=our_model) - x = torch.randn((1, 3, 224, 224)) - module_transfer(x) - our_model = copy_parameters(from_model, our_model) - - if not torch.allclose(from_model(x), our_model(x).logits): - raise ValueError("The model logits don't match the original one.") - - checkpoint_name = name - print(checkpoint_name) - - if push_to_hub: - our_model.push_to_hub( - repo_path_or_name=save_directory / checkpoint_name, - commit_message="Add model", - use_temp_dir=True, - ) - - # we can use the convnext one - image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k") - image_processor.push_to_hub( - repo_path_or_name=save_directory / checkpoint_name, - commit_message="Add image processor", - use_temp_dir=True, - ) - - print(f"Pushed {checkpoint_name}") - - -def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True): - filename = "imagenet-1k-id2label.json" - num_labels = 1000 - - repo_id = "huggingface/label-files" - num_labels = num_labels - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - - id2label = id2label - label2id = {v: k for k, v in id2label.items()} - - ImageNetPreTrainedConfig = partial(VanConfig, num_labels=num_labels, id2label=id2label, label2id=label2id) - - names_to_config = { - "van-tiny": ImageNetPreTrainedConfig( - 
hidden_sizes=[32, 64, 160, 256], - depths=[3, 3, 5, 2], - mlp_ratios=[8, 8, 4, 4], - ), - "van-small": ImageNetPreTrainedConfig( - hidden_sizes=[64, 128, 320, 512], - depths=[2, 2, 4, 2], - mlp_ratios=[8, 8, 4, 4], - ), - "van-base": ImageNetPreTrainedConfig( - hidden_sizes=[64, 128, 320, 512], - depths=[3, 3, 12, 3], - mlp_ratios=[8, 8, 4, 4], - ), - "van-large": ImageNetPreTrainedConfig( - hidden_sizes=[64, 128, 320, 512], - depths=[3, 5, 27, 3], - mlp_ratios=[8, 8, 4, 4], - ), - } - - names_to_original_models = { - "van-tiny": van_tiny, - "van-small": van_small, - "van-base": van_base, - "van-large": van_large, - } - - names_to_original_checkpoints = { - "van-tiny": ( - "https://huggingface.co/Visual-Attention-Network/VAN-Tiny-original/resolve/main/van_tiny_754.pth.tar" - ), - "van-small": ( - "https://huggingface.co/Visual-Attention-Network/VAN-Small-original/resolve/main/van_small_811.pth.tar" - ), - "van-base": ( - "https://huggingface.co/Visual-Attention-Network/VAN-Base-original/resolve/main/van_base_828.pth.tar" - ), - "van-large": ( - "https://huggingface.co/Visual-Attention-Network/VAN-Large-original/resolve/main/van_large_839.pth.tar" - ), - } - - if model_name: - convert_weight_and_push( - model_name, - names_to_config[model_name], - checkpoint=names_to_original_checkpoints[model_name], - from_model=names_to_original_models[model_name](), - save_directory=save_directory, - push_to_hub=push_to_hub, - ) - else: - for model_name, config in names_to_config.items(): - convert_weight_and_push( - model_name, - config, - checkpoint=names_to_original_checkpoints[model_name], - from_model=names_to_original_models[model_name](), - save_directory=save_directory, - push_to_hub=push_to_hub, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model-name", - default=None, - type=str, - help=( - "The name of the model you wish to convert, it must be one of the supported resnet* architecture," - " currently: van-tiny/small/base/large. If `None`, all of them will the converted." - ), - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=Path, - required=True, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--van_dir", - required=True, - type=Path, - help=( - "A path to VAN's original implementation directory. You can download from here:" - " https://github.com/Visual-Attention-Network/VAN-Classification" - ), - ) - parser.add_argument( - "--push_to_hub", - default=True, - type=bool, - required=False, - help="If True, push model and image processor to the hub.", - ) - - args = parser.parse_args() - pytorch_dump_folder_path: Path = args.pytorch_dump_folder_path - pytorch_dump_folder_path.mkdir(exist_ok=True, parents=True) - van_dir = args.van_dir - # append the path to the parents to maskformer dir - sys.path.append(str(van_dir.parent)) - from van.models.van import van_base, van_large, van_small, van_tiny - - convert_weights_and_push(pytorch_dump_folder_path, args.model_name, args.push_to_hub) diff --git a/src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py deleted file mode 100644 index 1d717d74c961..000000000000 --- a/src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ /dev/null @@ -1,282 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ViT hybrid checkpoints from the timm library.""" - -import argparse -import json -from pathlib import Path - -import requests -import timm -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from timm.data import resolve_data_config -from timm.data.transforms_factory import create_transform - -from transformers import ( - BitConfig, - ViTHybridConfig, - ViTHybridForImageClassification, - ViTHybridImageProcessor, - ViTHybridModel, -) -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, base_model=False): - rename_keys = [] - - # fmt: off - # stem: - rename_keys.append(("cls_token", "vit.embeddings.cls_token")) - rename_keys.append(("pos_embed", "vit.embeddings.position_embeddings")) - - rename_keys.append(("patch_embed.proj.weight", "vit.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "vit.embeddings.patch_embeddings.projection.bias")) - - # backbone - rename_keys.append(("patch_embed.backbone.stem.conv.weight", "vit.embeddings.patch_embeddings.backbone.bit.embedder.convolution.weight")) - rename_keys.append(("patch_embed.backbone.stem.norm.weight", "vit.embeddings.patch_embeddings.backbone.bit.embedder.norm.weight")) - rename_keys.append(("patch_embed.backbone.stem.norm.bias", "vit.embeddings.patch_embeddings.backbone.bit.embedder.norm.bias")) - - for stage_idx in range(len(config.backbone_config.depths)): - for layer_idx in range(config.backbone_config.depths[stage_idx]): - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv1.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.conv1.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm1.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm1.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm1.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm1.bias")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv2.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.conv2.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm2.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm2.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm2.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm2.bias")) - 
rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv3.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.conv3.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm3.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm3.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm3.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm3.bias")) - - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.conv.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.0.downsample.conv.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.norm.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.0.downsample.norm.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.norm.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.0.downsample.norm.bias")) - - # transformer encoder - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"vit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"vit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"vit.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"vit.encoder.layer.{i}.attention.output.dense.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"vit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"vit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"vit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"vit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"vit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"vit.encoder.layer.{i}.output.dense.bias")) - - if base_model: - # layernorm + pooler - rename_keys.extend( - [ - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ("pre_logits.fc.weight", "pooler.dense.weight"), - ("pre_logits.fc.bias", "pooler.dense.bias"), - ] - ) - - # if just the base model, we should remove "vit" from all keys that start with "vit" - rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("vit") else pair for pair in rename_keys] - else: - # layernorm + classification head - rename_keys.extend( - [ - ("norm.weight", "vit.layernorm.weight"), - ("norm.bias", "vit.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - # fmt: on - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, base_model=False): - for i in range(config.num_hidden_layers): - if base_model: - prefix = "" - else: - prefix = "vit." 
- # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -def remove_classification_head_(state_dict): - ignore_keys = ["head.weight", "head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our ViT structure. - """ - - # define default ViT hybrid configuration - backbone_config = BitConfig( - global_padding="same", - layer_type="bottleneck", - depths=(3, 4, 9), - out_features=["stage3"], - embedding_dynamic_padding=True, - ) - config = ViTHybridConfig(backbone_config=backbone_config, image_size=384, num_labels=1000) - base_model = False - - # load original model from timm - timm_model = timm.create_model(vit_name, pretrained=True) - timm_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = timm_model.state_dict() - if base_model: - remove_classification_head_(state_dict) - rename_keys = create_rename_keys(config, base_model) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, base_model) - - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load HuggingFace model - if vit_name[-5:] == "in21k": - model = ViTHybridModel(config).eval() - else: - model = ViTHybridForImageClassification(config).eval() - model.load_state_dict(state_dict) - - # create image processor - transform = create_transform(**resolve_data_config({}, model=timm_model)) - timm_transforms = transform.transforms - - pillow_resamplings = { - "bilinear": PILImageResampling.BILINEAR, - "bicubic": PILImageResampling.BICUBIC, - "nearest": PILImageResampling.NEAREST, - } - - processor = ViTHybridImageProcessor( - do_resize=True, - size={"shortest_edge": timm_transforms[0].size}, - resample=pillow_resamplings[timm_transforms[0].interpolation.value], - do_center_crop=True, - crop_size={"height": timm_transforms[1].size[0], "width": timm_transforms[1].size[1]}, - 
do_normalize=True, - image_mean=timm_transforms[-1].mean.tolist(), - image_std=timm_transforms[-1].std.tolist(), - ) - - image = prepare_img() - timm_pixel_values = transform(image).unsqueeze(0) - pixel_values = processor(image, return_tensors="pt").pixel_values - - # verify pixel values - assert torch.allclose(timm_pixel_values, pixel_values) - - # verify logits - with torch.no_grad(): - outputs = model(pixel_values) - logits = outputs.logits - - print("Predicted class:", logits.argmax(-1).item()) - if base_model: - timm_pooled_output = timm_model.forward_features(pixel_values) - assert timm_pooled_output.shape == outputs.pooler_output.shape - assert torch.allclose(timm_pooled_output, outputs.pooler_output, atol=1e-3) - else: - timm_logits = timm_model(pixel_values) - assert timm_logits.shape == outputs.logits.shape - assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {vit_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and processor to the hub {vit_name}") - model.push_to_hub(f"ybelkada/{vit_name}") - processor.push_to_hub(f"ybelkada/{vit_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--vit_name", - default="vit_base_r50_s16_384", - type=str, - help="Name of the hybrid ViT timm model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether to upload the model to the HuggingFace hub." - ) - - args = parser.parse_args() - convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py b/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py deleted file mode 100644 index 5c6da13ae885..000000000000 --- a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py +++ /dev/null @@ -1,368 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Depth Anything checkpoints from the original repository. 
URL: -https://github.com/LiheYoung/Depth-Anything""" - -import argparse -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation, Dinov2Config, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(model_name): - if "small" in model_name: - out_indices = [3, 6, 9, 12] if "v2" in model_name else [9, 10, 11, 12] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-small", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 64 - neck_hidden_sizes = [48, 96, 192, 384] - elif "base" in model_name: - out_indices = [3, 6, 9, 12] if "v2" in model_name else [9, 10, 11, 12] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-base", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 128 - neck_hidden_sizes = [96, 192, 384, 768] - elif "large" in model_name: - out_indices = [5, 12, 18, 24] if "v2" in model_name else [21, 22, 23, 24] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-large", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 256 - neck_hidden_sizes = [256, 512, 1024, 1024] - else: - raise NotImplementedError(f"Model not supported: {model_name}") - - if "metric" in model_name: - depth_estimation_type = "metric" - max_depth = 20 if "indoor" in model_name else 80 - else: - depth_estimation_type = "relative" - max_depth = None - - config = DepthAnythingConfig( - reassemble_hidden_size=backbone_config.hidden_size, - patch_size=backbone_config.patch_size, - backbone_config=backbone_config, - fusion_hidden_size=fusion_hidden_size, - neck_hidden_sizes=neck_hidden_sizes, - depth_estimation_type=depth_estimation_type, - max_depth=max_depth, - ) - - return config - - -def create_rename_keys(config): - rename_keys = [] - - # fmt: off - # stem - rename_keys.append(("pretrained.cls_token", "backbone.embeddings.cls_token")) - rename_keys.append(("pretrained.mask_token", "backbone.embeddings.mask_token")) - rename_keys.append(("pretrained.pos_embed", "backbone.embeddings.position_embeddings")) - rename_keys.append(("pretrained.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("pretrained.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - - # Transfomer encoder - for i in range(config.backbone_config.num_hidden_layers): - rename_keys.append((f"pretrained.blocks.{i}.ls1.gamma", f"backbone.encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"pretrained.blocks.{i}.ls2.gamma", f"backbone.encoder.layer.{i}.layer_scale2.lambda1")) - rename_keys.append((f"pretrained.blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"pretrained.blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"pretrained.blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"pretrained.blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.norm2.bias")) - rename_keys.append((f"pretrained.blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"pretrained.blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.mlp.fc1.bias")) - 
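The (src, dest) pairs being accumulated here are applied later by popping each old key and re-inserting its tensor under the new name. A minimal, self-contained sketch of that pattern, using made-up keys and shapes rather than a real checkpoint:

# Sketch of the rename-pair pattern used by these conversion scripts.
# The state dict and key names below are illustrative placeholders only.
import torch

state_dict = {
    "pretrained.cls_token": torch.zeros(1, 1, 384),
    "pretrained.norm.weight": torch.ones(384),
}
rename_pairs = [
    ("pretrained.cls_token", "backbone.embeddings.cls_token"),
    ("pretrained.norm.weight", "backbone.layernorm.weight"),
]

for old, new in rename_pairs:
    state_dict[new] = state_dict.pop(old)  # move the tensor under its new name

print(sorted(state_dict.keys()))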
rename_keys.append((f"pretrained.blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"pretrained.blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.mlp.fc2.bias")) - rename_keys.append((f"pretrained.blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"pretrained.blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias")) - - # Head - rename_keys.append(("pretrained.norm.weight", "backbone.layernorm.weight")) - rename_keys.append(("pretrained.norm.bias", "backbone.layernorm.bias")) - - # activation postprocessing (readout projections + resize blocks) - # Depth Anything does not use CLS token => readout_projects not required - - for i in range(4): - rename_keys.append((f"depth_head.projects.{i}.weight", f"neck.reassemble_stage.layers.{i}.projection.weight")) - rename_keys.append((f"depth_head.projects.{i}.bias", f"neck.reassemble_stage.layers.{i}.projection.bias")) - - if i != 2: - rename_keys.append((f"depth_head.resize_layers.{i}.weight", f"neck.reassemble_stage.layers.{i}.resize.weight")) - rename_keys.append((f"depth_head.resize_layers.{i}.bias", f"neck.reassemble_stage.layers.{i}.resize.bias")) - - # refinenet (tricky here) - mapping = {1:3, 2:2, 3:1, 4:0} - - for i in range(1, 5): - j = mapping[i] - rename_keys.append((f"depth_head.scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias")) - - # scratch convolutions - for i in range(4): - rename_keys.append((f"depth_head.scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight")) - - # head - rename_keys.append(("depth_head.scratch.output_conv1.weight", "head.conv1.weight")) - rename_keys.append(("depth_head.scratch.output_conv1.bias", "head.conv1.bias")) - rename_keys.append(("depth_head.scratch.output_conv2.0.weight", "head.conv2.weight")) - rename_keys.append(("depth_head.scratch.output_conv2.0.bias", "head.conv2.bias")) - rename_keys.append(("depth_head.scratch.output_conv2.2.weight", "head.conv3.weight")) - rename_keys.append(("depth_head.scratch.output_conv2.2.bias", "head.conv3.bias")) - - return rename_keys - - -# we split up the matrix of each 
encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - hidden_size = config.backbone_config.hidden_size - for i in range(config.backbone_config.num_hidden_layers): - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"pretrained.blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"pretrained.blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[:hidden_size] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - hidden_size : hidden_size * 2 - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-hidden_size:] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -name_to_checkpoint = { - "depth-anything-small": "pytorch_model.bin", - "depth-anything-base": "pytorch_model.bin", - "depth-anything-large": "pytorch_model.bin", - "depth-anything-v2-small": "depth_anything_v2_vits.pth", - "depth-anything-v2-base": "depth_anything_v2_vitb.pth", - "depth-anything-v2-large": "depth_anything_v2_vitl.pth", - "depth-anything-v2-metric-indoor-small": "depth_anything_v2_metric_hypersim_vits.pth", - "depth-anything-v2-metric-indoor-base": "depth_anything_v2_metric_hypersim_vitb.pth", - "depth-anything-v2-metric-indoor-large": "depth_anything_v2_metric_hypersim_vitl.pth", - "depth-anything-v2-metric-outdoor-small": "depth_anything_v2_metric_vkitti_vits.pth", - "depth-anything-v2-metric-outdoor-base": "depth_anything_v2_metric_vkitti_vitb.pth", - "depth-anything-v2-metric-outdoor-large": "depth_anything_v2_metric_vkitti_vitl.pth", - # v2-giant pending -} - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits): - """ - Copy/paste/tweak model's weights to our DPT structure. 
- """ - - # define DPT configuration - config = get_dpt_config(model_name) - - model_name_to_repo = { - "depth-anything-small": "LiheYoung/depth_anything_vits14", - "depth-anything-base": "LiheYoung/depth_anything_vitb14", - "depth-anything-large": "LiheYoung/depth_anything_vitl14", - "depth-anything-v2-small": "depth-anything/Depth-Anything-V2-Small", - "depth-anything-v2-base": "depth-anything/Depth-Anything-V2-Base", - "depth-anything-v2-large": "depth-anything/Depth-Anything-V2-Large", - "depth-anything-v2-metric-indoor-small": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Small", - "depth-anything-v2-metric-indoor-base": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Base", - "depth-anything-v2-metric-indoor-large": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Large", - "depth-anything-v2-metric-outdoor-small": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Small", - "depth-anything-v2-metric-outdoor-base": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Base", - "depth-anything-v2-metric-outdoor-large": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Large", - } - - # load original state_dict - repo_id = model_name_to_repo[model_name] - filename = name_to_checkpoint[model_name] - filepath = hf_hub_download( - repo_id=repo_id, - filename=f"{filename}", - ) - - state_dict = torch.load(filepath, map_location="cpu") - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - # read in qkv matrices - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = DepthAnythingForDepthEstimation(config) - model.load_state_dict(state_dict) - model.eval() - - processor = DPTImageProcessor( - do_resize=True, - size={"height": 518, "width": 518}, - ensure_multiple_of=14, - keep_aspect_ratio=True, - do_rescale=True, - do_normalize=True, - image_mean=[0.485, 0.456, 0.406], - image_std=[0.229, 0.224, 0.225], - ) - - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - pixel_values = processor(image, return_tensors="pt").pixel_values - - # Verify forward pass - with torch.no_grad(): - outputs = model(pixel_values) - predicted_depth = outputs.predicted_depth - - print("Shape of predicted depth:", predicted_depth.shape) - print("First values:", predicted_depth[0, :3, :3]) - - # assert logits - if verify_logits: - expected_shape = torch.Size([1, 518, 686]) - if model_name == "depth-anything-small": - expected_slice = torch.tensor( - [[8.8204, 8.6468, 8.6195], [8.3313, 8.6027, 8.7526], [8.6526, 8.6866, 8.7453]], - ) - elif model_name == "depth-anything-base": - expected_slice = torch.tensor( - [[26.3997, 26.3004, 26.3928], [26.2260, 26.2092, 26.3427], [26.0719, 26.0483, 26.1254]], - ) - elif model_name == "depth-anything-large": - expected_slice = torch.tensor( - [[87.9968, 87.7493, 88.2704], [87.1927, 87.6611, 87.3640], [86.7789, 86.9469, 86.7991]] - ) - elif model_name == "depth-anything-v2-small": - expected_slice = torch.tensor( - [[2.6751, 2.6211, 2.6571], [2.5820, 2.6138, 2.6271], [2.6160, 2.6141, 2.6306]] - ) - elif model_name == "depth-anything-v2-base": - expected_slice = torch.tensor( - [[4.3576, 4.3723, 4.3908], [4.3231, 4.3146, 4.3611], [4.3016, 4.3170, 4.3121]] - ) - elif model_name == "depth-anything-v2-large": - expected_slice = torch.tensor( - [[162.2751, 161.8504, 162.8788], [160.3138, 160.8050, 161.9835], [159.3812, 159.9884, 160.0768]] - ) - elif model_name == "depth-anything-v2-metric-indoor-small": - expected_slice = 
torch.tensor( - [[1.3349, 1.2946, 1.2801], [1.2793, 1.2337, 1.2899], [1.2629, 1.2218, 1.2476]] - ) - elif model_name == "depth-anything-v2-metric-indoor-base": - expected_slice = torch.tensor( - [[1.4601, 1.3824, 1.4904], [1.5031, 1.4349, 1.4274], [1.4570, 1.4578, 1.4200]] - ) - elif model_name == "depth-anything-v2-metric-indoor-large": - expected_slice = torch.tensor( - [[1.5040, 1.5019, 1.5218], [1.5087, 1.5195, 1.5149], [1.5437, 1.5128, 1.5252]] - ) - elif model_name == "depth-anything-v2-metric-outdoor-small": - expected_slice = torch.tensor( - [[9.5804, 8.0339, 7.7386], [7.9890, 7.2464, 7.7149], [7.7021, 7.2330, 7.3304]] - ) - elif model_name == "depth-anything-v2-metric-outdoor-base": - expected_slice = torch.tensor( - [[10.2916, 9.0933, 8.8622], [9.1964, 9.3393, 9.0644], [8.9618, 9.4201, 9.2262]] - ) - elif model_name == "depth-anything-v2-metric-outdoor-large": - expected_slice = torch.tensor( - [[14.0137, 13.3627, 13.1080], [13.2522, 13.3943, 13.3705], [13.0581, 13.4505, 13.3925]] - ) - else: - raise ValueError("Not supported") - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"{model_name.title()}-hf") - processor.push_to_hub(repo_id=f"{model_name.title()}-hf") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="depth-anything-small", - type=str, - choices=name_to_checkpoint.keys(), - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - parser.add_argument( - "--verify_logits", - action="store_false", - required=False, - help="Whether to verify the logits after conversion.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits) diff --git a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index ba985145014c..000000000000 --- a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,277 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
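The read_in_q_k_v helpers above all follow the same recipe: the original checkpoints store one fused qkv projection per layer, while the converted model expects separate query, key and value weights, so the fused matrix is sliced row-wise into three equal blocks. A standalone sketch with random tensors (hidden_size=384 is just an example value):

# Standalone sketch of splitting a fused qkv projection into q/k/v, in the
# spirit of the read_in_q_k_v helpers above. Shapes are illustrative only.
import torch

hidden_size = 384
in_proj_weight = torch.randn(3 * hidden_size, hidden_size)  # fused [q; k; v] rows
in_proj_bias = torch.randn(3 * hidden_size)

query_weight = in_proj_weight[:hidden_size, :]
key_weight = in_proj_weight[hidden_size : hidden_size * 2, :]
value_weight = in_proj_weight[-hidden_size:, :]

query_bias = in_proj_bias[:hidden_size]
key_bias = in_proj_bias[hidden_size : hidden_size * 2]
value_bias = in_proj_bias[-hidden_size:]

# the three row slices together reconstruct the fused matrix
assert torch.equal(torch.cat([query_weight, key_weight, value_weight], dim=0), in_proj_weight)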
-"""Convert DETR checkpoints with timm backbone.""" - -import argparse -import json -from collections import OrderedDict -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -# here we list all keys to be renamed (original name on the left, our name on the right) -rename_keys = [] -for i in range(6): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight") - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) - # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight", - f"decoder.layers.{i}.encoder_attn.out_proj.weight", - ) - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias", - f"decoder.layers.{i}.encoder_attn.out_proj.bias", - ) - ) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", 
f"decoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) - -# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads -rename_keys.extend( - [ - ("input_proj.weight", "input_projection.weight"), - ("input_proj.bias", "input_projection.bias"), - ("query_embed.weight", "query_position_embeddings.weight"), - ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), - ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), - ("class_embed.weight", "class_labels_classifier.weight"), - ("class_embed.bias", "class_labels_classifier.bias"), - ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), - ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), - ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), - ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), - ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), - ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), - ] -) - - -def rename_key(state_dict, old, new): - val = state_dict.pop(old) - state_dict[new] = val - - -def rename_backbone_keys(state_dict): - new_state_dict = OrderedDict() - for key, value in state_dict.items(): - if "backbone.0.body" in key: - new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model") - new_state_dict[new_key] = value - else: - new_state_dict[key] = value - - return new_state_dict - - -def read_in_q_k_v(state_dict, is_panoptic=False): - prefix = "" - if is_panoptic: - prefix = "detr." - - # first: transformer encoder - for i in range(6): - # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # next: transformer decoder (which is a bit more complex because it also includes cross-attention) - for i in range(6): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # read in weights 
+ bias of input projection layer of cross-attention - in_proj_weight_cross_attn = state_dict.pop( - f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight" - ) - in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias") - # next, add query, keys and values (in that order) of cross-attention to the state dict - state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :] - state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256] - state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :] - state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512] - state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :] - state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_detr_checkpoint(model_name, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our DETR structure. - """ - - # load default config - config = DetrConfig() - # set backbone and dilation attributes - if "resnet101" in model_name: - config.backbone = "resnet101" - if "dc5" in model_name: - config.dilation = True - is_panoptic = "panoptic" in model_name - if is_panoptic: - config.num_labels = 250 - else: - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load image processor - format = "coco_panoptic" if is_panoptic else "coco_detection" - image_processor = DetrImageProcessor(format=format) - - # prepare image - img = prepare_img() - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info(f"Converting model {model_name}...") - - # load original model from torch hub - detr = torch.hub.load("facebookresearch/detr", model_name, pretrained=True).eval() - state_dict = detr.state_dict() - # rename keys - for src, dest in rename_keys: - if is_panoptic: - src = "detr." + src - rename_key(state_dict, src, dest) - state_dict = rename_backbone_keys(state_dict) - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict, is_panoptic=is_panoptic) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "detr.model." if is_panoptic else "model." - for key in state_dict.copy().keys(): - if is_panoptic: - if ( - key.startswith("detr") - and not key.startswith("class_labels_classifier") - and not key.startswith("bbox_predictor") - ): - val = state_dict.pop(key) - state_dict["detr.model" + key[4:]] = val - elif "class_labels_classifier" in key or "bbox_predictor" in key: - val = state_dict.pop(key) - state_dict["detr." 
+ key] = val - elif key.startswith("bbox_attention") or key.startswith("mask_head"): - continue - else: - val = state_dict.pop(key) - state_dict[prefix + key] = val - else: - if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - # verify our conversion - original_outputs = detr(pixel_values) - outputs = model(pixel_values) - assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4) - assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4) - if is_panoptic: - assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) - - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", default="detr_resnet50", type=str, help="Name of the DETR model you'd like to convert." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - args = parser.parse_args() - convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/detr/convert_detr_to_pytorch.py b/src/transformers/models/detr/convert_detr_to_pytorch.py deleted file mode 100644 index 6ba6a0e2920a..000000000000 --- a/src/transformers/models/detr/convert_detr_to_pytorch.py +++ /dev/null @@ -1,385 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
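The re-prefixing loop just above exists because, as its comment notes, the head models keep the base model under a different attribute: the detection head classes hold the base DETR model under a `model` (or `detr.model`) attribute while the classification and box heads sit at the top level. A reduced sketch of that step, with placeholder keys and tensors rather than a real DETR checkpoint:

# Reduced sketch of the prefixing step above: base-model weights gain a
# "model." prefix while the detection heads keep their top-level names.
# Keys and tensors are placeholders, not a real DETR checkpoint.
import torch

state_dict = {
    "encoder.layers.0.fc1.weight": torch.zeros(2048, 256),
    "class_labels_classifier.weight": torch.zeros(92, 256),
    "bbox_predictor.layers.0.weight": torch.zeros(256, 256),
}

prefix = "model."
for key in list(state_dict.keys()):
    if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
        state_dict[prefix + key] = state_dict.pop(key)

print(sorted(state_dict.keys()))
# ['bbox_predictor.layers.0.weight', 'class_labels_classifier.weight',
#  'model.encoder.layers.0.fc1.weight']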
-"""Convert DETR checkpoints with native (Transformers) backbone.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor, ResNetConfig -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_detr_config(model_name): - # initialize config - if "resnet-50" in model_name: - backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-50") - elif "resnet-101" in model_name: - backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-101") - else: - raise ValueError("Model name should include either resnet50 or resnet101") - - config = DetrConfig(use_timm_backbone=False, backbone_config=backbone_config) - - # set label attributes - is_panoptic = "panoptic" in model_name - if is_panoptic: - config.num_labels = 250 - else: - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config, is_panoptic - - -def create_rename_keys(config): - # here we list all keys to be renamed (original name on the left, our name on the right) - rename_keys = [] - - # stem - # fmt: off - rename_keys.append(("backbone.0.body.conv1.weight", "backbone.conv_encoder.model.embedder.embedder.convolution.weight")) - rename_keys.append(("backbone.0.body.bn1.weight", "backbone.conv_encoder.model.embedder.embedder.normalization.weight")) - rename_keys.append(("backbone.0.body.bn1.bias", "backbone.conv_encoder.model.embedder.embedder.normalization.bias")) - rename_keys.append(("backbone.0.body.bn1.running_mean", "backbone.conv_encoder.model.embedder.embedder.normalization.running_mean")) - rename_keys.append(("backbone.0.body.bn1.running_var", "backbone.conv_encoder.model.embedder.embedder.normalization.running_var")) - # stages - for stage_idx in range(len(config.backbone_config.depths)): - for layer_idx in range(config.backbone_config.depths[stage_idx]): - # shortcut - if layer_idx == 0: - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.0.weight", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.weight", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.bias", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_mean", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_var", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_var", - ) - ) - # 3 convs - for i in range(3): - rename_keys.append( - ( - 
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.conv{i+1}.weight", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.weight", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.bias", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_mean", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_var", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_var", - ) - ) - # fmt: on - - for i in range(config.encoder_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append( - ( - f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", - f"encoder.layers.{i}.self_attn.out_proj.weight", - ) - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias") - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) - # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", - f"decoder.layers.{i}.self_attn.out_proj.weight", - ) - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight", - f"decoder.layers.{i}.encoder_attn.out_proj.weight", - ) - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias", - f"decoder.layers.{i}.encoder_attn.out_proj.bias", - ) - ) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) - rename_keys.append( - 
(f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) - - # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads - rename_keys.extend( - [ - ("input_proj.weight", "input_projection.weight"), - ("input_proj.bias", "input_projection.bias"), - ("query_embed.weight", "query_position_embeddings.weight"), - ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), - ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), - ("class_embed.weight", "class_labels_classifier.weight"), - ("class_embed.bias", "class_labels_classifier.bias"), - ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), - ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), - ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), - ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), - ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), - ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), - ] - ) - - return rename_keys - - -def rename_key(state_dict, old, new): - val = state_dict.pop(old) - state_dict[new] = val - - -def read_in_q_k_v(state_dict, is_panoptic=False): - prefix = "" - if is_panoptic: - prefix = "detr." 
- - # first: transformer encoder - for i in range(6): - # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # next: transformer decoder (which is a bit more complex because it also includes cross-attention) - for i in range(6): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # read in weights + bias of input projection layer of cross-attention - in_proj_weight_cross_attn = state_dict.pop( - f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight" - ) - in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias") - # next, add query, keys and values (in that order) of cross-attention to the state dict - state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :] - state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256] - state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :] - state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512] - state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :] - state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_detr_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our DETR structure. 
- """ - - # load default config - config, is_panoptic = get_detr_config(model_name) - - # load original model from torch hub - model_name_to_original_name = { - "detr-resnet-50": "detr_resnet50", - "detr-resnet-101": "detr_resnet101", - } - logger.info(f"Converting model {model_name}...") - detr = torch.hub.load("facebookresearch/detr", model_name_to_original_name[model_name], pretrained=True).eval() - state_dict = detr.state_dict() - # rename keys - for src, dest in create_rename_keys(config): - if is_panoptic: - src = "detr." + src - rename_key(state_dict, src, dest) - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict, is_panoptic=is_panoptic) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "detr.model." if is_panoptic else "model." - for key in state_dict.copy().keys(): - if is_panoptic: - if ( - key.startswith("detr") - and not key.startswith("class_labels_classifier") - and not key.startswith("bbox_predictor") - ): - val = state_dict.pop(key) - state_dict["detr.model" + key[4:]] = val - elif "class_labels_classifier" in key or "bbox_predictor" in key: - val = state_dict.pop(key) - state_dict["detr." + key] = val - elif key.startswith("bbox_attention") or key.startswith("mask_head"): - continue - else: - val = state_dict.pop(key) - state_dict[prefix + key] = val - else: - if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - - # finally, create HuggingFace model and load state dict - model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - # verify our conversion on an image - format = "coco_panoptic" if is_panoptic else "coco_detection" - processor = DetrImageProcessor(format=format) - - encoding = processor(images=prepare_img(), return_tensors="pt") - pixel_values = encoding["pixel_values"] - - original_outputs = detr(pixel_values) - outputs = model(pixel_values) - - assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-3) - assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-3) - if is_panoptic: - assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - # Upload model and image processor to the hub - logger.info("Uploading PyTorch model and image processor to the hub...") - model.push_to_hub(f"nielsr/{model_name}") - processor.push_to_hub(f"nielsr/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", - default="detr-resnet-50", - type=str, - choices=["detr-resnet-50", "detr-resnet-101"], - help="Name of the DETR model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." 
- ) - parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to the hub or not.") - args = parser.parse_args() - convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index fbf34012924b..000000000000 --- a/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -import torch - -from transformers.utils import WEIGHTS_NAME - - -DIALOGPT_MODELS = ["small", "medium", "large"] - -OLD_KEY = "lm_head.decoder.weight" -NEW_KEY = "lm_head.weight" - - -def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str): - d = torch.load(checkpoint_path) - d[NEW_KEY] = d.pop(OLD_KEY) - os.makedirs(pytorch_dump_folder_path, exist_ok=True) - torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--dialogpt_path", default=".", type=str) - args = parser.parse_args() - for MODEL in DIALOGPT_MODELS: - checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl") - pytorch_dump_folder_path = f"./DialoGPT-{MODEL}" - convert_dialogpt_checkpoint( - checkpoint_path, - pytorch_dump_folder_path, - ) diff --git a/src/transformers/models/dinov2/convert_dinov2_to_hf.py b/src/transformers/models/dinov2/convert_dinov2_to_hf.py deleted file mode 100644 index d716191b2fcb..000000000000 --- a/src/transformers/models/dinov2/convert_dinov2_to_hf.py +++ /dev/null @@ -1,285 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DINOv2 checkpoints from the original repository. 
- -URL: https://github.com/facebookresearch/dinov2/tree/main -""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -import torch.nn as nn -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import BitImageProcessor, Dinov2Config, Dinov2ForImageClassification, Dinov2Model -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dinov2_config(model_name, image_classifier=False): - config = Dinov2Config(image_size=518, patch_size=14) - - # size of the architecture - if "vits" in model_name: - config.hidden_size = 384 - config.num_attention_heads = 6 - elif "vitb" in model_name: - pass - elif "vitl" in model_name: - config.hidden_size = 1024 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - elif "vitg" in model_name: - config.use_swiglu_ffn = True - config.hidden_size = 1536 - config.num_hidden_layers = 40 - config.num_attention_heads = 24 - else: - raise ValueError("Model not supported") - - if image_classifier: - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - config.num_labels = 1000 - config.id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - config.id2label = {int(k): v for k, v in config.id2label.items()} - - return config - - -def create_rename_keys(config): - rename_keys = [] - # fmt: off - - # patch embedding layer - rename_keys.append(("cls_token", "embeddings.cls_token")) - rename_keys.append(("mask_token", "embeddings.mask_token")) - rename_keys.append(("pos_embed", "embeddings.position_embeddings")) - rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) - - for i in range(config.num_hidden_layers): - # layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) - # MLP - if config.use_swiglu_ffn: - rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) - rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) - rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) - rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) - else: - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) - # layerscale - rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) - # attention projection layer - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", 
f"encoder.layer.{i}.attention.output.dense.bias")) - - # final layernorm - rename_keys.append(("norm.weight", "layernorm.weight")) - rename_keys.append(("norm.bias", "layernorm.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :] - state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image - - -@torch.no_grad() -def convert_dinov2_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our DINOv2 structure. 
- """ - - # define default Dinov2 configuration - image_classifier = "1layer" in model_name - config = get_dinov2_config(model_name, image_classifier=image_classifier) - - # load original model from torch hub - original_model = torch.hub.load("facebookresearch/dinov2", model_name.replace("_1layer", "")) - original_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config) - - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if "w12" in key: - key = key.replace("w12", "weights_in") - if "w3" in key: - key = key.replace("w3", "weights_out") - state_dict[key] = val - - # load HuggingFace model - if image_classifier: - model = Dinov2ForImageClassification(config).eval() - model.dinov2.load_state_dict(state_dict) - model_name_to_classifier_dict_url = { - "dinov2_vits14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_linear_head.pth", - "dinov2_vitb14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_linear_head.pth", - "dinov2_vitl14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_linear_head.pth", - "dinov2_vitg14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_linear_head.pth", - } - url = model_name_to_classifier_dict_url[model_name] - classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") - model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) - model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) - else: - model = Dinov2Model(config).eval() - model.load_state_dict(state_dict) - - # load image - image = prepare_img() - - # preprocess image - transformations = transforms.Compose( - [ - transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values - std=IMAGENET_DEFAULT_STD, # across a large photo dataset. 
- ), - ] - ) - - original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension - - processor = BitImageProcessor( - size={"shortest_edge": 256}, - resample=PILImageResampling.BICUBIC, - image_mean=IMAGENET_DEFAULT_MEAN, - image_std=IMAGENET_DEFAULT_STD, - ) - pixel_values = processor(image, return_tensors="pt").pixel_values - - assert torch.allclose(original_pixel_values, pixel_values) - - with torch.no_grad(): - outputs = model(pixel_values, output_hidden_states=True) - original_outputs = original_model(pixel_values) - - # assert values - if image_classifier: - print("Predicted class:") - class_idx = outputs.logits.argmax(-1).item() - print(model.config.id2label[class_idx]) - else: - assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape - assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model_name_to_hf_name = { - "dinov2_vits14": "dinov2-small", - "dinov2_vitb14": "dinov2-base", - "dinov2_vitl14": "dinov2-large", - "dinov2_vitg14": "dinov2-giant", - "dinov2_vits14_1layer": "dinov2-small-imagenet1k-1-layer", - "dinov2_vitb14_1layer": "dinov2-base-imagenet1k-1-layer", - "dinov2_vitl14_1layer": "dinov2-large-imagenet1k-1-layer", - "dinov2_vitg14_1layer": "dinov2-giant-imagenet1k-1-layer", - } - - name = model_name_to_hf_name[model_name] - model.push_to_hub(f"facebook/{name}") - processor.push_to_hub(f"facebook/{name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dinov2_vitb14", - type=str, - choices=[ - "dinov2_vits14", - "dinov2_vitb14", - "dinov2_vitl14", - "dinov2_vitg14", - "dinov2_vits14_1layer", - "dinov2_vitb14_1layer", - "dinov2_vitl14_1layer", - "dinov2_vitg14_1layer", - ], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - - args = parser.parse_args() - convert_dinov2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py b/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py deleted file mode 100644 index 0ff2697f7466..000000000000 --- a/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py +++ /dev/null @@ -1,291 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
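One DINOv2-specific detail in the converter above: for the SwiGLU (vitg) variants, the original `w12`/`w3` MLP weight names are rewritten to the `weights_in`/`weights_out` names used by the Transformers implementation. A tiny sketch of that substring-based rewrite, on placeholder keys and shapes:

# Sketch of the w12/w3 -> weights_in/weights_out rename performed for the
# SwiGLU (vitg) variants above; keys and shapes are placeholders only.
import torch

state_dict = {
    "encoder.layer.0.mlp.w12.weight": torch.zeros(8192, 1536),
    "encoder.layer.0.mlp.w3.weight": torch.zeros(1536, 4096),
}

for key in list(state_dict.keys()):
    val = state_dict.pop(key)
    new_key = key.replace("w12", "weights_in").replace("w3", "weights_out")
    state_dict[new_key] = val

print(sorted(state_dict.keys()))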
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DINOv2 with Registers checkpoints from the original repository. - -URL: https://github.com/facebookresearch/dinov2/tree/main -""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -import torch.nn as nn -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import ( - BitImageProcessor, - Dinov2WithRegistersConfig, - Dinov2WithRegistersForImageClassification, - Dinov2WithRegistersModel, -) -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dinov2_with_registers_config(model_name, image_classifier=False): - config = Dinov2WithRegistersConfig(image_size=518, patch_size=14) - - # size of the architecture - if "vits" in model_name: - config.hidden_size = 384 - config.num_attention_heads = 6 - elif "vitb" in model_name: - pass - elif "vitl" in model_name: - config.hidden_size = 1024 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - elif "vitg" in model_name: - config.use_swiglu_ffn = True - config.hidden_size = 1536 - config.num_hidden_layers = 40 - config.num_attention_heads = 24 - else: - raise ValueError("Model not supported") - - if image_classifier: - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - config.num_labels = 1000 - config.id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - config.id2label = {int(k): v for k, v in config.id2label.items()} - - return config - - -def create_rename_keys(config): - rename_keys = [] - # fmt: off - - # patch embedding layer - rename_keys.append(("cls_token", "embeddings.cls_token")) - rename_keys.append(("mask_token", "embeddings.mask_token")) - rename_keys.append(("pos_embed", "embeddings.position_embeddings")) - rename_keys.append(("register_tokens", "embeddings.register_tokens")) - rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) - - for i in range(config.num_hidden_layers): - # layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) - # MLP - if config.use_swiglu_ffn: - rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) - rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) - rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) - rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) - else: - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) - # layerscale - rename_keys.append((f"blocks.{i}.ls1.gamma", 
f"encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) - # attention projection layer - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias")) - - # final layernorm - rename_keys.append(("norm.weight", "layernorm.weight")) - rename_keys.append(("norm.bias", "layernorm.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :] - state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image - - -@torch.no_grad() -def convert_dinov2_with_registers_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our Dinov2WithRegisters structure. 
- """ - - # define default Dinov2WithRegisters configuration - image_classifier = "1layer" in model_name - config = get_dinov2_with_registers_config(model_name, image_classifier=image_classifier) - - # load original model from torch hub - original_model = torch.hub.load("facebookresearch/dinov2", model_name.replace("_1layer", "")) - original_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config) - - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if "w12" in key: - key = key.replace("w12", "weights_in") - if "w3" in key: - key = key.replace("w3", "weights_out") - state_dict[key] = val - - # load HuggingFace model - if image_classifier: - model = Dinov2WithRegistersForImageClassification(config).eval() - model.dinov2_with_registers.load_state_dict(state_dict) - model_name_to_classifier_dict_url = { - "dinov2_vits14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_reg4_linear_head.pth", - "dinov2_vitb14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_reg4_linear_head.pth", - "dinov2_vitl14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_reg4_linear_head.pth", - "dinov2_vitg14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_reg4_linear_head.pth", - } - url = model_name_to_classifier_dict_url[model_name] - classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") - model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) - model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) - else: - model = Dinov2WithRegistersModel(config).eval() - model.load_state_dict(state_dict) - - # load image - image = prepare_img() - - # preprocess image - transformations = transforms.Compose( - [ - transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values - std=IMAGENET_DEFAULT_STD, # across a large photo dataset. 
- ), - ] - ) - - original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension - - processor = BitImageProcessor( - size={"shortest_edge": 256}, - resample=PILImageResampling.BICUBIC, - image_mean=IMAGENET_DEFAULT_MEAN, - image_std=IMAGENET_DEFAULT_STD, - ) - pixel_values = processor(image, return_tensors="pt").pixel_values - - assert torch.allclose(original_pixel_values, pixel_values) - - with torch.no_grad(): - outputs = model(pixel_values, output_hidden_states=True) - original_outputs = original_model(pixel_values) - - # assert values - if image_classifier: - print("Predicted class:") - class_idx = outputs.logits.argmax(-1).item() - print(model.config.id2label[class_idx]) - else: - assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape - assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model_name_to_hf_name = { - "dinov2_vits14_reg": "dinov2-with-registers-small", - "dinov2_vitb14_reg": "dinov2-with-registers-base", - "dinov2_vitl14_reg": "dinov2-with-registers-large", - "dinov2_vitg14_reg": "dinov2-with-registers-giant", - "dinov2_vits14_reg_1layer": "dinov2-with-registers-small-imagenet1k-1-layer", - "dinov2_vitb14_reg_1layer": "dinov2-with-registers-base-imagenet1k-1-layer", - "dinov2_vitl14_reg_1layer": "dinov2-with-registers-large-imagenet1k-1-layer", - "dinov2_vitg14_reg_1layer": "dinov2-with-registers-giant-imagenet1k-1-layer", - } - - name = model_name_to_hf_name[model_name] - model.push_to_hub(f"nielsr/{name}") - processor.push_to_hub(f"nielsr/{name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dinov2_vits14_reg", - type=str, - choices=[ - "dinov2_vits14_reg", - "dinov2_vitb14_reg", - "dinov2_vitl14_reg", - "dinov2_vitg14_reg", - "dinov2_vits14_reg_1layer", - "dinov2_vitb14_reg_1layer", - "dinov2_vitl14_reg_1layer", - "dinov2_vitg14_reg_1layer", - ], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - - args = parser.parse_args() - convert_dinov2_with_registers_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py b/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py deleted file mode 100644 index 40c5b22e3b9a..000000000000 --- a/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py +++ /dev/null @@ -1,230 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DiT checkpoints from the unilm repository.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import BeitConfig, BeitForImageClassification, BeitForMaskedImageModeling, BeitImageProcessor -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, has_lm_head=False, is_semantic=False): - prefix = "backbone." if is_semantic else "" - - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"{prefix}blocks.{i}.norm1.weight", f"beit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"beit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.weight", f"beit.encoder.layer.{i}.attention.output.dense.weight") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.bias", f"beit.encoder.layer.{i}.attention.output.dense.bias") - ) - rename_keys.append((f"{prefix}blocks.{i}.norm2.weight", f"beit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"beit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.weight", f"beit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.bias", f"beit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"beit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"beit.encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - (f"{prefix}cls_token", "beit.embeddings.cls_token"), - (f"{prefix}patch_embed.proj.weight", "beit.embeddings.patch_embeddings.projection.weight"), - (f"{prefix}patch_embed.proj.bias", "beit.embeddings.patch_embeddings.projection.bias"), - (f"{prefix}pos_embed", "beit.embeddings.position_embeddings"), - ] - ) - - if has_lm_head: - # mask token + layernorm - rename_keys.extend( - [ - ("mask_token", "beit.embeddings.mask_token"), - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ] - ) - else: - # layernorm + classification head - rename_keys.extend( - [ - ("fc_norm.weight", "beit.pooler.layernorm.weight"), - ("fc_norm.bias", "beit.pooler.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False): - for i in range(config.num_hidden_layers): - prefix = "backbone." 
if is_semantic else "" - # queries, keys and values - in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight") - q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias") - - state_dict[f"beit.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.query.bias"] = q_bias - state_dict[f"beit.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.value.bias"] = v_bias - - # gamma_1 and gamma_2 - # we call them lambda because otherwise they are renamed when using .from_pretrained - gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1") - gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2") - - state_dict[f"beit.encoder.layer.{i}.lambda_1"] = gamma_1 - state_dict[f"beit.encoder.layer.{i}.lambda_2"] = gamma_2 - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dit_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our BEiT structure. - """ - - # define default BEiT configuration - has_lm_head = False if "rvlcdip" in checkpoint_url else True - config = BeitConfig(use_absolute_position_embeddings=True, use_mask_token=has_lm_head) - - # size of the architecture - if "large" in checkpoint_url or "dit-l" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - - # labels - if "rvlcdip" in checkpoint_url: - config.num_labels = 16 - repo_id = "huggingface/label-files" - filename = "rvlcdip-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load state_dict of original model, remove and rename some keys - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] - - rename_keys = create_rename_keys(config, has_lm_head=has_lm_head) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, has_lm_head=has_lm_head) - - # load HuggingFace model - model = BeitForMaskedImageModeling(config) if has_lm_head else BeitForImageClassification(config) - model.eval() - model.load_state_dict(state_dict) - - # Check outputs on an image - image_processor = BeitImageProcessor( - size=config.image_size, resample=PILImageResampling.BILINEAR, do_center_crop=False - ) - image = prepare_img() - - encoding = image_processor(images=image, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - outputs = model(pixel_values) - logits = outputs.logits - - # verify logits - expected_shape = [1, 16] if "rvlcdip" in checkpoint_url else [1, 196, 8192] - assert logits.shape == torch.Size(expected_shape), "Shape of logits not as expected" - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving 
model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - if has_lm_head: - model_name = "dit-base" if "base" in checkpoint_url else "dit-large" - else: - model_name = "dit-base-finetuned-rvlcdip" if "dit-b" in checkpoint_url else "dit-large-finetuned-rvlcdip" - image_processor.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add image processor", - use_temp_dir=True, - ) - model.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add model", - use_temp_dir=True, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_url", - default="https://layoutlm.blob.core.windows.net/dit/dit-pts/dit-base-224-p16-500k-62d53a.pth", - type=str, - help="URL to the original PyTorch checkpoint (.pth file).", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - ) - args = parser.parse_args() - convert_dit_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py deleted file mode 100644 index f6f14f6d08e3..000000000000 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ /dev/null @@ -1,234 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Donut checkpoints using the original `donut-python` library. 
URL: https://github.com/clovaai/donut""" - -import argparse - -import torch -from datasets import load_dataset -from donut import DonutModel - -from transformers import ( - DonutImageProcessor, - DonutProcessor, - DonutSwinConfig, - DonutSwinModel, - MBartConfig, - MBartForCausalLM, - VisionEncoderDecoderModel, - XLMRobertaTokenizerFast, -) - - -def get_configs(model): - original_config = model.config - - encoder_config = DonutSwinConfig( - image_size=original_config.input_size, - patch_size=4, - depths=original_config.encoder_layer, - num_heads=[4, 8, 16, 32], - window_size=original_config.window_size, - embed_dim=128, - ) - decoder_config = MBartConfig( - is_decoder=True, - is_encoder_decoder=False, - add_cross_attention=True, - decoder_layers=original_config.decoder_layer, - max_position_embeddings=original_config.max_position_embeddings, - vocab_size=len( - model.decoder.tokenizer - ), # several special tokens are added to the vocab of XLMRobertaTokenizer, see repo on the hub (added_tokens.json) - scale_embedding=True, - add_final_layer_norm=True, - ) - - return encoder_config, decoder_config - - -def rename_key(name): - if "encoder.model" in name: - name = name.replace("encoder.model", "encoder") - if "decoder.model" in name: - name = name.replace("decoder.model", "decoder") - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") - if "patch_embed.norm" in name: - name = name.replace("patch_embed.norm", "embeddings.norm") - if name.startswith("encoder"): - if "layers" in name: - name = "encoder." + name - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if "attn" in name and "mask" not in name: - name = name.replace("attn", "attention.self") - if "norm1" in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name: - name = name.replace("norm2", "layernorm_after") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") - - if name == "encoder.norm.weight": - name = "encoder.layernorm.weight" - if name == "encoder.norm.bias": - name = "encoder.layernorm.bias" - - return name - - -def convert_state_dict(orig_state_dict, model): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - key_split = key.split(".") - layer_num = int(key_split[3]) - block_num = int(key_split[5]) - dim = model.encoder.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size - - if "weight" in key: - orig_state_dict[ - f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight" - ] = val[:dim, :] - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"] = ( - val[dim : dim * 2, :] - ) - orig_state_dict[ - f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight" - ] = val[-dim:, :] - else: - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"] = ( - val[:dim] - ) - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"] = ( - val[dim : dim * 2] - ) - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"] = ( - val[-dim:] - ) - elif "attn_mask" in key or key in ["encoder.model.norm.weight", "encoder.model.norm.bias"]: - # HuggingFace implementation doesn't use attn_mask buffer - # and model doesn't use final 
LayerNorms for the encoder - pass - else: - orig_state_dict[rename_key(key)] = val - - return orig_state_dict - - -def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - # load original model - original_model = DonutModel.from_pretrained(model_name).eval() - - # load HuggingFace model - encoder_config, decoder_config = get_configs(original_model) - encoder = DonutSwinModel(encoder_config) - decoder = MBartForCausalLM(decoder_config) - model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder) - model.eval() - - state_dict = original_model.state_dict() - new_state_dict = convert_state_dict(state_dict, model) - model.load_state_dict(new_state_dict) - - # verify results on scanned document - dataset = load_dataset("hf-internal-testing/example-documents") # no-script - image = dataset["test"][0]["image"].convert("RGB") - - tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True) - image_processor = DonutImageProcessor( - do_align_long_axis=original_model.config.align_long_axis, size=original_model.config.input_size[::-1] - ) - processor = DonutProcessor(image_processor, tokenizer) - pixel_values = processor(image, return_tensors="pt").pixel_values - - if model_name == "naver-clova-ix/donut-base-finetuned-docvqa": - task_prompt = "{user_input}" - question = "When is the coffee break?" - task_prompt = task_prompt.replace("{user_input}", question) - elif model_name == "naver-clova-ix/donut-base-finetuned-rvlcdip": - task_prompt = "" - elif model_name in [ - "naver-clova-ix/donut-base-finetuned-cord-v1", - "naver-clova-ix/donut-base-finetuned-cord-v1-2560", - ]: - task_prompt = "" - elif model_name == "naver-clova-ix/donut-base-finetuned-cord-v2": - task_prompt = "s_cord-v2>" - elif model_name == "naver-clova-ix/donut-base-finetuned-zhtrainticket": - task_prompt = "" - elif model_name in ["naver-clova-ix/donut-proto", "naver-clova-ix/donut-base"]: - # use a random prompt - task_prompt = "hello world" - else: - raise ValueError("Model name not supported") - prompt_tensors = original_model.decoder.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")[ - "input_ids" - ] - - original_patch_embed = original_model.encoder.model.patch_embed(pixel_values) - patch_embeddings, _ = model.encoder.embeddings(pixel_values) - assert torch.allclose(original_patch_embed, patch_embeddings, atol=1e-3) - - # verify encoder hidden states - original_last_hidden_state = original_model.encoder(pixel_values) - last_hidden_state = model.encoder(pixel_values).last_hidden_state - assert torch.allclose(original_last_hidden_state, last_hidden_state, atol=1e-2) - - # verify decoder hidden states - original_logits = original_model(pixel_values, prompt_tensors, None).logits - logits = model(pixel_values, decoder_input_ids=prompt_tensors).logits - assert torch.allclose(original_logits, logits, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model") - processor.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="naver-clova-ix/donut-base-finetuned-docvqa", - required=False, - 
type=str, - help="Name of the original model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - required=False, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether or not to push the converted model and processor to the đŸ€— hub.", - ) - - args = parser.parse_args() - convert_donut_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py b/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py deleted file mode 100644 index c11345d1eb4e..000000000000 --- a/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import collections -from pathlib import Path - -import torch -from torch.serialization import default_restore_location - -from transformers import BertConfig, DPRConfig, DPRContextEncoder, DPRQuestionEncoder, DPRReader - - -CheckpointState = collections.namedtuple( - "CheckpointState", ["model_dict", "optimizer_dict", "scheduler_dict", "offset", "epoch", "encoder_params"] -) - - -def load_states_from_checkpoint(model_file: str) -> CheckpointState: - print(f"Reading saved model from {model_file}") - state_dict = torch.load(model_file, map_location=lambda s, l: default_restore_location(s, "cpu")) - return CheckpointState(**state_dict) - - -class DPRState: - def __init__(self, src_file: Path): - self.src_file = src_file - - def load_dpr_model(self): - raise NotImplementedError - - @staticmethod - def from_type(comp_type: str, *args, **kwargs) -> "DPRState": - if comp_type.startswith("c"): - return DPRContextEncoderState(*args, **kwargs) - if comp_type.startswith("q"): - return DPRQuestionEncoderState(*args, **kwargs) - if comp_type.startswith("r"): - return DPRReaderState(*args, **kwargs) - else: - raise ValueError("Component type must be either 'ctx_encoder', 'question_encoder' or 'reader'.") - - -class DPRContextEncoderState(DPRState): - def load_dpr_model(self): - model = DPRContextEncoder(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0])) - print(f"Loading DPR biencoder from {self.src_file}") - saved_state = load_states_from_checkpoint(self.src_file) - encoder, prefix = model.ctx_encoder, "ctx_model." - # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3 - state_dict = {"bert_model.embeddings.position_ids": model.ctx_encoder.bert_model.embeddings.position_ids} - for key, value in saved_state.model_dict.items(): - if key.startswith(prefix): - key = key[len(prefix) :] - if not key.startswith("encode_proj."): - key = "bert_model." 
+ key - state_dict[key] = value - encoder.load_state_dict(state_dict) - return model - - -class DPRQuestionEncoderState(DPRState): - def load_dpr_model(self): - model = DPRQuestionEncoder(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0])) - print(f"Loading DPR biencoder from {self.src_file}") - saved_state = load_states_from_checkpoint(self.src_file) - encoder, prefix = model.question_encoder, "question_model." - # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3 - state_dict = {"bert_model.embeddings.position_ids": model.question_encoder.bert_model.embeddings.position_ids} - for key, value in saved_state.model_dict.items(): - if key.startswith(prefix): - key = key[len(prefix) :] - if not key.startswith("encode_proj."): - key = "bert_model." + key - state_dict[key] = value - encoder.load_state_dict(state_dict) - return model - - -class DPRReaderState(DPRState): - def load_dpr_model(self): - model = DPRReader(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0])) - print(f"Loading DPR reader from {self.src_file}") - saved_state = load_states_from_checkpoint(self.src_file) - # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3 - state_dict = { - "encoder.bert_model.embeddings.position_ids": model.span_predictor.encoder.bert_model.embeddings.position_ids - } - for key, value in saved_state.model_dict.items(): - if key.startswith("encoder.") and not key.startswith("encoder.encode_proj"): - key = "encoder.bert_model." + key[len("encoder.") :] - state_dict[key] = value - model.span_predictor.load_state_dict(state_dict) - return model - - -def convert(comp_type: str, src_file: Path, dest_dir: Path): - dest_dir = Path(dest_dir) - dest_dir.mkdir(exist_ok=True) - - dpr_state = DPRState.from_type(comp_type, src_file=src_file) - model = dpr_state.load_dpr_model() - model.save_pretrained(dest_dir) - model.from_pretrained(dest_dir) # sanity check - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--type", type=str, help="Type of the component to convert: 'ctx_encoder', 'question_encoder' or 'reader'." - ) - parser.add_argument( - "--src", - type=str, - help=( - "Path to the dpr checkpoint file. They can be downloaded from the official DPR repo" - " https://github.com/facebookresearch/DPR. Note that in the official repo, both encoders are stored in the" - " 'retriever' checkpoints." - ), - ) - parser.add_argument("--dest", type=str, default=None, help="Path to the output PyTorch model directory.") - args = parser.parse_args() - - src_file = Path(args.src) - dest_dir = f"converted-{src_file.name}" if args.dest is None else args.dest - dest_dir = Path(dest_dir) - assert src_file.exists() - assert ( - args.type is not None - ), "Please specify the component type of the DPR model to convert: 'ctx_encoder', 'question_encoder' or 'reader'." - convert(args.type, src_file, dest_dir) diff --git a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py deleted file mode 100644 index 367aff7f90e1..000000000000 --- a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py +++ /dev/null @@ -1,383 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DINOv2 + DPT checkpoints from the original repository. URL: -https://github.com/facebookresearch/dinov2/tree/main""" - -import argparse -import itertools -import math -from pathlib import Path - -import requests -import torch -from PIL import Image -from torchvision import transforms - -from transformers import Dinov2Config, DPTConfig, DPTForDepthEstimation, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(model_name): - if "small" in model_name: - # equivalent to stage 3, stage 6, stage 9, stage 12 - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-small", out_indices=[3, 6, 9, 12], apply_layernorm=False, reshape_hidden_states=False - ) - neck_hidden_sizes = [48, 96, 192, 384] - elif "base" in model_name: - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-base", out_indices=[3, 6, 9, 12], apply_layernorm=False, reshape_hidden_states=False - ) - neck_hidden_sizes = [96, 192, 384, 768] - elif "large" in model_name: - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-large", out_indices=[5, 12, 18, 24], apply_layernorm=False, reshape_hidden_states=False - ) - neck_hidden_sizes = [128, 256, 512, 1024] - elif "giant" in model_name: - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-giant", out_indices=[10, 20, 30, 40], apply_layernorm=False, reshape_hidden_states=False - ) - neck_hidden_sizes = [192, 384, 768, 1536] - else: - raise NotImplementedError("To do") - - config = DPTConfig( - backbone_config=backbone_config, - neck_hidden_sizes=neck_hidden_sizes, - use_bias_in_fusion_residual=False, - add_projection=True, - ) - - return config - - -# here we list all DPT keys to be renamed (original name on the left, our name on the right) -def create_rename_keys_dpt(config): - rename_keys = [] - - # fmt: off - # activation postprocessing (projections, readout projections + resize blocks) - for i in range(4): - rename_keys.append((f"decode_head.reassemble_blocks.projects.{i}.conv.weight", f"neck.reassemble_stage.layers.{i}.projection.weight")) - rename_keys.append((f"decode_head.reassemble_blocks.projects.{i}.conv.bias", f"neck.reassemble_stage.layers.{i}.projection.bias")) - - rename_keys.append((f"decode_head.reassemble_blocks.readout_projects.{i}.0.weight", f"neck.reassemble_stage.readout_projects.{i}.0.weight")) - rename_keys.append((f"decode_head.reassemble_blocks.readout_projects.{i}.0.bias", f"neck.reassemble_stage.readout_projects.{i}.0.bias")) - - if i != 2: - rename_keys.append((f"decode_head.reassemble_blocks.resize_layers.{i}.weight", f"neck.reassemble_stage.layers.{i}.resize.weight")) - rename_keys.append((f"decode_head.reassemble_blocks.resize_layers.{i}.bias", f"neck.reassemble_stage.layers.{i}.resize.bias")) - - # fusion layers - for i in range(4): - rename_keys.append((f"decode_head.fusion_blocks.{i}.project.conv.weight", f"neck.fusion_stage.layers.{i}.projection.weight")) - rename_keys.append((f"decode_head.fusion_blocks.{i}.project.conv.bias", 
f"neck.fusion_stage.layers.{i}.projection.bias")) - if i != 0: - rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit1.conv1.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer1.convolution1.weight")) - rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit1.conv2.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer1.convolution2.weight")) - rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit2.conv1.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer2.convolution1.weight")) - rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit2.conv2.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer2.convolution2.weight")) - - # neck convolutions - for i in range(4): - rename_keys.append((f"decode_head.convs.{i}.conv.weight", f"neck.convs.{i}.weight")) - - # head - rename_keys.append(("decode_head.project.conv.weight", "head.projection.weight")) - rename_keys.append(("decode_head.project.conv.bias", "head.projection.bias")) - - for i in range(0, 5, 2): - rename_keys.append((f"decode_head.conv_depth.head.{i}.weight", f"head.head.{i}.weight")) - rename_keys.append((f"decode_head.conv_depth.head.{i}.bias", f"head.head.{i}.bias")) - # fmt: on - - return rename_keys - - -# here we list all backbone keys to be renamed (original name on the left, our name on the right) -def create_rename_keys_backbone(config): - rename_keys = [] - - # fmt: off - # patch embedding layer - rename_keys.append(("cls_token", "backbone.embeddings.cls_token")) - rename_keys.append(("mask_token", "backbone.embeddings.mask_token")) - rename_keys.append(("pos_embed", "backbone.embeddings.position_embeddings")) - rename_keys.append(("patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - - # Transfomer encoder - for i in range(config.backbone_config.num_hidden_layers): - # layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.norm2.bias")) - # MLP - if config.backbone_config.use_swiglu_ffn: - rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"backbone.encoder.layer.{i}.mlp.w12.weight")) - rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"backbone.encoder.layer.{i}.mlp.w12.bias")) - rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"backbone.encoder.layer.{i}.mlp.w3.weight")) - rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"backbone.encoder.layer.{i}.mlp.w3.bias")) - else: - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.mlp.fc2.bias")) - # layerscale - rename_keys.append((f"blocks.{i}.ls1.gamma", f"backbone.encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"blocks.{i}.ls2.gamma", f"backbone.encoder.layer.{i}.layer_scale2.lambda1")) - # attention projection layer - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight")) - 
rename_keys.append((f"blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias")) - # fmt: on - - rename_keys.append(("norm.weight", "backbone.layernorm.weight")) - rename_keys.append(("norm.bias", "backbone.layernorm.bias")) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.backbone_config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - hidden_size = config.backbone_config.hidden_size - # next, add query, keys and values (in that order) to the state dict - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[:hidden_size] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - hidden_size : hidden_size * 2 - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-hidden_size:] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "https://dl.fbaipublicfiles.com/dinov2/images/example.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -name_to_url = { - "dpt-dinov2-small-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_nyu_dpt_head.pth", - "dpt-dinov2-small-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_kitti_dpt_head.pth", - "dpt-dinov2-base-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_nyu_dpt_head.pth", - "dpt-dinov2-base-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_kitti_dpt_head.pth", - "dpt-dinov2-large-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_nyu_dpt_head.pth", - "dpt-dinov2-large-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_kitti_dpt_head.pth", - "dpt-dinov2-giant-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_nyu_dpt_head.pth", - "dpt-dinov2-giant-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_kitti_dpt_head.pth", -} - - -def get_original_pixel_values(image): - class CenterPadding: - def __init__(self, multiple): - super().__init__() - self.multiple = multiple - - def _get_pad(self, size): - new_size = math.ceil(size / self.multiple) * self.multiple - pad_size = new_size - size - pad_size_left = pad_size // 2 - pad_size_right = pad_size - pad_size_left - return pad_size_left, pad_size_right - - def __call__(self, img): - pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in img.shape[-2:][::-1])) - output = torch.nn.functional.pad(img, pads) - return output - - def __repr__(self): - return self.__class__.__name__ + "()" - - def make_depth_transform() -> transforms.Compose: - return transforms.Compose( - [ - transforms.ToTensor(), - lambda x: 255.0 * x[:3], # Discard alpha component and scale by 255 - transforms.Normalize( - 
mean=(123.675, 116.28, 103.53), - std=(58.395, 57.12, 57.375), - ), - CenterPadding(multiple=14), - ] - ) - - transform = make_depth_transform() - original_pixel_values = transform(image).unsqueeze(0) - - return original_pixel_values - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits): - """ - Copy/paste/tweak model's weights to our DPT structure. - """ - - # define DPT configuration based on URL - checkpoint_url = name_to_url[model_name] - config = get_dpt_config(model_name) - - # load original DPT state_dict from URL - print("URL:", checkpoint_url) - dpt_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["state_dict"] - # rename keys - rename_keys = create_rename_keys_dpt(config) - for src, dest in rename_keys: - rename_key(dpt_state_dict, src, dest) - - # load original backbone state_dict from URL - if "small" in model_name: - original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14") - elif "base" in model_name: - original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14") - elif "large" in model_name: - original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitl14") - elif "giant" in model_name: - original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitg14") - else: - raise NotImplementedError("To do") - original_model.eval() - backbone_state_dict = original_model.state_dict() - - # rename keys - rename_keys = create_rename_keys_backbone(config) - for src, dest in rename_keys: - rename_key(backbone_state_dict, src, dest) - - # read in qkv matrices - read_in_q_k_v(backbone_state_dict, config) - - for key, val in backbone_state_dict.copy().items(): - val = backbone_state_dict.pop(key) - if "w12" in key: - key = key.replace("w12", "weights_in") - if "w3" in key: - key = key.replace("w3", "weights_out") - backbone_state_dict[key] = val - - # merge state_dicts - state_dict = {**backbone_state_dict, **dpt_state_dict} - - # load HuggingFace model - model = DPTForDepthEstimation(config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - assert missing_keys == [ - "neck.fusion_stage.layers.0.residual_layer1.convolution1.weight", - "neck.fusion_stage.layers.0.residual_layer1.convolution2.weight", - ] - model.eval() - - # Verify image processor - processor = DPTImageProcessor( - do_resize=False, - do_rescale=False, - do_pad=True, - size_divisor=14, - do_normalize=True, - image_mean=(123.675, 116.28, 103.53), - image_std=(58.395, 57.12, 57.375), - ) - - image = prepare_img() - pixel_values = processor(image, return_tensors="pt").pixel_values.float() - original_pixel_values = get_original_pixel_values(image) - - assert torch.allclose(pixel_values, original_pixel_values) - - # Verify forward pass - with torch.no_grad(): - outputs = model(pixel_values) - - predicted_depth = outputs.predicted_depth - - print("Shape of predicted depth:", predicted_depth.shape) - print("First values of predicted depth:", predicted_depth[0, :3, :3]) - - # assert logits - if verify_logits: - if model_name == "dpt-dinov2-small-nyu": - expected_shape = torch.Size([1, 576, 736]) - expected_slice = torch.tensor( - [[3.3576, 3.4741, 3.4345], [3.4324, 3.5012, 3.2775], [3.2560, 3.3563, 3.2354]] - ) - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-5) - print("Looks ok!") - 
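
For reference, the converters deleted in this patch all rely on the same verification idiom seen just above: run the converted model once and compare a small slice of its output against hard-coded reference values with an explicit tolerance, since the original and ported implementations can differ slightly in op ordering. The following is a minimal, self-contained sketch of that check; the helper name check_against_reference and every tensor value in it are illustrative assumptions, not taken from any checkpoint.

import torch

def check_against_reference(predicted: torch.Tensor, expected_shape: torch.Size,
                            expected_slice: torch.Tensor, atol: float = 1e-5) -> None:
    # Shape must match exactly; values only need to agree within `atol`.
    assert predicted.shape == expected_shape, f"unexpected shape {predicted.shape}"
    assert torch.allclose(predicted[0, :3, :3], expected_slice, atol=atol), "values drifted"
    print("Looks ok!")

# Toy usage with fabricated numbers; a real run would pass the converted model's output
# and the reference slice recorded from the original implementation.
pred = torch.zeros(1, 576, 736)
pred[0, :3, :3] = torch.tensor([[3.0, 3.1, 3.2], [3.3, 3.4, 3.5], [3.6, 3.7, 3.8]])
check_against_reference(pred, torch.Size([1, 576, 736]), pred[0, :3, :3].clone())
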
- if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"facebook/{model_name}") - processor.push_to_hub(repo_id=f"facebook/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dpt-dinov2-small-nyu", - type=str, - choices=name_to_url.keys(), - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - parser.add_argument( - "--verify_logits", - action="store_true", - required=False, - help="Path to the output PyTorch model directory.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits) diff --git a/src/transformers/models/dpt/convert_dpt_beit_to_hf.py b/src/transformers/models/dpt/convert_dpt_beit_to_hf.py deleted file mode 100644 index 3a576d772f57..000000000000 --- a/src/transformers/models/dpt/convert_dpt_beit_to_hf.py +++ /dev/null @@ -1,305 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DPT 3.1 checkpoints from the MiDaS repository. 
URL: https://github.com/isl-org/MiDaS""" - -import argparse -from pathlib import Path - -import requests -import torch -from PIL import Image - -from transformers import BeitConfig, DPTConfig, DPTForDepthEstimation, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(model_name): - hidden_size = 768 - num_hidden_layers = 12 - num_attention_heads = 12 - intermediate_size = 3072 - out_features = ["stage3", "stage6", "stage9", "stage12"] # beit-base-384 uses [2, 5, 8, 11] - - if "large" in model_name: - hidden_size = 1024 - num_hidden_layers = 24 - num_attention_heads = 16 - intermediate_size = 4096 - out_features = ["stage6", "stage12", "stage18", "stage24"] # beit-large-512 uses [5, 11, 17, 23] - - if "512" in model_name: - image_size = 512 - elif "384" in model_name: - image_size = 384 - else: - raise ValueError("Model not supported") - - backbone_config = BeitConfig( - image_size=image_size, - num_hidden_layers=num_hidden_layers, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_attention_heads=num_attention_heads, - use_relative_position_bias=True, - reshape_hidden_states=False, - out_features=out_features, - ) - - neck_hidden_sizes = [256, 512, 1024, 1024] if "large" in model_name else [96, 192, 384, 768] - config = DPTConfig(backbone_config=backbone_config, neck_hidden_sizes=neck_hidden_sizes) - - return config, image_size - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - - # fmt: off - # stem - rename_keys.append(("pretrained.model.cls_token", "backbone.embeddings.cls_token")) - rename_keys.append(("pretrained.model.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("pretrained.model.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - - # Transfomer encoder - for i in range(config.backbone_config.num_hidden_layers): - rename_keys.append((f"pretrained.model.blocks.{i}.gamma_1", f"backbone.encoder.layer.{i}.lambda_1")) - rename_keys.append((f"pretrained.model.blocks.{i}.gamma_2", f"backbone.encoder.layer.{i}.lambda_2")) - rename_keys.append((f"pretrained.model.blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.output.dense.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias")) - 
rename_keys.append((f"pretrained.model.blocks.{i}.attn.relative_position_bias_table", f"backbone.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table")) - rename_keys.append((f"pretrained.model.blocks.{i}.attn.relative_position_index", f"backbone.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index")) - - # activation postprocessing (readout projections + resize blocks) - for i in range(4): - rename_keys.append((f"pretrained.act_postprocess{i+1}.0.project.0.weight", f"neck.reassemble_stage.readout_projects.{i}.0.weight")) - rename_keys.append((f"pretrained.act_postprocess{i+1}.0.project.0.bias", f"neck.reassemble_stage.readout_projects.{i}.0.bias")) - - rename_keys.append((f"pretrained.act_postprocess{i+1}.3.weight", f"neck.reassemble_stage.layers.{i}.projection.weight")) - rename_keys.append((f"pretrained.act_postprocess{i+1}.3.bias", f"neck.reassemble_stage.layers.{i}.projection.bias")) - - if i != 2: - rename_keys.append((f"pretrained.act_postprocess{i+1}.4.weight", f"neck.reassemble_stage.layers.{i}.resize.weight")) - rename_keys.append((f"pretrained.act_postprocess{i+1}.4.bias", f"neck.reassemble_stage.layers.{i}.resize.bias")) - - # refinenet (tricky here) - mapping = {1:3, 2:2, 3:1, 4:0} - - for i in range(1, 5): - j = mapping[i] - rename_keys.append((f"scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight")) - rename_keys.append((f"scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias")) - - # scratch convolutions - for i in range(4): - rename_keys.append((f"scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight")) - - # head - for i in range(0, 5, 2): - rename_keys.append((f"scratch.output_conv.{i}.weight", f"head.head.{i}.weight")) - rename_keys.append((f"scratch.output_conv.{i}.bias", f"head.head.{i}.bias")) - - return rename_keys - - -def remove_ignore_keys_(state_dict): - ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - hidden_size = config.backbone_config.hidden_size - for i in range(config.backbone_config.num_hidden_layers): - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - 
in_proj_weight = state_dict.pop(f"pretrained.model.blocks.{i}.attn.qkv.weight") - q_bias = state_dict.pop(f"pretrained.model.blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"pretrained.model.blocks.{i}.attn.v_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = q_bias - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = v_bias - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): - """ - Copy/paste/tweak model's weights to our DPT structure. - """ - - name_to_url = { - "dpt-beit-large-512": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt", - "dpt-beit-large-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt", - "dpt-beit-base-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt", - } - - # define DPT configuration based on URL - checkpoint_url = name_to_url[model_name] - config, image_size = get_dpt_config(model_name) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - # remove certain keys - remove_ignore_keys_(state_dict) - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - # read in qkv matrices - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = DPTForDepthEstimation(config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - assert missing_keys == [] - # assert unexpected_keys == ["pretrained.model.fc_norm.weight", "pretrained.model.fc_norm.bias"] - model.eval() - - # Check outputs on an image - # We set `keep_aspect_ratio=False` as our current BEiT does not support arbitrary window sizes - processor = DPTImageProcessor( - size={"height": image_size, "width": image_size}, keep_aspect_ratio=False, ensure_multiple_of=32 - ) - - image = prepare_img() - pixel_values = processor(image, return_tensors="pt").pixel_values - - print("First values of pixel values:", pixel_values[0, 0, :3, :3]) - print("Mean of pixel values:", pixel_values.mean().item()) - print("Shape of pixel values:", pixel_values.shape) - - import requests - from PIL import Image - from torchvision import transforms - - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - transforms = transforms.Compose( - [ - transforms.Resize((image_size, image_size)), - transforms.ToTensor(), - ] - ) - pixel_values = transforms(image).unsqueeze(0) - - # forward pass - with torch.no_grad(): - outputs = model(pixel_values) - - predicted_depth = outputs.predicted_depth - - print("Shape of 
predicted depth:", predicted_depth.shape) - print("First values of predicted depth:", predicted_depth[0, :3, :3]) - - # assert logits - # TODO there's still a small difference with the original logits - if model_name == "dpt-beit-large-512": - # OK, checked - expected_shape = torch.Size([1, 512, 512]) - expected_slice = torch.tensor( - [[2804.6260, 2792.5708, 2812.9263], [2772.0288, 2780.1118, 2796.2529], [2748.1094, 2766.6558, 2766.9834]] - ) - elif model_name == "dpt-beit-large-384": - # OK, checked - expected_shape = torch.Size([1, 384, 384]) - expected_slice = torch.tensor( - [[1783.2273, 1780.5729, 1792.6453], [1759.9817, 1765.5359, 1778.5002], [1739.1633, 1754.7903, 1757.1990]], - ) - elif model_name == "dpt-beit-base-384": - # OK, checked - expected_shape = torch.Size([1, 384, 384]) - expected_slice = torch.tensor( - [[2898.4482, 2891.3750, 2904.8079], [2858.6685, 2877.2615, 2894.4507], [2842.1235, 2854.1023, 2861.6328]], - ) - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"nielsr/{model_name}") - processor.push_to_hub(repo_id=f"nielsr/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dpt-beit-large-512", - type=str, - choices=["dpt-beit-large-512", "dpt-beit-large-384", "dpt-beit-base-384"], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py deleted file mode 100644 index 16e4d71212b5..000000000000 --- a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py +++ /dev/null @@ -1,315 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DPT checkpoints from the original repository. 
URL: https://github.com/isl-org/DPT""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(checkpoint_url): - config = DPTConfig(embedding_type="hybrid") - - if "large" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - config.backbone_out_indices = [5, 11, 17, 23] - config.neck_hidden_sizes = [256, 512, 1024, 1024] - expected_shape = (1, 384, 384) - - if "nyu" in checkpoint_url or "midas" in checkpoint_url: - config.hidden_size = 768 - config.reassemble_factors = [1, 1, 1, 0.5] - config.neck_hidden_sizes = [256, 512, 768, 768] - config.num_labels = 150 - config.patch_size = 16 - expected_shape = (1, 384, 384) - config.use_batch_norm_in_fusion_residual = False - config.readout_type = "project" - - if "ade" in checkpoint_url: - config.use_batch_norm_in_fusion_residual = True - config.hidden_size = 768 - config.reassemble_stage = [1, 1, 1, 0.5] - config.num_labels = 150 - config.patch_size = 16 - repo_id = "huggingface/label-files" - filename = "ade20k-id2label.json" - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - expected_shape = [1, 150, 480, 480] - - return config, expected_shape - - -def remove_ignore_keys_(state_dict): - ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(name): - if ( - "pretrained.model" in name - and "cls_token" not in name - and "pos_embed" not in name - and "patch_embed" not in name - ): - name = name.replace("pretrained.model", "dpt.encoder") - if "pretrained.model" in name: - name = name.replace("pretrained.model", "dpt.embeddings") - if "patch_embed" in name: - name = name.replace("patch_embed", "") - if "pos_embed" in name: - name = name.replace("pos_embed", "position_embeddings") - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if "proj" in name and "project" not in name: - name = name.replace("proj", "projection") - if "blocks" in name: - name = name.replace("blocks", "layer") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") - if "norm1" in name and "backbone" not in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name and "backbone" not in name: - name = name.replace("norm2", "layernorm_after") - if "scratch.output_conv" in name: - name = name.replace("scratch.output_conv", "head") - if "scratch" in name: - name = name.replace("scratch", "neck") - if "layer1_rn" in name: - name = name.replace("layer1_rn", "convs.0") - if "layer2_rn" in name: - name = name.replace("layer2_rn", "convs.1") - if "layer3_rn" in name: - name = name.replace("layer3_rn", "convs.2") - if "layer4_rn" in name: - name = name.replace("layer4_rn", "convs.3") - if "refinenet" in name: - layer_idx = int(name[len("neck.refinenet") : len("neck.refinenet") + 1]) - # tricky here: we need to map 4 to 0, 3 to 
1, 2 to 2 and 1 to 3 - name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx-4)}") - if "out_conv" in name: - name = name.replace("out_conv", "projection") - if "resConfUnit1" in name: - name = name.replace("resConfUnit1", "residual_layer1") - if "resConfUnit2" in name: - name = name.replace("resConfUnit2", "residual_layer2") - if "conv1" in name: - name = name.replace("conv1", "convolution1") - if "conv2" in name: - name = name.replace("conv2", "convolution2") - # readout blocks - if "pretrained.act_postprocess1.0.project.0" in name: - name = name.replace("pretrained.act_postprocess1.0.project.0", "neck.reassemble_stage.readout_projects.0.0") - if "pretrained.act_postprocess2.0.project.0" in name: - name = name.replace("pretrained.act_postprocess2.0.project.0", "neck.reassemble_stage.readout_projects.1.0") - if "pretrained.act_postprocess3.0.project.0" in name: - name = name.replace("pretrained.act_postprocess3.0.project.0", "neck.reassemble_stage.readout_projects.2.0") - if "pretrained.act_postprocess4.0.project.0" in name: - name = name.replace("pretrained.act_postprocess4.0.project.0", "neck.reassemble_stage.readout_projects.3.0") - - # resize blocks - if "pretrained.act_postprocess1.3" in name: - name = name.replace("pretrained.act_postprocess1.3", "neck.reassemble_stage.layers.0.projection") - if "pretrained.act_postprocess1.4" in name: - name = name.replace("pretrained.act_postprocess1.4", "neck.reassemble_stage.layers.0.resize") - if "pretrained.act_postprocess2.3" in name: - name = name.replace("pretrained.act_postprocess2.3", "neck.reassemble_stage.layers.1.projection") - if "pretrained.act_postprocess2.4" in name: - name = name.replace("pretrained.act_postprocess2.4", "neck.reassemble_stage.layers.1.resize") - if "pretrained.act_postprocess3.3" in name: - name = name.replace("pretrained.act_postprocess3.3", "neck.reassemble_stage.layers.2.projection") - if "pretrained.act_postprocess4.3" in name: - name = name.replace("pretrained.act_postprocess4.3", "neck.reassemble_stage.layers.3.projection") - if "pretrained.act_postprocess4.4" in name: - name = name.replace("pretrained.act_postprocess4.4", "neck.reassemble_stage.layers.3.resize") - if "pretrained" in name: - name = name.replace("pretrained", "dpt") - if "bn" in name: - name = name.replace("bn", "batch_norm") - if "head" in name: - name = name.replace("head", "head.head") - if "encoder.norm" in name: - name = name.replace("encoder.norm", "layernorm") - if "auxlayer" in name: - name = name.replace("auxlayer", "auxiliary_head.head") - if "backbone" in name: - name = name.replace("backbone", "backbone.bit.encoder") - - if ".." 
in name: - name = name.replace("..", ".") - - if "stem.conv" in name: - name = name.replace("stem.conv", "bit.embedder.convolution") - if "blocks" in name: - name = name.replace("blocks", "layers") - if "convolution" in name and "backbone" in name: - name = name.replace("convolution", "conv") - if "layer" in name and "backbone" in name: - name = name.replace("layer", "layers") - if "backbone.bit.encoder.bit" in name: - name = name.replace("backbone.bit.encoder.bit", "backbone.bit") - if "embedder.conv" in name: - name = name.replace("embedder.conv", "embedder.convolution") - if "backbone.bit.encoder.stem.norm" in name: - name = name.replace("backbone.bit.encoder.stem.norm", "backbone.bit.embedder.norm") - return name - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub, model_name, show_prediction): - """ - Copy/paste/tweak model's weights to our DPT structure. 
- """ - - # define DPT configuration based on URL - config, expected_shape = get_dpt_config(checkpoint_url) - # load original state_dict from URL - # state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - state_dict = torch.load(checkpoint_url, map_location="cpu") - # remove certain keys - remove_ignore_keys_(state_dict) - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # read in qkv matrices - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = DPTForSemanticSegmentation(config) if "ade" in checkpoint_url else DPTForDepthEstimation(config) - model.load_state_dict(state_dict) - model.eval() - - # Check outputs on an image - size = 480 if "ade" in checkpoint_url else 384 - image_processor = DPTImageProcessor(size=size) - - image = prepare_img() - encoding = image_processor(image, return_tensors="pt") - - # forward pass - outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth - - if show_prediction: - prediction = ( - torch.nn.functional.interpolate( - outputs.unsqueeze(1), - size=(image.size[1], image.size[0]), - mode="bicubic", - align_corners=False, - ) - .squeeze() - .cpu() - .numpy() - ) - - Image.fromarray((prediction / prediction.max()) * 255).show() - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model.push_to_hub("ybelkada/dpt-hybrid-midas") - image_processor.push_to_hub("ybelkada/dpt-hybrid-midas") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt", - type=str, - help="URL of the original DPT checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=False, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - ) - parser.add_argument( - "--model_name", - default="dpt-large", - type=str, - help="Name of the model, in case you're pushing to the hub.", - ) - parser.add_argument( - "--show_prediction", - action="store_true", - ) - - args = parser.parse_args() - convert_dpt_checkpoint( - args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name, args.show_prediction - ) diff --git a/src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py b/src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py deleted file mode 100644 index 0feebe72d474..000000000000 --- a/src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py +++ /dev/null @@ -1,321 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DPT 3.1 checkpoints from the MiDaS repository. URL: https://github.com/isl-org/MiDaS""" - -import argparse -from pathlib import Path - -import requests -import torch -from PIL import Image - -from transformers import DPTConfig, DPTForDepthEstimation, DPTImageProcessor, Swinv2Config -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(model_name): - if "tiny" in model_name: - embed_dim = 96 - depths = (2, 2, 6, 2) - num_heads = (3, 6, 12, 24) - window_size = 16 - # note: for Swinv2-tiny authors used the window_size = 16 variant - # as seen here: https://github.com/isl-org/MiDaS/blob/bdc4ed64c095e026dc0a2f17cabb14d58263decb/midas/backbones/swin2.py#L26 - pretrained_window_sizes = (0, 0, 0, 0) - elif "base" in model_name: - embed_dim = 128 - depths = (2, 2, 18, 2) - num_heads = (4, 8, 16, 32) - window_size = 24 - pretrained_window_sizes = (12, 12, 12, 6) - elif "large" in model_name: - embed_dim = 192 - depths = (2, 2, 18, 2) - num_heads = (6, 12, 24, 48) - window_size = 24 - pretrained_window_sizes = (12, 12, 12, 6) - - if "384" in model_name: - image_size = 384 - elif "256" in model_name: - image_size = 256 - else: - raise ValueError("Model not supported, to do") - - backbone_config = Swinv2Config( - image_size=image_size, - embed_dim=embed_dim, - depths=depths, - window_size=window_size, - pretrained_window_sizes=pretrained_window_sizes, - num_heads=num_heads, - out_features=["stage1", "stage2", "stage3", "stage4"], - ) - - if model_name == "dpt-swinv2-tiny-256": - neck_hidden_sizes = [96, 192, 384, 768] - elif model_name == "dpt-swinv2-base-384": - neck_hidden_sizes = [128, 256, 512, 1024] - elif model_name == "dpt-swinv2-large-384": - neck_hidden_sizes = [192, 384, 768, 1536] - - config = DPTConfig(backbone_config=backbone_config, neck_hidden_sizes=neck_hidden_sizes) - - return config, image_size - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - - # fmt: off - # stem - rename_keys.append(("pretrained.model.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("pretrained.model.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("pretrained.model.patch_embed.norm.weight", "backbone.embeddings.norm.weight")) - rename_keys.append(("pretrained.model.patch_embed.norm.bias", "backbone.embeddings.norm.bias")) - - # transformer encoder - for i in range(len(config.backbone_config.depths)): - for j in range(config.backbone_config.depths[i]): - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.logit_scale", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.logit_scale")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.cpb_mlp.0.weight", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.continuous_position_bias_mlp.0.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.cpb_mlp.0.bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.continuous_position_bias_mlp.0.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.cpb_mlp.2.weight", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.continuous_position_bias_mlp.2.weight")) - 
rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.q_bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.query.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.v_bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.value.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.proj.weight", f"backbone.encoder.layers.{i}.blocks.{j}.attention.output.dense.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.proj.bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.output.dense.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.norm1.weight", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_before.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.norm1.bias", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_before.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.mlp.fc1.weight", f"backbone.encoder.layers.{i}.blocks.{j}.intermediate.dense.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.mlp.fc1.bias", f"backbone.encoder.layers.{i}.blocks.{j}.intermediate.dense.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.mlp.fc2.weight", f"backbone.encoder.layers.{i}.blocks.{j}.output.dense.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.mlp.fc2.bias", f"backbone.encoder.layers.{i}.blocks.{j}.output.dense.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.norm2.weight", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_after.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.norm2.bias", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_after.bias")) - - # downsample parameters - if i in [0,1,2]: - rename_keys.append((f"pretrained.model.layers.{i}.downsample.reduction.weight", f"backbone.encoder.layers.{i}.downsample.reduction.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.downsample.norm.weight", f"backbone.encoder.layers.{i}.downsample.norm.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.downsample.norm.bias", f"backbone.encoder.layers.{i}.downsample.norm.bias")) - - # note: non-Transformer backbones like Swinv2, LeViT et al don't require activation postprocessing (readout projections + resize blocks) - - # refinenet (tricky here) - mapping = {1:3, 2:2, 3:1, 4:0} - - for i in range(1, 5): - j = mapping[i] - rename_keys.append((f"scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight")) - rename_keys.append((f"scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias")) - 
rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias")) - - # scratch convolutions - for i in range(4): - rename_keys.append((f"scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight")) - - # head - for i in range(0, 5, 2): - rename_keys.append((f"scratch.output_conv.{i}.weight", f"head.head.{i}.weight")) - rename_keys.append((f"scratch.output_conv.{i}.bias", f"head.head.{i}.bias")) - - return rename_keys - - -def remove_ignore_keys_(state_dict): - ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, model): - for i in range(len(config.backbone_config.depths)): - for j in range(config.backbone_config.depths[i]): - dim = model.backbone.encoder.layers[i].blocks[j].attention.self.all_head_size - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"pretrained.model.layers.{i}.blocks.{j}.attn.qkv.weight") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :] - state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[ - dim : dim * 2, : - ] - state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[ - -dim:, : - ] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, verify_logits, push_to_hub): - """ - Copy/paste/tweak model's weights to our DPT structure. 
- """ - - name_to_url = { - "dpt-swinv2-tiny-256": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt", - "dpt-swinv2-base-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_base_384.pt", - "dpt-swinv2-large-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt", - } - - # define DPT configuration based on URL - checkpoint_url = name_to_url[model_name] - config, image_size = get_dpt_config(model_name) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - - # load HuggingFace model - model = DPTForDepthEstimation(config) - - # remove certain keys - remove_ignore_keys_(state_dict) - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - # read in qkv matrices - read_in_q_k_v(state_dict, config, model) - - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - model.eval() - - # Check outputs on an image - processor = DPTImageProcessor(size={"height": image_size, "width": image_size}) - - image = prepare_img() - processor(image, return_tensors="pt") - - if verify_logits: - from torchvision import transforms - - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - transforms = transforms.Compose( - [ - transforms.Resize((image_size, image_size)), - transforms.ToTensor(), - ] - ) - pixel_values = transforms(image).unsqueeze(0) - - # forward pass - with torch.no_grad(): - outputs = model(pixel_values) - - predicted_depth = outputs.predicted_depth - - print("Shape of predicted depth:", predicted_depth.shape) - print("First values of predicted depth:", predicted_depth[0, :3, :3]) - - # assert logits - if model_name == "dpt-swinv2-base-384": - # OK, checked - expected_shape = torch.Size([1, 384, 384]) - expected_slice = torch.tensor( - [ - [1998.5575, 1997.3887, 2009.2981], - [1952.8607, 1979.6488, 2001.0854], - [1953.7697, 1961.7711, 1968.8904], - ], - ) - elif model_name == "dpt-swinv2-tiny-256": - # OK, checked - expected_shape = torch.Size([1, 256, 256]) - expected_slice = torch.tensor( - [[978.9163, 976.5215, 978.5349], [974.1859, 971.7249, 975.8046], [971.3419, 970.3118, 971.6830]], - ) - elif model_name == "dpt-swinv2-large-384": - # OK, checked - expected_shape = torch.Size([1, 384, 384]) - expected_slice = torch.tensor( - [ - [1203.7206, 1200.1495, 1197.8234], - [1196.2484, 1183.5033, 1186.4640], - [1178.8131, 1182.3260, 1174.3975], - ], - ) - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"Intel/{model_name}") - processor.push_to_hub(repo_id=f"Intel/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dpt-swinv2-base-384", - type=str, - choices=["dpt-swinv2-tiny-256", "dpt-swinv2-base-384", "dpt-swinv2-large-384"], - 
help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--verify_logits", - action="store_true", - help="Whether to verify logits after conversion.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.verify_logits, args.push_to_hub) diff --git a/src/transformers/models/dpt/convert_dpt_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_to_pytorch.py deleted file mode 100644 index 489da9acd19c..000000000000 --- a/src/transformers/models/dpt/convert_dpt_to_pytorch.py +++ /dev/null @@ -1,285 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DPT checkpoints from the original repository. URL: https://github.com/isl-org/DPT""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(checkpoint_url): - config = DPTConfig() - - if "large" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - config.backbone_out_indices = [5, 11, 17, 23] - config.neck_hidden_sizes = [256, 512, 1024, 1024] - expected_shape = (1, 384, 384) - - if "ade" in checkpoint_url: - config.use_batch_norm_in_fusion_residual = True - - config.num_labels = 150 - repo_id = "huggingface/label-files" - filename = "ade20k-id2label.json" - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - expected_shape = [1, 150, 480, 480] - - return config, expected_shape - - -def remove_ignore_keys_(state_dict): - ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(name): - if ( - "pretrained.model" in name - and "cls_token" not in name - and "pos_embed" not in name - and "patch_embed" not in name - ): - name = name.replace("pretrained.model", "dpt.encoder") - if "pretrained.model" in name: - name = name.replace("pretrained.model", "dpt.embeddings") - if "patch_embed" in name: - name = name.replace("patch_embed", "patch_embeddings") - if "pos_embed" in name: - name = name.replace("pos_embed", "position_embeddings") - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if 
"proj" in name and "project" not in name: - name = name.replace("proj", "projection") - if "blocks" in name: - name = name.replace("blocks", "layer") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") - if "norm1" in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name: - name = name.replace("norm2", "layernorm_after") - if "scratch.output_conv" in name: - name = name.replace("scratch.output_conv", "head") - if "scratch" in name: - name = name.replace("scratch", "neck") - if "layer1_rn" in name: - name = name.replace("layer1_rn", "convs.0") - if "layer2_rn" in name: - name = name.replace("layer2_rn", "convs.1") - if "layer3_rn" in name: - name = name.replace("layer3_rn", "convs.2") - if "layer4_rn" in name: - name = name.replace("layer4_rn", "convs.3") - if "refinenet" in name: - layer_idx = int(name[len("neck.refinenet") : len("neck.refinenet") + 1]) - # tricky here: we need to map 4 to 0, 3 to 1, 2 to 2 and 1 to 3 - name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx-4)}") - if "out_conv" in name: - name = name.replace("out_conv", "projection") - if "resConfUnit1" in name: - name = name.replace("resConfUnit1", "residual_layer1") - if "resConfUnit2" in name: - name = name.replace("resConfUnit2", "residual_layer2") - if "conv1" in name: - name = name.replace("conv1", "convolution1") - if "conv2" in name: - name = name.replace("conv2", "convolution2") - # readout blocks - if "pretrained.act_postprocess1.0.project.0" in name: - name = name.replace("pretrained.act_postprocess1.0.project.0", "neck.reassemble_stage.readout_projects.0.0") - if "pretrained.act_postprocess2.0.project.0" in name: - name = name.replace("pretrained.act_postprocess2.0.project.0", "neck.reassemble_stage.readout_projects.1.0") - if "pretrained.act_postprocess3.0.project.0" in name: - name = name.replace("pretrained.act_postprocess3.0.project.0", "neck.reassemble_stage.readout_projects.2.0") - if "pretrained.act_postprocess4.0.project.0" in name: - name = name.replace("pretrained.act_postprocess4.0.project.0", "neck.reassemble_stage.readout_projects.3.0") - # resize blocks - if "pretrained.act_postprocess1.3" in name: - name = name.replace("pretrained.act_postprocess1.3", "neck.reassemble_stage.layers.0.projection") - if "pretrained.act_postprocess1.4" in name: - name = name.replace("pretrained.act_postprocess1.4", "neck.reassemble_stage.layers.0.resize") - if "pretrained.act_postprocess2.3" in name: - name = name.replace("pretrained.act_postprocess2.3", "neck.reassemble_stage.layers.1.projection") - if "pretrained.act_postprocess2.4" in name: - name = name.replace("pretrained.act_postprocess2.4", "neck.reassemble_stage.layers.1.resize") - if "pretrained.act_postprocess3.3" in name: - name = name.replace("pretrained.act_postprocess3.3", "neck.reassemble_stage.layers.2.projection") - if "pretrained.act_postprocess4.3" in name: - name = name.replace("pretrained.act_postprocess4.3", "neck.reassemble_stage.layers.3.projection") - if "pretrained.act_postprocess4.4" in name: - name = name.replace("pretrained.act_postprocess4.4", "neck.reassemble_stage.layers.3.resize") - if "pretrained" in name: - name = name.replace("pretrained", "dpt") - if "bn" in name: - name = name.replace("bn", "batch_norm") - if "head" in name: - name = name.replace("head", "head.head") - if "encoder.norm" in name: - name = name.replace("encoder.norm", "layernorm") - if "auxlayer" in name: - name = 
name.replace("auxlayer", "auxiliary_head.head") - - return name - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub, model_name): - """ - Copy/paste/tweak model's weights to our DPT structure. - """ - - # define DPT configuration based on URL - config, expected_shape = get_dpt_config(checkpoint_url) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - # remove certain keys - remove_ignore_keys_(state_dict) - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # read in qkv matrices - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = DPTForSemanticSegmentation(config) if "ade" in checkpoint_url else DPTForDepthEstimation(config) - model.load_state_dict(state_dict) - model.eval() - - # Check outputs on an image - size = 480 if "ade" in checkpoint_url else 384 - image_processor = DPTImageProcessor(size=size) - - image = prepare_img() - encoding = image_processor(image, return_tensors="pt") - - # forward pass - outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth - - # Assert logits - expected_slice = torch.tensor([[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]]) - if "ade" in checkpoint_url: - expected_slice = torch.tensor([[4.0480, 4.2420, 4.4360], [4.3124, 4.5693, 4.8261], [4.5768, 4.8965, 5.2163]]) - assert outputs.shape == torch.Size(expected_shape) - assert ( - torch.allclose(outputs[0, 0, :3, :3], expected_slice, atol=1e-4) - if "ade" in checkpoint_url - else torch.allclose(outputs[0, :3, :3], expected_slice) - ) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model to hub...") - 
model.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add model", - use_temp_dir=True, - ) - image_processor.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add image processor", - use_temp_dir=True, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt", - type=str, - help="URL of the original DPT checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=False, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - ) - parser.add_argument( - "--model_name", - default="dpt-large", - type=str, - required=False, - help="Name of the model, in case you're pushing to the hub.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name) diff --git a/src/transformers/models/efficientnet/convert_efficientnet_to_pytorch.py b/src/transformers/models/efficientnet/convert_efficientnet_to_pytorch.py deleted file mode 100644 index e9988524aca0..000000000000 --- a/src/transformers/models/efficientnet/convert_efficientnet_to_pytorch.py +++ /dev/null @@ -1,339 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert EfficientNet checkpoints from the original repository. 
- -URL: https://github.com/keras-team/keras/blob/v2.11.0/keras/applications/efficientnet.py""" - -import argparse -import json -import os - -import numpy as np -import PIL -import requests -import tensorflow.keras.applications.efficientnet as efficientnet -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from tensorflow.keras.preprocessing import image - -from transformers import ( - EfficientNetConfig, - EfficientNetForImageClassification, - EfficientNetImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -model_classes = { - "b0": efficientnet.EfficientNetB0, - "b1": efficientnet.EfficientNetB1, - "b2": efficientnet.EfficientNetB2, - "b3": efficientnet.EfficientNetB3, - "b4": efficientnet.EfficientNetB4, - "b5": efficientnet.EfficientNetB5, - "b6": efficientnet.EfficientNetB6, - "b7": efficientnet.EfficientNetB7, -} - -CONFIG_MAP = { - "b0": { - "hidden_dim": 1280, - "width_coef": 1.0, - "depth_coef": 1.0, - "image_size": 224, - "dropout_rate": 0.2, - "dw_padding": [], - }, - "b1": { - "hidden_dim": 1280, - "width_coef": 1.0, - "depth_coef": 1.1, - "image_size": 240, - "dropout_rate": 0.2, - "dw_padding": [16], - }, - "b2": { - "hidden_dim": 1408, - "width_coef": 1.1, - "depth_coef": 1.2, - "image_size": 260, - "dropout_rate": 0.3, - "dw_padding": [5, 8, 16], - }, - "b3": { - "hidden_dim": 1536, - "width_coef": 1.2, - "depth_coef": 1.4, - "image_size": 300, - "dropout_rate": 0.3, - "dw_padding": [5, 18], - }, - "b4": { - "hidden_dim": 1792, - "width_coef": 1.4, - "depth_coef": 1.8, - "image_size": 380, - "dropout_rate": 0.4, - "dw_padding": [6], - }, - "b5": { - "hidden_dim": 2048, - "width_coef": 1.6, - "depth_coef": 2.2, - "image_size": 456, - "dropout_rate": 0.4, - "dw_padding": [13, 27], - }, - "b6": { - "hidden_dim": 2304, - "width_coef": 1.8, - "depth_coef": 2.6, - "image_size": 528, - "dropout_rate": 0.5, - "dw_padding": [31], - }, - "b7": { - "hidden_dim": 2560, - "width_coef": 2.0, - "depth_coef": 3.1, - "image_size": 600, - "dropout_rate": 0.5, - "dw_padding": [18], - }, -} - - -def get_efficientnet_config(model_name): - config = EfficientNetConfig() - config.hidden_dim = CONFIG_MAP[model_name]["hidden_dim"] - config.width_coefficient = CONFIG_MAP[model_name]["width_coef"] - config.depth_coefficient = CONFIG_MAP[model_name]["depth_coef"] - config.image_size = CONFIG_MAP[model_name]["image_size"] - config.dropout_rate = CONFIG_MAP[model_name]["dropout_rate"] - config.depthwise_padding = CONFIG_MAP[model_name]["dw_padding"] - - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - config.num_labels = 1000 - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - return config - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def convert_image_processor(model_name): - size = CONFIG_MAP[model_name]["image_size"] - preprocessor = EfficientNetImageProcessor( - size={"height": size, "width": size}, - image_mean=[0.485, 0.456, 0.406], - image_std=[0.47853944, 0.4732864, 0.47434163], - do_center_crop=False, - ) - return preprocessor - - -# here we list all keys to be renamed (original name on the left, our name on 
the right) -def rename_keys(original_param_names): - block_names = [v.split("_")[0].split("block")[1] for v in original_param_names if v.startswith("block")] - block_names = sorted(set(block_names)) - num_blocks = len(block_names) - block_name_mapping = {b: str(i) for b, i in zip(block_names, range(num_blocks))} - - rename_keys = [] - rename_keys.append(("stem_conv/kernel:0", "embeddings.convolution.weight")) - rename_keys.append(("stem_bn/gamma:0", "embeddings.batchnorm.weight")) - rename_keys.append(("stem_bn/beta:0", "embeddings.batchnorm.bias")) - rename_keys.append(("stem_bn/moving_mean:0", "embeddings.batchnorm.running_mean")) - rename_keys.append(("stem_bn/moving_variance:0", "embeddings.batchnorm.running_var")) - - for b in block_names: - hf_b = block_name_mapping[b] - rename_keys.append((f"block{b}_expand_conv/kernel:0", f"encoder.blocks.{hf_b}.expansion.expand_conv.weight")) - rename_keys.append((f"block{b}_expand_bn/gamma:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.weight")) - rename_keys.append((f"block{b}_expand_bn/beta:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.bias")) - rename_keys.append( - (f"block{b}_expand_bn/moving_mean:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_mean") - ) - rename_keys.append( - (f"block{b}_expand_bn/moving_variance:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_var") - ) - rename_keys.append( - (f"block{b}_dwconv/depthwise_kernel:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_conv.weight") - ) - rename_keys.append((f"block{b}_bn/gamma:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.weight")) - rename_keys.append((f"block{b}_bn/beta:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.bias")) - rename_keys.append( - (f"block{b}_bn/moving_mean:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_mean") - ) - rename_keys.append( - (f"block{b}_bn/moving_variance:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_var") - ) - - rename_keys.append((f"block{b}_se_reduce/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.weight")) - rename_keys.append((f"block{b}_se_reduce/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.bias")) - rename_keys.append((f"block{b}_se_expand/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.weight")) - rename_keys.append((f"block{b}_se_expand/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.bias")) - rename_keys.append( - (f"block{b}_project_conv/kernel:0", f"encoder.blocks.{hf_b}.projection.project_conv.weight") - ) - rename_keys.append((f"block{b}_project_bn/gamma:0", f"encoder.blocks.{hf_b}.projection.project_bn.weight")) - rename_keys.append((f"block{b}_project_bn/beta:0", f"encoder.blocks.{hf_b}.projection.project_bn.bias")) - rename_keys.append( - (f"block{b}_project_bn/moving_mean:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_mean") - ) - rename_keys.append( - (f"block{b}_project_bn/moving_variance:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_var") - ) - - rename_keys.append(("top_conv/kernel:0", "encoder.top_conv.weight")) - rename_keys.append(("top_bn/gamma:0", "encoder.top_bn.weight")) - rename_keys.append(("top_bn/beta:0", "encoder.top_bn.bias")) - rename_keys.append(("top_bn/moving_mean:0", "encoder.top_bn.running_mean")) - rename_keys.append(("top_bn/moving_variance:0", "encoder.top_bn.running_var")) - - key_mapping = {} - for item in rename_keys: - if item[0] in original_param_names: - key_mapping[item[0]] = "efficientnet." 
+ item[1] - - key_mapping["predictions/kernel:0"] = "classifier.weight" - key_mapping["predictions/bias:0"] = "classifier.bias" - return key_mapping - - -def replace_params(hf_params, tf_params, key_mapping): - for key, value in tf_params.items(): - if "normalization" in key: - continue - - hf_key = key_mapping[key] - if "_conv" in key and "kernel" in key: - new_hf_value = torch.from_numpy(value).permute(3, 2, 0, 1) - elif "depthwise_kernel" in key: - new_hf_value = torch.from_numpy(value).permute(2, 3, 0, 1) - elif "kernel" in key: - new_hf_value = torch.from_numpy(np.transpose(value)) - else: - new_hf_value = torch.from_numpy(value) - - # Replace HF parameters with original TF model parameters - assert hf_params[hf_key].shape == new_hf_value.shape - hf_params[hf_key].copy_(new_hf_value) - - -@torch.no_grad() -def convert_efficientnet_checkpoint(model_name, pytorch_dump_folder_path, save_model, push_to_hub): - """ - Copy/paste/tweak model's weights to our EfficientNet structure. - """ - # Load original model - original_model = model_classes[model_name]( - include_top=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax", - ) - - tf_params = original_model.trainable_variables - tf_non_train_params = original_model.non_trainable_variables - tf_params = {param.name: param.numpy() for param in tf_params} - for param in tf_non_train_params: - tf_params[param.name] = param.numpy() - tf_param_names = list(tf_params.keys()) - - # Load HuggingFace model - config = get_efficientnet_config(model_name) - hf_model = EfficientNetForImageClassification(config).eval() - hf_params = hf_model.state_dict() - - # Create src-to-dst parameter name mapping dictionary - print("Converting parameters...") - key_mapping = rename_keys(tf_param_names) - replace_params(hf_params, tf_params, key_mapping) - - # Initialize preprocessor and preprocess input image - preprocessor = convert_image_processor(model_name) - inputs = preprocessor(images=prepare_img(), return_tensors="pt") - - # HF model inference - hf_model.eval() - with torch.no_grad(): - outputs = hf_model(**inputs) - hf_logits = outputs.logits.detach().numpy() - - # Original model inference - original_model.trainable = False - image_size = CONFIG_MAP[model_name]["image_size"] - img = prepare_img().resize((image_size, image_size), resample=PIL.Image.NEAREST) - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - original_logits = original_model.predict(x) - - # Check whether original and HF model outputs match -> np.allclose - assert np.allclose(original_logits, hf_logits, atol=1e-3), "The predicted logits are not the same." 
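The transpositions in replace_params above follow from the different kernel layouts: Keras stores Conv2D kernels as (height, width, in_channels, out_channels) and depthwise kernels as (height, width, channels, depth_multiplier), while PyTorch convolutions expect (out_channels, in_channels, height, width). A small standalone check of the regular convolution case, using made-up shapes purely for illustration:

import numpy as np
import torch

# hypothetical 3x3 kernel with 4 input and 8 output channels, in the Keras HWIO layout
tf_kernel = np.random.randn(3, 3, 4, 8).astype(np.float32)

# same permutation replace_params applies to "_conv"/"kernel" entries: HWIO -> OIHW
pt_kernel = torch.from_numpy(tf_kernel).permute(3, 2, 0, 1)

assert pt_kernel.shape == (8, 4, 3, 3)
# each (output channel, input channel) pair carries the same 3x3 filter in both layouts
assert torch.allclose(pt_kernel[5, 2], torch.from_numpy(tf_kernel[:, :, 2, 5]))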
- print("Model outputs match!") - - if save_model: - # Create folder to save model - if not os.path.isdir(pytorch_dump_folder_path): - os.mkdir(pytorch_dump_folder_path) - # Save converted model and image processor - hf_model.save_pretrained(pytorch_dump_folder_path) - preprocessor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - # Push model and image processor to hub - print(f"Pushing converted {model_name} to the hub...") - model_name = f"efficientnet-{model_name}" - preprocessor.push_to_hub(model_name) - hf_model.push_to_hub(model_name) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="b0", - type=str, - help="Version name of the EfficientNet model you want to convert, select from [b0, b1, b2, b3, b4, b5, b6, b7].", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="hf_model", - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--save_model", action="store_true", help="Save model to local") - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub") - - args = parser.parse_args() - convert_efficientnet_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub) diff --git a/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index b0abc30cd758..000000000000 --- a/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,79 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert ELECTRA checkpoint.""" - -import argparse - -import torch - -from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator): - # Initialise PyTorch model - config = ElectraConfig.from_json_file(config_file) - print(f"Building PyTorch model from configuration: {config}") - - if discriminator_or_generator == "discriminator": - model = ElectraForPreTraining(config) - elif discriminator_or_generator == "generator": - model = ElectraForMaskedLM(config) - else: - raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'") - - # Load weights from tf checkpoint - load_tf_weights_in_electra( - model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator - ) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help="The config json file corresponding to the pre-trained model. \nThis specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--discriminator_or_generator", - default=None, - type=str, - required=True, - help=( - "Whether to export the generator or the discriminator. Should be a string, either 'discriminator' or " - "'generator'." - ), - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch( - args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.discriminator_or_generator - ) diff --git a/src/transformers/models/encodec/convert_encodec_checkpoint_to_pytorch.py b/src/transformers/models/encodec/convert_encodec_checkpoint_to_pytorch.py deleted file mode 100644 index 4db97bd68836..000000000000 --- a/src/transformers/models/encodec/convert_encodec_checkpoint_to_pytorch.py +++ /dev/null @@ -1,365 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert EnCodec checkpoints.""" - -import argparse - -import torch - -from transformers import ( - EncodecConfig, - EncodecFeatureExtractor, - EncodecModel, - logging, -) - - -# checkpoints downloaded from: -# https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th -# https://huggingface.co/facebook/musicgen-small/resolve/main/compression_state_dict.bin -# https://dl.fbaipublicfiles.com/encodec/v0/encodec_48khz-7e698e3e.th - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.encodec") - -MAPPING_QUANTIZER = { - "quantizer.vq.layers.*._codebook.inited": "quantizer.layers.*.codebook.inited", - "quantizer.vq.layers.*._codebook.cluster_size": "quantizer.layers.*.codebook.cluster_size", - "quantizer.vq.layers.*._codebook.embed": "quantizer.layers.*.codebook.embed", - "quantizer.vq.layers.*._codebook.embed_avg": "quantizer.layers.*.codebook.embed_avg", -} -MAPPING_ENCODER = { - "encoder.model.0.conv.conv": "encoder.layers.0.conv", - "encoder.model.1.block.1.conv.conv": "encoder.layers.1.block.1.conv", - "encoder.model.1.block.3.conv.conv": "encoder.layers.1.block.3.conv", - "encoder.model.1.shortcut.conv.conv": "encoder.layers.1.shortcut.conv", - "encoder.model.3.conv.conv": "encoder.layers.3.conv", - "encoder.model.4.block.1.conv.conv": "encoder.layers.4.block.1.conv", - "encoder.model.4.block.3.conv.conv": "encoder.layers.4.block.3.conv", - "encoder.model.4.shortcut.conv.conv": "encoder.layers.4.shortcut.conv", - "encoder.model.6.conv.conv": "encoder.layers.6.conv", - "encoder.model.7.block.1.conv.conv": "encoder.layers.7.block.1.conv", - "encoder.model.7.block.3.conv.conv": "encoder.layers.7.block.3.conv", - "encoder.model.7.shortcut.conv.conv": "encoder.layers.7.shortcut.conv", - "encoder.model.9.conv.conv": "encoder.layers.9.conv", - "encoder.model.10.block.1.conv.conv": "encoder.layers.10.block.1.conv", - "encoder.model.10.block.3.conv.conv": "encoder.layers.10.block.3.conv", - "encoder.model.10.shortcut.conv.conv": "encoder.layers.10.shortcut.conv", - "encoder.model.12.conv.conv": "encoder.layers.12.conv", - "encoder.model.13.lstm": "encoder.layers.13.lstm", - "encoder.model.15.conv.conv": "encoder.layers.15.conv", -} -MAPPING_ENCODER_48K = { - "encoder.model.0.conv.norm": "encoder.layers.0.norm", - "encoder.model.1.block.1.conv.norm": "encoder.layers.1.block.1.norm", - "encoder.model.1.block.3.conv.norm": "encoder.layers.1.block.3.norm", - "encoder.model.1.shortcut.conv.norm": "encoder.layers.1.shortcut.norm", - "encoder.model.3.conv.norm": "encoder.layers.3.norm", - "encoder.model.4.block.1.conv.norm": "encoder.layers.4.block.1.norm", - "encoder.model.4.block.3.conv.norm": "encoder.layers.4.block.3.norm", - "encoder.model.4.shortcut.conv.norm": "encoder.layers.4.shortcut.norm", - "encoder.model.6.conv.norm": "encoder.layers.6.norm", - "encoder.model.7.block.1.conv.norm": "encoder.layers.7.block.1.norm", - "encoder.model.7.block.3.conv.norm": "encoder.layers.7.block.3.norm", - "encoder.model.7.shortcut.conv.norm": "encoder.layers.7.shortcut.norm", - "encoder.model.9.conv.norm": "encoder.layers.9.norm", - "encoder.model.10.block.1.conv.norm": "encoder.layers.10.block.1.norm", - "encoder.model.10.block.3.conv.norm": "encoder.layers.10.block.3.norm", - "encoder.model.10.shortcut.conv.norm": "encoder.layers.10.shortcut.norm", - "encoder.model.12.conv.norm": "encoder.layers.12.norm", - "encoder.model.15.conv.norm": "encoder.layers.15.norm", -} -MAPPING_DECODER = { - "decoder.model.0.conv.conv": "decoder.layers.0.conv", - "decoder.model.1.lstm": 
"decoder.layers.1.lstm", - "decoder.model.3.convtr.convtr": "decoder.layers.3.conv", - "decoder.model.4.block.1.conv.conv": "decoder.layers.4.block.1.conv", - "decoder.model.4.block.3.conv.conv": "decoder.layers.4.block.3.conv", - "decoder.model.4.shortcut.conv.conv": "decoder.layers.4.shortcut.conv", - "decoder.model.6.convtr.convtr": "decoder.layers.6.conv", - "decoder.model.7.block.1.conv.conv": "decoder.layers.7.block.1.conv", - "decoder.model.7.block.3.conv.conv": "decoder.layers.7.block.3.conv", - "decoder.model.7.shortcut.conv.conv": "decoder.layers.7.shortcut.conv", - "decoder.model.9.convtr.convtr": "decoder.layers.9.conv", - "decoder.model.10.block.1.conv.conv": "decoder.layers.10.block.1.conv", - "decoder.model.10.block.3.conv.conv": "decoder.layers.10.block.3.conv", - "decoder.model.10.shortcut.conv.conv": "decoder.layers.10.shortcut.conv", - "decoder.model.12.convtr.convtr": "decoder.layers.12.conv", - "decoder.model.13.block.1.conv.conv": "decoder.layers.13.block.1.conv", - "decoder.model.13.block.3.conv.conv": "decoder.layers.13.block.3.conv", - "decoder.model.13.shortcut.conv.conv": "decoder.layers.13.shortcut.conv", - "decoder.model.15.conv.conv": "decoder.layers.15.conv", -} -MAPPING_DECODER_48K = { - "decoder.model.0.conv.norm": "decoder.layers.0.norm", - "decoder.model.3.convtr.norm": "decoder.layers.3.norm", - "decoder.model.4.block.1.conv.norm": "decoder.layers.4.block.1.norm", - "decoder.model.4.block.3.conv.norm": "decoder.layers.4.block.3.norm", - "decoder.model.4.shortcut.conv.norm": "decoder.layers.4.shortcut.norm", - "decoder.model.6.convtr.norm": "decoder.layers.6.norm", - "decoder.model.7.block.1.conv.norm": "decoder.layers.7.block.1.norm", - "decoder.model.7.block.3.conv.norm": "decoder.layers.7.block.3.norm", - "decoder.model.7.shortcut.conv.norm": "decoder.layers.7.shortcut.norm", - "decoder.model.9.convtr.norm": "decoder.layers.9.norm", - "decoder.model.10.block.1.conv.norm": "decoder.layers.10.block.1.norm", - "decoder.model.10.block.3.conv.norm": "decoder.layers.10.block.3.norm", - "decoder.model.10.shortcut.conv.norm": "decoder.layers.10.shortcut.norm", - "decoder.model.12.convtr.norm": "decoder.layers.12.norm", - "decoder.model.13.block.1.conv.norm": "decoder.layers.13.block.1.norm", - "decoder.model.13.block.3.conv.norm": "decoder.layers.13.block.3.norm", - "decoder.model.13.shortcut.conv.norm": "decoder.layers.13.shortcut.norm", - "decoder.model.15.conv.norm": "decoder.layers.15.norm", -} -MAPPING_24K = { - **MAPPING_QUANTIZER, - **MAPPING_ENCODER, - **MAPPING_DECODER, -} -MAPPING_48K = { - **MAPPING_QUANTIZER, - **MAPPING_ENCODER, - **MAPPING_ENCODER_48K, - **MAPPING_DECODER, - **MAPPING_DECODER_48K, -} -TOP_LEVEL_KEYS = [] -IGNORE_KEYS = [] - - -def set_recursively(hf_pointer, key, value, full_name, weight_type): - for attribute in key.split("."): - hf_pointer = getattr(hf_pointer, attribute) - - if weight_type is not None: - hf_shape = getattr(hf_pointer, weight_type).shape - else: - hf_shape = hf_pointer.shape - - if hf_shape != value.shape: - raise ValueError( - f"Shape of hf {key + '.' 
+ weight_type if weight_type is not None else ''} is {hf_shape}, but should be" - f" {value.shape} for {full_name}" - ) - - if weight_type == "weight": - hf_pointer.weight.data = value - elif weight_type == "weight_g": - hf_pointer.weight_g.data = value - elif weight_type == "weight_v": - hf_pointer.weight_v.data = value - elif weight_type == "bias": - hf_pointer.bias.data = value - elif weight_type == "running_mean": - hf_pointer.running_mean.data = value - elif weight_type == "running_var": - hf_pointer.running_var.data = value - elif weight_type == "num_batches_tracked": - hf_pointer.num_batches_tracked.data = value - elif weight_type == "weight_ih_l0": - hf_pointer.weight_ih_l0.data = value - elif weight_type == "weight_hh_l0": - hf_pointer.weight_hh_l0.data = value - elif weight_type == "bias_ih_l0": - hf_pointer.bias_ih_l0.data = value - elif weight_type == "bias_hh_l0": - hf_pointer.bias_hh_l0.data = value - elif weight_type == "weight_ih_l1": - hf_pointer.weight_ih_l1.data = value - elif weight_type == "weight_hh_l1": - hf_pointer.weight_hh_l1.data = value - elif weight_type == "bias_ih_l1": - hf_pointer.bias_ih_l1.data = value - elif weight_type == "bias_hh_l1": - hf_pointer.bias_hh_l1.data = value - else: - hf_pointer.data = value - - logger.info(f"{key + ('.' + weight_type if weight_type is not None else '')} was initialized from {full_name}.") - - -def should_ignore(name, ignore_keys): - for key in ignore_keys: - if key.endswith(".*"): - if name.startswith(key[:-1]): - return True - elif ".*." in key: - prefix, suffix = key.split(".*.") - if prefix in name and suffix in name: - return True - elif key in name: - return True - return False - - -def recursively_load_weights(orig_dict, hf_model, model_name): - unused_weights = [] - - if model_name in ["encodec_24khz", "encodec_32khz"]: - MAPPING = MAPPING_24K - elif model_name == "encodec_48khz": - MAPPING = MAPPING_48K - else: - raise ValueError(f"Unsupported model: {model_name}") - - for name, value in orig_dict.items(): - if should_ignore(name, IGNORE_KEYS): - logger.info(f"{name} was ignored") - continue - - is_used = False - for key, mapped_key in MAPPING.items(): - if "*" in key: - prefix, suffix = key.split(".*.") - if prefix in name and suffix in name: - key = suffix - - if key in name: - # HACK otherwise .embed gets initialized with .embed_avg too - if key.endswith("embed") and name.endswith("embed_avg"): - continue - - is_used = True - if "*" in mapped_key: - layer_index = name.split(key)[0].split(".")[-2] - mapped_key = mapped_key.replace("*", layer_index) - if "weight_g" in name: - weight_type = "weight_g" - elif "weight_v" in name: - weight_type = "weight_v" - elif "weight_ih_l0" in name: - weight_type = "weight_ih_l0" - elif "weight_hh_l0" in name: - weight_type = "weight_hh_l0" - elif "bias_ih_l0" in name: - weight_type = "bias_ih_l0" - elif "bias_hh_l0" in name: - weight_type = "bias_hh_l0" - elif "weight_ih_l1" in name: - weight_type = "weight_ih_l1" - elif "weight_hh_l1" in name: - weight_type = "weight_hh_l1" - elif "bias_ih_l1" in name: - weight_type = "bias_ih_l1" - elif "bias_hh_l1" in name: - weight_type = "bias_hh_l1" - elif "bias" in name: - weight_type = "bias" - elif "weight" in name: - weight_type = "weight" - elif "running_mean" in name: - weight_type = "running_mean" - elif "running_var" in name: - weight_type = "running_var" - elif "num_batches_tracked" in name: - weight_type = "num_batches_tracked" - else: - weight_type = None - set_recursively(hf_model, mapped_key, value, name, weight_type) - 
continue - if not is_used: - unused_weights.append(name) - - logger.warning(f"Unused weights: {unused_weights}") - - -@torch.no_grad() -def convert_checkpoint( - model_name, - checkpoint_path, - pytorch_dump_folder_path, - config_path=None, - repo_id=None, -): - """ - Copy/paste/tweak model's weights to transformers design. - """ - if config_path is not None: - config = EncodecConfig.from_pretrained(config_path) - else: - config = EncodecConfig() - - if model_name == "encodec_24khz": - pass # config is already correct - elif model_name == "encodec_32khz": - config.upsampling_ratios = [8, 5, 4, 4] - config.target_bandwidths = [2.2] - config.num_filters = 64 - config.sampling_rate = 32_000 - config.codebook_size = 2048 - config.use_causal_conv = False - config.normalize = False - config.use_conv_shortcut = False - elif model_name == "encodec_48khz": - config.upsampling_ratios = [8, 5, 4, 2] - config.target_bandwidths = [3.0, 6.0, 12.0, 24.0] - config.sampling_rate = 48_000 - config.audio_channels = 2 - config.use_causal_conv = False - config.norm_type = "time_group_norm" - config.normalize = True - config.chunk_length_s = 1.0 - config.overlap = 0.01 - else: - raise ValueError(f"Unknown model name: {model_name}") - - model = EncodecModel(config) - - feature_extractor = EncodecFeatureExtractor( - feature_size=config.audio_channels, - sampling_rate=config.sampling_rate, - chunk_length_s=config.chunk_length_s, - overlap=config.overlap, - ) - feature_extractor.save_pretrained(pytorch_dump_folder_path) - - original_checkpoint = torch.load(checkpoint_path) - if "best_state" in original_checkpoint: - # we might have a training state saved, in which case discard the yaml results and just retain the weights - original_checkpoint = original_checkpoint["best_state"] - recursively_load_weights(original_checkpoint, model, model_name) - model.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") - feature_extractor.push_to_hub(repo_id) - model.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model", - default="encodec_24khz", - type=str, - help="The model to convert. Should be one of 'encodec_24khz', 'encodec_32khz', 'encodec_48khz'.", - ) - parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - parser.add_argument( - "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the đŸ€— hub." - ) - - args = parser.parse_args() - convert_checkpoint( - args.model, - args.checkpoint_path, - args.pytorch_dump_folder_path, - args.config_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/esm/convert_esm.py b/src/transformers/models/esm/convert_esm.py deleted file mode 100644 index 020dd4e57663..000000000000 --- a/src/transformers/models/esm/convert_esm.py +++ /dev/null @@ -1,399 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ESM checkpoint.""" - -import argparse -import pathlib -from pathlib import Path -from tempfile import TemporaryDirectory - -import esm as esm_module -import torch -from esm.esmfold.v1.misc import batch_encode_sequences as esmfold_encode_sequences -from esm.esmfold.v1.pretrained import esmfold_v1 - -from transformers.models.esm.configuration_esm import EsmConfig, EsmFoldConfig -from transformers.models.esm.modeling_esm import ( - EsmForMaskedLM, - EsmForSequenceClassification, - EsmIntermediate, - EsmLayer, - EsmOutput, - EsmSelfAttention, - EsmSelfOutput, -) -from transformers.models.esm.modeling_esmfold import EsmForProteinFolding -from transformers.models.esm.tokenization_esm import EsmTokenizer -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_DATA = [ - ( - "protein1", - "MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA", - ), - ("protein2", "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLA"), - ("protein3", "MKTVRQERLKSIRILERSKEPVSGAQLAEELSSRQVIVQDIAYLRSLGYNVATPRGYVLAGG"), - ("protein4", "MKTVRQERLKSIRILERSKEPVSGAQLAEELSSRQVIVQDIAYLRSLGYNVATPRGYVLA"), -] - -MODEL_MAPPING = { - "esm1b_t33_650M_UR50S": esm_module.pretrained.esm1b_t33_650M_UR50S, - "esm1v_t33_650M_UR90S_1": esm_module.pretrained.esm1v_t33_650M_UR90S_1, - "esm1v_t33_650M_UR90S_2": esm_module.pretrained.esm1v_t33_650M_UR90S_2, - "esm1v_t33_650M_UR90S_3": esm_module.pretrained.esm1v_t33_650M_UR90S_3, - "esm1v_t33_650M_UR90S_4": esm_module.pretrained.esm1v_t33_650M_UR90S_4, - "esm1v_t33_650M_UR90S_5": esm_module.pretrained.esm1v_t33_650M_UR90S_5, - "esm2_t48_15B_UR50D": esm_module.pretrained.esm2_t48_15B_UR50D, - "esm2_t36_3B_UR50D": esm_module.pretrained.esm2_t36_3B_UR50D, - "esm2_t33_650M_UR50D": esm_module.pretrained.esm2_t33_650M_UR50D, - "esm2_t30_150M_UR50D": esm_module.pretrained.esm2_t30_150M_UR50D, - "esm2_t12_35M_UR50D": esm_module.pretrained.esm2_t12_35M_UR50D, - "esm2_t6_8M_UR50D": esm_module.pretrained.esm2_t6_8M_UR50D, - "esmfold_v1": esmfold_v1, -} - -restypes = list("ARNDCQEGHILKMFPSTWYV") - -restypes_with_x = restypes + ["X"] -restypes_with_extras = restypes_with_x + ["", "", "", "", ""] - - -def get_esmfold_tokenizer(): - with TemporaryDirectory() as tempdir: - vocab = "\n".join(restypes_with_extras) - vocab_file = Path(tempdir) / "vocab.txt" - vocab_file.write_text(vocab) - hf_tokenizer = EsmTokenizer(vocab_file=str(vocab_file)) - hf_tokenizer.pad_token_id = 0 # Overlaps with 'A' but that seems to be what they want - return hf_tokenizer - - -def transfer_and_check_weights(original_module, our_module): - status = our_module.load_state_dict(original_module.state_dict()) - if status.missing_keys: - raise ValueError(f"Missing keys: {status.missing_keys}") - if status.unexpected_keys: - raise 
ValueError(f"Unexpected keys: {status.unexpected_keys}") - - -def convert_esm_checkpoint_to_pytorch( - model: str, pytorch_dump_folder_path: str, classification_head: bool, push_to_repo: str, auth_token: str -): - """ - Copy/paste/tweak esm's weights to our BERT structure. - """ - if model.startswith("esmfold"): - esm = MODEL_MAPPING[model]() - else: - esm, alphabet = MODEL_MAPPING[model]() - esm.eval() # disable dropout - - if model.startswith("esmfold"): - embed_dim = esm.esm.embed_dim - num_layers = esm.esm.num_layers - num_attention_heads = esm.esm.attention_heads - intermediate_size = 4 * embed_dim - token_dropout = esm.esm.token_dropout - emb_layer_norm_before = False # This code path does not exist in ESM-2 - position_embedding_type = "rotary" - is_folding_model = True - esmfold_config = EsmFoldConfig() - for key, val in esm.cfg.items(): - if hasattr(esmfold_config, key) and key != "trunk": - setattr(esmfold_config, key, val) - for key, val in esm.cfg.trunk.items(): - if hasattr(esmfold_config.trunk, key) and key != "structure_module": - setattr(esmfold_config.trunk, key, val) - for key, val in esm.cfg.trunk.structure_module.items(): - if hasattr(esmfold_config.trunk.structure_module, key): - setattr(esmfold_config.trunk.structure_module, key, val) - elif hasattr(esm, "args"): - # Indicates an ESM-1b or ESM-1v model - embed_dim = esm.args.embed_dim - num_layers = esm.args.layers - num_attention_heads = esm.args.attention_heads - intermediate_size = esm.args.ffn_embed_dim - token_dropout = esm.args.token_dropout - emb_layer_norm_before = True if esm.emb_layer_norm_before else False - position_embedding_type = "absolute" - is_folding_model = False - esmfold_config = None - else: - # Indicates an ESM-2 model - embed_dim = esm.embed_dim - num_layers = esm.num_layers - num_attention_heads = esm.attention_heads - intermediate_size = 4 * embed_dim # This is hardcoded in ESM-2 - token_dropout = esm.token_dropout - emb_layer_norm_before = False # This code path does not exist in ESM-2 - position_embedding_type = "rotary" - is_folding_model = False - esmfold_config = None - - if is_folding_model: - alphabet = esm.esm.alphabet - vocab_list = tuple(alphabet.all_toks) - mask_token_id = alphabet.mask_idx - pad_token_id = alphabet.padding_idx - - if is_folding_model: - original_esm_model = esm.esm - else: - original_esm_model = esm - - config = EsmConfig( - vocab_size=original_esm_model.embed_tokens.num_embeddings, - mask_token_id=mask_token_id, - hidden_size=embed_dim, - num_hidden_layers=num_layers, - num_attention_heads=num_attention_heads, - intermediate_size=intermediate_size, - max_position_embeddings=1026, - layer_norm_eps=1e-5, # PyTorch default used in fairseq - attention_probs_dropout_prob=0.0, - hidden_dropout_prob=0.0, - pad_token_id=pad_token_id, - emb_layer_norm_before=emb_layer_norm_before, - token_dropout=token_dropout, - position_embedding_type=position_embedding_type, - is_folding_model=is_folding_model, - esmfold_config=esmfold_config, - vocab_list=vocab_list, - ) - if classification_head: - config.num_labels = esm.classification_heads["mnli"].out_proj.weight.shape[0] - print("Our ESM config:", config) - - if model.startswith("esmfold"): - model_class = EsmForProteinFolding - elif classification_head: - model_class = EsmForSequenceClassification - else: - model_class = EsmForMaskedLM - model = model_class(config) - model.eval() - - # Now let's copy all the weights. 
- # Embeddings - model.esm.embeddings.word_embeddings.weight = original_esm_model.embed_tokens.weight - if position_embedding_type == "absolute": - model.esm.embeddings.position_embeddings.weight = original_esm_model.embed_positions.weight - - if config.emb_layer_norm_before: - model.esm.embeddings.layer_norm.weight = original_esm_model.emb_layer_norm_before.weight - model.esm.embeddings.layer_norm.bias = original_esm_model.emb_layer_norm_before.bias - - model.esm.encoder.emb_layer_norm_after.weight = original_esm_model.emb_layer_norm_after.weight - model.esm.encoder.emb_layer_norm_after.bias = original_esm_model.emb_layer_norm_after.bias - - for i in range(config.num_hidden_layers): - # Encoder: start of layer - layer: EsmLayer = model.esm.encoder.layer[i] - # esm_layer: TransformerSentenceEncoderLayer = original_esm_model.layers[i] - esm_layer = original_esm_model.layers[i] - - # self attention - self_attn: EsmSelfAttention = layer.attention.self - assert ( - esm_layer.self_attn.k_proj.weight.data.shape - == esm_layer.self_attn.q_proj.weight.data.shape - == esm_layer.self_attn.v_proj.weight.data.shape - == torch.Size((config.hidden_size, config.hidden_size)) - ) - - self_attn.query.weight.data = esm_layer.self_attn.q_proj.weight - self_attn.query.bias.data = esm_layer.self_attn.q_proj.bias - self_attn.key.weight.data = esm_layer.self_attn.k_proj.weight - self_attn.key.bias.data = esm_layer.self_attn.k_proj.bias - self_attn.value.weight.data = esm_layer.self_attn.v_proj.weight - self_attn.value.bias.data = esm_layer.self_attn.v_proj.bias - - if getattr(esm_layer.self_attn, "rot_emb", None) is not None: - # Matt: Although inv_freq is not a trainable weight, it is computed at model init and cached. - # During the training of ESM-2 the model was converted to float16 precision, which also converts - # the inv_freq tensor, and the loss of precision remains even if the model is loaded later as float32. - # If we recompute inv_freq without this loss of precision then we will get subtly different rotary - # embeddings, which are enough to cause significant discrepancies in model outputs. To avoid this, - # we make sure the new model copies the data from the old inv_freq. 
- self_attn.rotary_embeddings.inv_freq.data = esm_layer.self_attn.rot_emb.inv_freq - - # LayerNorm changes for pre-activation - layer.attention.LayerNorm.weight = esm_layer.self_attn_layer_norm.weight - layer.attention.LayerNorm.bias = esm_layer.self_attn_layer_norm.bias - layer.LayerNorm.weight = esm_layer.final_layer_norm.weight - layer.LayerNorm.bias = esm_layer.final_layer_norm.bias - - # self-attention output - self_output: EsmSelfOutput = layer.attention.output - assert self_output.dense.weight.shape == esm_layer.self_attn.out_proj.weight.shape - self_output.dense.weight = esm_layer.self_attn.out_proj.weight - self_output.dense.bias = esm_layer.self_attn.out_proj.bias - - # intermediate - intermediate: EsmIntermediate = layer.intermediate - assert intermediate.dense.weight.shape == esm_layer.fc1.weight.shape - intermediate.dense.weight = esm_layer.fc1.weight - intermediate.dense.bias = esm_layer.fc1.bias - - # output - bert_output: EsmOutput = layer.output - assert bert_output.dense.weight.shape == esm_layer.fc2.weight.shape - bert_output.dense.weight = esm_layer.fc2.weight - bert_output.dense.bias = esm_layer.fc2.bias - # end of layer - - if is_folding_model: - model.esm_s_combine.data = esm.esm_s_combine.data - model.af2_to_esm.data = esm.af2_to_esm.data - transfer_and_check_weights(esm.embedding, model.embedding) - transfer_and_check_weights(esm.esm_s_mlp, model.esm_s_mlp) - transfer_and_check_weights(esm.trunk, model.trunk) - transfer_and_check_weights(esm.distogram_head, model.distogram_head) - transfer_and_check_weights(esm.ptm_head, model.ptm_head) - transfer_and_check_weights(esm.lm_head, model.lm_head) - transfer_and_check_weights(esm.lddt_head, model.lddt_head) - - elif classification_head: - model.classifier.dense.weight = esm.esm.classification_heads["mnli"].dense.weight - model.classifier.dense.bias = esm.classification_heads["mnli"].dense.bias - model.classifier.out_proj.weight = esm.classification_heads["mnli"].out_proj.weight - model.classifier.out_proj.bias = esm.classification_heads["mnli"].out_proj.bias - else: - # LM Head - model.lm_head.dense.weight = esm.lm_head.dense.weight - model.lm_head.dense.bias = esm.lm_head.dense.bias - model.lm_head.layer_norm.weight = esm.lm_head.layer_norm.weight - model.lm_head.layer_norm.bias = esm.lm_head.layer_norm.bias - model.lm_head.decoder.weight = esm.lm_head.weight - model.lm_head.bias = esm.lm_head.bias - - # Contact prediction head - transfer_and_check_weights(esm.contact_head, model.esm.contact_head) - - # Prepare data (first 2 sequences from ESMStructuralSplitDataset superfamily / 4) - if is_folding_model: - # Folding models aren't trained on masked inputs and don't like mask tokens. - sample_data = SAMPLE_DATA[:2] - else: - sample_data = SAMPLE_DATA - - if is_folding_model: - hf_tokenizer = get_esmfold_tokenizer() - hf_tokens = hf_tokenizer( - [row[1] for row in sample_data], return_tensors="pt", padding=True, add_special_tokens=False - ) - esmfold_aas, esmfold_mask, _, _, _ = esmfold_encode_sequences([row[1] for row in sample_data]) - success = torch.all(hf_tokens["input_ids"] == esmfold_aas) and torch.all( - hf_tokens["attention_mask"] == esmfold_mask - ) - else: - # Let's check that we get the same results. 
- batch_converter = alphabet.get_batch_converter() - batch_labels, batch_strs, batch_tokens = batch_converter(sample_data) - # Prepare tokenizer and make sure it matches - with TemporaryDirectory() as tempdir: - vocab = "\n".join(alphabet.all_toks) - vocab_file = Path(tempdir) / "vocab.txt" - vocab_file.write_text(vocab) - hf_tokenizer = EsmTokenizer(vocab_file=str(vocab_file)) - - hf_tokens = hf_tokenizer([row[1] for row in sample_data], return_tensors="pt", padding=True) - success = torch.all(hf_tokens["input_ids"] == batch_tokens) - - print("Do both models tokenizers output the same tokens?", "đŸ”„" if success else "đŸ’©") - if not success: - raise Exception("Tokenization does not match!") - - with torch.no_grad(): - if is_folding_model: - # Let's test the model in parts - # ESMFold always converts the ESM stem to float16, which requires float16 ops - # that don't exist on CPU. Therefore, to test it we need to run it on GPU. However, - # ESMFold is what we in the community call a "big boy" and so we desperately avoid putting both the - # original and the converted model on the GPU at the same time. - their_output = esm.cuda().infer([row[1] for row in sample_data]) - our_output = model.cuda()( - input_ids=hf_tokens["input_ids"].cuda(), attention_mask=hf_tokens["attention_mask"].cuda() - ) - else: - our_output = model(**hf_tokens, output_hidden_states=True) - our_output = our_output["logits"] - if classification_head: - their_output = esm.model.classification_heads["mnli"](esm.extract_features(batch_tokens)) - else: - their_output = esm(hf_tokens["input_ids"], repr_layers=list(range(999))) - their_output = their_output["logits"] - - if is_folding_model: - max_absolute_diff = torch.max(torch.abs(our_output["positions"] - their_output["positions"])).item() - success = torch.allclose(our_output["positions"], their_output["positions"], atol=1e-5) - else: - max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - success = torch.allclose(our_output, their_output, atol=1e-5) - - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-5 - print("Do both models output the same tensors?", "đŸ”„" if success else "đŸ’©") - - if not success: - raise Exception("Something went wRoNg") - - if not is_folding_model: - # Let's check contact prediction too - our_output = model.predict_contacts(hf_tokens["input_ids"], hf_tokens["attention_mask"]) - their_output = esm.predict_contacts(hf_tokens["input_ids"]) - max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - success = torch.allclose(our_output, their_output, atol=1e-5) - - print("Contact prediction testing:") - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-5 - print("Do both models output the same tensors?", "đŸ”„" if success else "đŸ’©") - - if not success: - raise Exception("Something went wRoNg") - - pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - - del esm # Free up some memory before continuing - - print(f"Saving tokenizer to {pytorch_dump_folder_path}") - hf_tokenizer.save_pretrained(pytorch_dump_folder_path) - - if push_to_repo: - model.push_to_hub(repo_id=push_to_repo, token_token=auth_token) - hf_tokenizer.push_to_hub(repo_id=push_to_repo, token_token=auth_token) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--pytorch_dump_folder_path", type=str, required=True, help="Path to the output 
PyTorch model." - ) - parser.add_argument( - "--classification_head", action="store_true", help="Whether to convert a final classification head." - ) - parser.add_argument("--model", default=None, type=str, required=True, help="Name of model to convert.") - parser.add_argument("--push_to_repo", type=str, help="Repo to upload to (including username!).") - parser.add_argument("--auth_token", type=str, help="HuggingFace auth token.") - args = parser.parse_args() - convert_esm_checkpoint_to_pytorch( - args.model, args.pytorch_dump_folder_path, args.classification_head, args.push_to_repo, args.auth_token - ) diff --git a/src/transformers/models/falcon/convert_custom_code_checkpoint.py b/src/transformers/models/falcon/convert_custom_code_checkpoint.py deleted file mode 100644 index 0da817c3ffa7..000000000000 --- a/src/transformers/models/falcon/convert_custom_code_checkpoint.py +++ /dev/null @@ -1,74 +0,0 @@ -import json -from argparse import ArgumentParser -from pathlib import Path - - -""" -This script converts Falcon custom code checkpoints to modern Falcon checkpoints that use code in the Transformers -library. After conversion, performance (especially for generation) should improve and the checkpoint can be loaded -without needing trust_remote_code=True. -""" - -if __name__ == "__main__": - parser = ArgumentParser() - parser.add_argument( - "--checkpoint_dir", - type=Path, - required=True, - help="Directory containing a custom code checkpoint to convert to a modern Falcon checkpoint.", - ) - args = parser.parse_args() - - if not args.checkpoint_dir.is_dir(): - raise ValueError("--checkpoint_dir argument should be a directory!") - - if ( - not (args.checkpoint_dir / "configuration_RW.py").is_file() - or not (args.checkpoint_dir / "modelling_RW.py").is_file() - ): - raise ValueError( - "The model directory should contain configuration_RW.py and modelling_RW.py files! Are you sure this is a custom code checkpoint?" 
- ) - (args.checkpoint_dir / "configuration_RW.py").unlink() - (args.checkpoint_dir / "modelling_RW.py").unlink() - - config = args.checkpoint_dir / "config.json" - text = config.read_text() - text = text.replace("RWForCausalLM", "FalconForCausalLM") - text = text.replace("RefinedWebModel", "falcon") - text = text.replace("RefinedWeb", "falcon") - json_config = json.loads(text) - del json_config["auto_map"] - - if "n_head" in json_config: - json_config["num_attention_heads"] = json_config.pop("n_head") - if "n_layer" in json_config: - json_config["num_hidden_layers"] = json_config.pop("n_layer") - if "n_head_kv" in json_config: - json_config["num_kv_heads"] = json_config.pop("n_head_kv") - json_config["new_decoder_architecture"] = True - else: - json_config["new_decoder_architecture"] = False - bos_token_id = json_config.get("bos_token_id", 1) - eos_token_id = json_config.get("eos_token_id", 2) - config.unlink() - config.write_text(json.dumps(json_config, indent=2, sort_keys=True)) - - tokenizer_config = args.checkpoint_dir / "tokenizer_config.json" - if tokenizer_config.is_file(): - text = tokenizer_config.read_text() - json_config = json.loads(text) - if json_config["tokenizer_class"] == "PreTrainedTokenizerFast": - json_config["model_input_names"] = ["input_ids", "attention_mask"] - tokenizer_config.unlink() - tokenizer_config.write_text(json.dumps(json_config, indent=2, sort_keys=True)) - - generation_config_path = args.checkpoint_dir / "generation_config.json" - generation_dict = { - "_from_model_config": True, - "bos_token_id": bos_token_id, - "eos_token_id": eos_token_id, - "transformers_version": "4.33.0.dev0", - } - generation_config_path.write_text(json.dumps(generation_dict, indent=2, sort_keys=True)) - print("Done! Please double-check that the new checkpoint works as expected.") diff --git a/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index bb9c432f8229..000000000000 --- a/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,210 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
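As a rough usage sketch (the directory name is a placeholder for the --checkpoint_dir argument above), the converted Falcon checkpoint should then load through the regular Transformers classes without trust_remote_code=True:

from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint_dir = "path/to/converted_falcon_checkpoint"  # placeholder for --checkpoint_dir
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
# config.json now names FalconForCausalLM and no longer carries an auto_map,
# so no custom remote code is needed.
model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)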
-"""Convert FastSpeech2Conformer checkpoint.""" - -import argparse -import json -import re -from pathlib import Path -from tempfile import TemporaryDirectory - -import torch -import yaml - -from transformers import ( - FastSpeech2ConformerConfig, - FastSpeech2ConformerModel, - FastSpeech2ConformerTokenizer, - logging, -) - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.FastSpeech2Conformer") - -CONFIG_MAPPING = { - "adim": "hidden_size", - "aheads": "num_attention_heads", - "conformer_dec_kernel_size": "decoder_kernel_size", - "conformer_enc_kernel_size": "encoder_kernel_size", - "decoder_normalize_before": "decoder_normalize_before", - "dlayers": "decoder_layers", - "dunits": "decoder_linear_units", - "duration_predictor_chans": "duration_predictor_channels", - "duration_predictor_kernel_size": "duration_predictor_kernel_size", - "duration_predictor_layers": "duration_predictor_layers", - "elayers": "encoder_layers", - "encoder_normalize_before": "encoder_normalize_before", - "energy_embed_dropout": "energy_embed_dropout", - "energy_embed_kernel_size": "energy_embed_kernel_size", - "energy_predictor_chans": "energy_predictor_channels", - "energy_predictor_dropout": "energy_predictor_dropout", - "energy_predictor_kernel_size": "energy_predictor_kernel_size", - "energy_predictor_layers": "energy_predictor_layers", - "eunits": "encoder_linear_units", - "pitch_embed_dropout": "pitch_embed_dropout", - "pitch_embed_kernel_size": "pitch_embed_kernel_size", - "pitch_predictor_chans": "pitch_predictor_channels", - "pitch_predictor_dropout": "pitch_predictor_dropout", - "pitch_predictor_kernel_size": "pitch_predictor_kernel_size", - "pitch_predictor_layers": "pitch_predictor_layers", - "positionwise_conv_kernel_size": "positionwise_conv_kernel_size", - "postnet_chans": "speech_decoder_postnet_units", - "postnet_filts": "speech_decoder_postnet_kernel", - "postnet_layers": "speech_decoder_postnet_layers", - "reduction_factor": "reduction_factor", - "stop_gradient_from_energy_predictor": "stop_gradient_from_energy_predictor", - "stop_gradient_from_pitch_predictor": "stop_gradient_from_pitch_predictor", - "transformer_dec_attn_dropout_rate": "decoder_attention_dropout_rate", - "transformer_dec_dropout_rate": "decoder_dropout_rate", - "transformer_dec_positional_dropout_rate": "decoder_positional_dropout_rate", - "transformer_enc_attn_dropout_rate": "encoder_attention_dropout_rate", - "transformer_enc_dropout_rate": "encoder_dropout_rate", - "transformer_enc_positional_dropout_rate": "encoder_positional_dropout_rate", - "use_cnn_in_conformer": "use_cnn_in_conformer", - "use_macaron_style_in_conformer": "use_macaron_style_in_conformer", - "use_masking": "use_masking", - "use_weighted_masking": "use_weighted_masking", - "idim": "input_dim", - "odim": "num_mel_bins", - "spk_embed_dim": "speaker_embed_dim", - "langs": "num_languages", - "spks": "num_speakers", -} - - -def remap_model_yaml_config(yaml_config_path): - with Path(yaml_config_path).open("r", encoding="utf-8") as f: - args = yaml.safe_load(f) - args = argparse.Namespace(**args) - - remapped_config = {} - - model_params = args.tts_conf["text2mel_params"] - # espnet_config_key -> hf_config_key, any keys not included are ignored - for espnet_config_key, hf_config_key in CONFIG_MAPPING.items(): - if espnet_config_key in model_params: - remapped_config[hf_config_key] = model_params[espnet_config_key] - - return remapped_config, args.g2p, args.token_list - - -def convert_espnet_state_dict_to_hf(state_dict): - 
new_state_dict = {} - for key in state_dict: - if "tts.generator.text2mel." in key: - new_key = key.replace("tts.generator.text2mel.", "") - if "postnet" in key: - new_key = new_key.replace("postnet.postnet", "speech_decoder_postnet.layers") - new_key = new_key.replace(".0.weight", ".conv.weight") - new_key = new_key.replace(".1.weight", ".batch_norm.weight") - new_key = new_key.replace(".1.bias", ".batch_norm.bias") - new_key = new_key.replace(".1.running_mean", ".batch_norm.running_mean") - new_key = new_key.replace(".1.running_var", ".batch_norm.running_var") - new_key = new_key.replace(".1.num_batches_tracked", ".batch_norm.num_batches_tracked") - if "feat_out" in key: - if "weight" in key: - new_key = "speech_decoder_postnet.feat_out.weight" - if "bias" in key: - new_key = "speech_decoder_postnet.feat_out.bias" - if "encoder.embed.0.weight" in key: - new_key = new_key.replace("0.", "") - if "w_1" in key: - new_key = new_key.replace("w_1", "conv1") - if "w_2" in key: - new_key = new_key.replace("w_2", "conv2") - if "predictor.conv" in key: - new_key = new_key.replace(".conv", ".conv_layers") - pattern = r"(\d)\.(\d)" - replacement = ( - r"\1.conv" if ("2.weight" not in new_key) and ("2.bias" not in new_key) else r"\1.layer_norm" - ) - new_key = re.sub(pattern, replacement, new_key) - if "pitch_embed" in key or "energy_embed" in key: - new_key = new_key.replace("0", "conv") - if "encoders" in key: - new_key = new_key.replace("encoders", "conformer_layers") - new_key = new_key.replace("norm_final", "final_layer_norm") - new_key = new_key.replace("norm_mha", "self_attn_layer_norm") - new_key = new_key.replace("norm_ff_macaron", "ff_macaron_layer_norm") - new_key = new_key.replace("norm_ff", "ff_layer_norm") - new_key = new_key.replace("norm_conv", "conv_layer_norm") - if "lid_emb" in key: - new_key = new_key.replace("lid_emb", "language_id_embedding") - if "sid_emb" in key: - new_key = new_key.replace("sid_emb", "speaker_id_embedding") - - new_state_dict[new_key] = state_dict[key] - - return new_state_dict - - -@torch.no_grad() -def convert_FastSpeech2ConformerModel_checkpoint( - checkpoint_path, - yaml_config_path, - pytorch_dump_folder_path, - repo_id=None, -): - model_params, tokenizer_name, vocab = remap_model_yaml_config(yaml_config_path) - config = FastSpeech2ConformerConfig(**model_params) - - # Prepare the model - model = FastSpeech2ConformerModel(config) - - espnet_checkpoint = torch.load(checkpoint_path) - hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint) - - model.load_state_dict(hf_compatible_state_dict) - - model.save_pretrained(pytorch_dump_folder_path) - - # Prepare the tokenizer - with TemporaryDirectory() as tempdir: - vocab = {token: id for id, token in enumerate(vocab)} - vocab_file = Path(tempdir) / "vocab.json" - with open(vocab_file, "w") as f: - json.dump(vocab, f) - should_strip_spaces = "no_space" in tokenizer_name - tokenizer = FastSpeech2ConformerTokenizer(str(vocab_file), should_strip_spaces=should_strip_spaces) - - tokenizer.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") - model.push_to_hub(repo_id) - tokenizer.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") - parser.add_argument( - "--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert" - ) - parser.add_argument( - 
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the đŸ€— hub." - ) - - args = parser.parse_args() - convert_FastSpeech2ConformerModel_checkpoint( - args.checkpoint_path, - args.yaml_config_path, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/fastspeech2_conformer/convert_hifigan.py b/src/transformers/models/fastspeech2_conformer/convert_hifigan.py deleted file mode 100644 index ec9f57ce7142..000000000000 --- a/src/transformers/models/fastspeech2_conformer/convert_hifigan.py +++ /dev/null @@ -1,134 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert FastSpeech2Conformer HiFi-GAN checkpoint.""" - -import argparse -from pathlib import Path - -import torch -import yaml - -from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig, logging - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.FastSpeech2Conformer") - - -def load_weights(checkpoint, hf_model, config): - vocoder_key_prefix = "tts.generator.vocoder." 
- checkpoint = {k.replace(vocoder_key_prefix, ""): v for k, v in checkpoint.items() if vocoder_key_prefix in k} - - hf_model.apply_weight_norm() - - hf_model.conv_pre.weight_g.data = checkpoint["input_conv.weight_g"] - hf_model.conv_pre.weight_v.data = checkpoint["input_conv.weight_v"] - hf_model.conv_pre.bias.data = checkpoint["input_conv.bias"] - - for i in range(len(config.upsample_rates)): - hf_model.upsampler[i].weight_g.data = checkpoint[f"upsamples.{i}.1.weight_g"] - hf_model.upsampler[i].weight_v.data = checkpoint[f"upsamples.{i}.1.weight_v"] - hf_model.upsampler[i].bias.data = checkpoint[f"upsamples.{i}.1.bias"] - - for i in range(len(config.upsample_rates) * len(config.resblock_kernel_sizes)): - for j in range(len(config.resblock_dilation_sizes)): - hf_model.resblocks[i].convs1[j].weight_g.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_g"] - hf_model.resblocks[i].convs1[j].weight_v.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_v"] - hf_model.resblocks[i].convs1[j].bias.data = checkpoint[f"blocks.{i}.convs1.{j}.1.bias"] - - hf_model.resblocks[i].convs2[j].weight_g.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_g"] - hf_model.resblocks[i].convs2[j].weight_v.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_v"] - hf_model.resblocks[i].convs2[j].bias.data = checkpoint[f"blocks.{i}.convs2.{j}.1.bias"] - - hf_model.conv_post.weight_g.data = checkpoint["output_conv.1.weight_g"] - hf_model.conv_post.weight_v.data = checkpoint["output_conv.1.weight_v"] - hf_model.conv_post.bias.data = checkpoint["output_conv.1.bias"] - - hf_model.remove_weight_norm() - - -def remap_hifigan_yaml_config(yaml_config_path): - with Path(yaml_config_path).open("r", encoding="utf-8") as f: - args = yaml.safe_load(f) - args = argparse.Namespace(**args) - - vocoder_type = args.tts_conf["vocoder_type"] - if vocoder_type != "hifigan_generator": - raise TypeError(f"Vocoder config must be for `hifigan_generator`, but got {vocoder_type}") - - remapped_dict = {} - vocoder_params = args.tts_conf["vocoder_params"] - - # espnet_config_key -> hf_config_key - key_mappings = { - "channels": "upsample_initial_channel", - "in_channels": "model_in_dim", - "resblock_dilations": "resblock_dilation_sizes", - "resblock_kernel_sizes": "resblock_kernel_sizes", - "upsample_kernel_sizes": "upsample_kernel_sizes", - "upsample_scales": "upsample_rates", - } - for espnet_config_key, hf_config_key in key_mappings.items(): - remapped_dict[hf_config_key] = vocoder_params[espnet_config_key] - remapped_dict["sampling_rate"] = args.tts_conf["sampling_rate"] - remapped_dict["normalize_before"] = False - remapped_dict["leaky_relu_slope"] = vocoder_params["nonlinear_activation_params"]["negative_slope"] - - return remapped_dict - - -@torch.no_grad() -def convert_hifigan_checkpoint( - checkpoint_path, - pytorch_dump_folder_path, - yaml_config_path=None, - repo_id=None, -): - if yaml_config_path is not None: - config_kwargs = remap_hifigan_yaml_config(yaml_config_path) - config = FastSpeech2ConformerHifiGanConfig(**config_kwargs) - else: - config = FastSpeech2ConformerHifiGanConfig() - - model = FastSpeech2ConformerHifiGan(config) - - orig_checkpoint = torch.load(checkpoint_path) - load_weights(orig_checkpoint, model, config) - - model.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") - model.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") 
- parser.add_argument("--yaml_config_path", default=None, type=str, help="Path to config.yaml of model to convert") - parser.add_argument( - "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the đŸ€— hub." - ) - - args = parser.parse_args() - convert_hifigan_checkpoint( - args.checkpoint_path, - args.pytorch_dump_folder_path, - args.yaml_config_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py b/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py deleted file mode 100644 index 2a780d5cf0b8..000000000000 --- a/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py +++ /dev/null @@ -1,102 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert FastSpeech2Conformer checkpoint.""" - -import argparse - -import torch - -from transformers import ( - FastSpeech2ConformerConfig, - FastSpeech2ConformerHifiGan, - FastSpeech2ConformerHifiGanConfig, - FastSpeech2ConformerModel, - FastSpeech2ConformerWithHifiGan, - FastSpeech2ConformerWithHifiGanConfig, - logging, -) - -from .convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch import ( - convert_espnet_state_dict_to_hf, - remap_model_yaml_config, -) -from .convert_hifigan import load_weights, remap_hifigan_yaml_config - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.FastSpeech2Conformer") - - -def convert_FastSpeech2ConformerWithHifiGan_checkpoint( - checkpoint_path, - yaml_config_path, - pytorch_dump_folder_path, - repo_id=None, -): - # Prepare the model - model_params, *_ = remap_model_yaml_config(yaml_config_path) - model_config = FastSpeech2ConformerConfig(**model_params) - - model = FastSpeech2ConformerModel(model_config) - - espnet_checkpoint = torch.load(checkpoint_path) - hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint) - model.load_state_dict(hf_compatible_state_dict) - - # Prepare the vocoder - config_kwargs = remap_hifigan_yaml_config(yaml_config_path) - vocoder_config = FastSpeech2ConformerHifiGanConfig(**config_kwargs) - - vocoder = FastSpeech2ConformerHifiGan(vocoder_config) - load_weights(espnet_checkpoint, vocoder, vocoder_config) - - # Prepare the model + vocoder - config = FastSpeech2ConformerWithHifiGanConfig.from_sub_model_configs(model_config, vocoder_config) - with_hifigan_model = FastSpeech2ConformerWithHifiGan(config) - with_hifigan_model.model = model - with_hifigan_model.vocoder = vocoder - - with_hifigan_model.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") - with_hifigan_model.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", required=True, 
default=None, type=str, help="Path to original checkpoint") - parser.add_argument( - "--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert" - ) - parser.add_argument( - "--pytorch_dump_folder_path", - required=True, - default=None, - type=str, - help="Path to the output `FastSpeech2ConformerModel` PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the đŸ€— hub." - ) - - args = parser.parse_args() - - convert_FastSpeech2ConformerWithHifiGan_checkpoint( - args.checkpoint_path, - args.yaml_config_path, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/flava/convert_dalle_to_flava_codebook.py b/src/transformers/models/flava/convert_dalle_to_flava_codebook.py deleted file mode 100644 index 7b544125114c..000000000000 --- a/src/transformers/models/flava/convert_dalle_to_flava_codebook.py +++ /dev/null @@ -1,102 +0,0 @@ -# coding=utf-8 -# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -import torch - -from transformers import FlavaImageCodebook, FlavaImageCodebookConfig - - -def rreplace(s, old, new, occurrence): - li = s.rsplit(old, occurrence) - return new.join(li) - - -def count_parameters(state_dict): - # encoder.embeddings are double copied in original FLAVA - return sum(param.float().sum() if "encoder.embeddings" not in key else 0 for key, param in state_dict.items()) - - -def upgrade_state_dict(state_dict): - upgrade = {} - - group_keys = ["group_1", "group_2", "group_3", "group_4"] - for key, value in state_dict.items(): - for group_key in group_keys: - if group_key in key: - key = key.replace(f"{group_key}.", f"{group_key}.group.") - - if "res_path" in key: - key = key.replace("res_path.", "res_path.path.") - - if key.endswith(".w"): - key = rreplace(key, ".w", ".weight", 1) - if key.endswith(".b"): - key = rreplace(key, ".b", ".bias", 1) - - upgrade[key] = value.float() - - return upgrade - - -@torch.no_grad() -def convert_dalle_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None, save_checkpoint=True): - """ - Copy/paste/tweak model's weights to transformers design. 
- """ - from dall_e import Encoder - - encoder = Encoder() - if os.path.exists(checkpoint_path): - ckpt = torch.load(checkpoint_path) - else: - ckpt = torch.hub.load_state_dict_from_url(checkpoint_path) - - if isinstance(ckpt, Encoder): - ckpt = ckpt.state_dict() - encoder.load_state_dict(ckpt) - - if config_path is not None: - config = FlavaImageCodebookConfig.from_pretrained(config_path) - else: - config = FlavaImageCodebookConfig() - - hf_model = FlavaImageCodebook(config).eval() - state_dict = encoder.state_dict() - - hf_state_dict = upgrade_state_dict(state_dict) - hf_model.load_state_dict(hf_state_dict) - hf_state_dict = hf_model.state_dict() - hf_count = count_parameters(hf_state_dict) - state_dict_count = count_parameters(state_dict) - - assert torch.allclose(hf_count, state_dict_count, atol=1e-3) - - if save_checkpoint: - hf_model.save_pretrained(pytorch_dump_folder_path) - else: - return hf_state_dict - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to flava checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_dalle_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/flava/convert_flava_original_pytorch_to_hf.py b/src/transformers/models/flava/convert_flava_original_pytorch_to_hf.py deleted file mode 100644 index 95ebb2bfdb23..000000000000 --- a/src/transformers/models/flava/convert_flava_original_pytorch_to_hf.py +++ /dev/null @@ -1,99 +0,0 @@ -# coding=utf-8 -# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import os - -import torch - -from transformers import FlavaConfig, FlavaForPreTraining -from transformers.models.flava.convert_dalle_to_flava_codebook import convert_dalle_checkpoint - - -def count_parameters(state_dict): - # encoder.embeddings are double copied in original FLAVA - return sum(param.float().sum() if "encoder.embeddings" not in key else 0 for key, param in state_dict.items()) - - -def upgrade_state_dict(state_dict, codebook_state_dict): - upgrade = {} - - for key, value in state_dict.items(): - if "text_encoder.embeddings" in key or "image_encoder.embeddings" in key: - continue - - key = key.replace("heads.cmd.mim_head.cls.predictions", "mmm_image_head") - key = key.replace("heads.cmd.mlm_head.cls.predictions", "mmm_text_head") - key = key.replace("heads.cmd.itm_head.cls", "itm_head") - key = key.replace("heads.cmd.itm_head.pooler", "itm_head.pooler") - key = key.replace("heads.cmd.clip_head.logit_scale", "flava.logit_scale") - key = key.replace("heads.fairseq_mlm.cls.predictions", "mlm_head") - key = key.replace("heads.imagenet.mim_head.cls.predictions", "mim_head") - key = key.replace("mm_text_projection", "flava.text_to_mm_projection") - key = key.replace("mm_image_projection", "flava.image_to_mm_projection") - key = key.replace("image_encoder.module", "flava.image_model") - key = key.replace("text_encoder.module", "flava.text_model") - key = key.replace("mm_encoder.module.encoder.cls_token", "flava.multimodal_model.cls_token") - key = key.replace("mm_encoder.module", "flava.multimodal_model") - key = key.replace("text_projection", "flava.text_projection") - key = key.replace("image_projection", "flava.image_projection") - - upgrade[key] = value.float() - - for key, value in codebook_state_dict.items(): - upgrade[f"image_codebook.{key}"] = value - - return upgrade - - -@torch.no_grad() -def convert_flava_checkpoint(checkpoint_path, codebook_path, pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. 
- """ - if config_path is not None: - config = FlavaConfig.from_pretrained(config_path) - else: - config = FlavaConfig() - - hf_model = FlavaForPreTraining(config).eval() - - codebook_state_dict = convert_dalle_checkpoint(codebook_path, None, save_checkpoint=False) - - if os.path.exists(checkpoint_path): - state_dict = torch.load(checkpoint_path, map_location="cpu") - else: - state_dict = torch.hub.load_state_dict_from_url(checkpoint_path, map_location="cpu") - - hf_state_dict = upgrade_state_dict(state_dict, codebook_state_dict) - hf_model.load_state_dict(hf_state_dict) - hf_state_dict = hf_model.state_dict() - hf_count = count_parameters(hf_state_dict) - state_dict_count = count_parameters(state_dict) + count_parameters(codebook_state_dict) - - assert torch.allclose(hf_count, state_dict_count, atol=1e-3) - - hf_model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to flava checkpoint") - parser.add_argument("--codebook_path", default=None, type=str, help="Path to flava codebook checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_flava_checkpoint(args.checkpoint_path, args.codebook_path, args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py b/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py deleted file mode 100644 index 71660354db14..000000000000 --- a/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert FNet checkpoint.""" - -import argparse - -import torch -from flax.training.checkpoints import restore_checkpoint - -from transformers import FNetConfig, FNetForPreTraining -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_flax_checkpoint_to_pytorch(flax_checkpoint_path, fnet_config_file, save_path): - # Initialise PyTorch model - config = FNetConfig.from_json_file(fnet_config_file) - print(f"Building PyTorch model from configuration: {config}") - fnet_pretraining_model = FNetForPreTraining(config) - - checkpoint_dict = restore_checkpoint(flax_checkpoint_path, None) - pretrained_model_params = checkpoint_dict["target"] - - # Embeddings - # Position IDs - state_dict = fnet_pretraining_model.state_dict() - - position_ids = state_dict["fnet.embeddings.position_ids"] - new_state_dict = {"fnet.embeddings.position_ids": position_ids} - # Embedding Layers - new_state_dict["fnet.embeddings.word_embeddings.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["word"]["embedding"] - ) - new_state_dict["fnet.embeddings.position_embeddings.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["position"]["embedding"][0] - ) - new_state_dict["fnet.embeddings.token_type_embeddings.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["type"]["embedding"] - ) - new_state_dict["fnet.embeddings.projection.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["hidden_mapping_in"]["kernel"] - ).T - new_state_dict["fnet.embeddings.projection.bias"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["hidden_mapping_in"]["bias"] - ) - new_state_dict["fnet.embeddings.LayerNorm.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["layer_norm"]["scale"] - ) - new_state_dict["fnet.embeddings.LayerNorm.bias"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["layer_norm"]["bias"] - ) - - # Encoder Layers - for layer in range(config.num_hidden_layers): - new_state_dict[f"fnet.encoder.layer.{layer}.fourier.output.LayerNorm.weight"] = torch.tensor( - pretrained_model_params["encoder"][f"encoder_{layer}"]["mixing_layer_norm"]["scale"] - ) - new_state_dict[f"fnet.encoder.layer.{layer}.fourier.output.LayerNorm.bias"] = torch.tensor( - pretrained_model_params["encoder"][f"encoder_{layer}"]["mixing_layer_norm"]["bias"] - ) - - new_state_dict[f"fnet.encoder.layer.{layer}.intermediate.dense.weight"] = torch.tensor( - pretrained_model_params["encoder"][f"feed_forward_{layer}"]["intermediate"]["kernel"] - ).T - new_state_dict[f"fnet.encoder.layer.{layer}.intermediate.dense.bias"] = torch.tensor( - pretrained_model_params["encoder"][f"feed_forward_{layer}"]["intermediate"]["bias"] - ) - - new_state_dict[f"fnet.encoder.layer.{layer}.output.dense.weight"] = torch.tensor( - pretrained_model_params["encoder"][f"feed_forward_{layer}"]["output"]["kernel"] - ).T - new_state_dict[f"fnet.encoder.layer.{layer}.output.dense.bias"] = torch.tensor( - pretrained_model_params["encoder"][f"feed_forward_{layer}"]["output"]["bias"] - ) - - new_state_dict[f"fnet.encoder.layer.{layer}.output.LayerNorm.weight"] = torch.tensor( - pretrained_model_params["encoder"][f"encoder_{layer}"]["output_layer_norm"]["scale"] - ) - new_state_dict[f"fnet.encoder.layer.{layer}.output.LayerNorm.bias"] = torch.tensor( - pretrained_model_params["encoder"][f"encoder_{layer}"]["output_layer_norm"]["bias"] - ) - - # Pooler Layers - new_state_dict["fnet.pooler.dense.weight"] = 
torch.tensor(pretrained_model_params["encoder"]["pooler"]["kernel"]).T - new_state_dict["fnet.pooler.dense.bias"] = torch.tensor(pretrained_model_params["encoder"]["pooler"]["bias"]) - - # Masked LM Layers - new_state_dict["cls.predictions.transform.dense.weight"] = torch.tensor( - pretrained_model_params["predictions_dense"]["kernel"] - ).T - new_state_dict["cls.predictions.transform.dense.bias"] = torch.tensor( - pretrained_model_params["predictions_dense"]["bias"] - ) - new_state_dict["cls.predictions.transform.LayerNorm.weight"] = torch.tensor( - pretrained_model_params["predictions_layer_norm"]["scale"] - ) - new_state_dict["cls.predictions.transform.LayerNorm.bias"] = torch.tensor( - pretrained_model_params["predictions_layer_norm"]["bias"] - ) - new_state_dict["cls.predictions.decoder.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["word"]["embedding"] - ) - new_state_dict["cls.predictions.decoder.bias"] = torch.tensor( - pretrained_model_params["predictions_output"]["output_bias"] - ) - new_state_dict["cls.predictions.bias"] = torch.tensor(pretrained_model_params["predictions_output"]["output_bias"]) - - # Seq Relationship Layers - new_state_dict["cls.seq_relationship.weight"] = torch.tensor( - pretrained_model_params["classification"]["output_kernel"] - ) - new_state_dict["cls.seq_relationship.bias"] = torch.tensor( - pretrained_model_params["classification"]["output_bias"] - ) - - # Load State Dict - fnet_pretraining_model.load_state_dict(new_state_dict) - - # Save PreTrained - print(f"Saving pretrained model to {save_path}") - fnet_pretraining_model.save_pretrained(save_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--flax_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--fnet_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained FNet model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument("--save_path", default=None, type=str, required=True, help="Path to the output model.") - args = parser.parse_args() - convert_flax_checkpoint_to_pytorch(args.flax_checkpoint_path, args.fnet_config_file, args.save_path) diff --git a/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py b/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py deleted file mode 100644 index 4aed15928062..000000000000 --- a/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py +++ /dev/null @@ -1,237 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert FocalNet checkpoints from the original repository. 
URL: https://github.com/microsoft/FocalNet/tree/main""" - -import argparse -import json - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import BitImageProcessor, FocalNetConfig, FocalNetForImageClassification -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling - - -def get_focalnet_config(model_name): - depths = [2, 2, 6, 2] if "tiny" in model_name else [2, 2, 18, 2] - use_conv_embed = True if "large" in model_name or "huge" in model_name else False - use_post_layernorm = True if "large" in model_name or "huge" in model_name else False - use_layerscale = True if "large" in model_name or "huge" in model_name else False - - if "large" in model_name or "xlarge" in model_name or "huge" in model_name: - if "fl3" in model_name: - focal_levels = [3, 3, 3, 3] - focal_windows = [5, 5, 5, 5] - elif "fl4" in model_name: - focal_levels = [4, 4, 4, 4] - focal_windows = [3, 3, 3, 3] - - if "tiny" in model_name or "small" in model_name or "base" in model_name: - focal_windows = [3, 3, 3, 3] - if "lrf" in model_name: - focal_levels = [3, 3, 3, 3] - else: - focal_levels = [2, 2, 2, 2] - - if "tiny" in model_name: - embed_dim = 96 - elif "small" in model_name: - embed_dim = 96 - elif "base" in model_name: - embed_dim = 128 - elif "large" in model_name: - embed_dim = 192 - elif "xlarge" in model_name: - embed_dim = 256 - elif "huge" in model_name: - embed_dim = 352 - - # set label information - repo_id = "huggingface/label-files" - if "large" in model_name or "huge" in model_name: - filename = "imagenet-22k-id2label.json" - else: - filename = "imagenet-1k-id2label.json" - - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - label2id = {v: k for k, v in id2label.items()} - - config = FocalNetConfig( - embed_dim=embed_dim, - depths=depths, - focal_levels=focal_levels, - focal_windows=focal_windows, - use_conv_embed=use_conv_embed, - id2label=id2label, - label2id=label2id, - use_post_layernorm=use_post_layernorm, - use_layerscale=use_layerscale, - ) - - return config - - -def rename_key(name): - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") - if "patch_embed.norm" in name: - name = name.replace("patch_embed.norm", "embeddings.norm") - if "layers" in name: - name = "encoder." + name - if "encoder.layers" in name: - name = name.replace("encoder.layers", "encoder.stages") - if "downsample.proj" in name: - name = name.replace("downsample.proj", "downsample.projection") - if "blocks" in name: - name = name.replace("blocks", "layers") - if "modulation.f.weight" in name or "modulation.f.bias" in name: - name = name.replace("modulation.f", "modulation.projection_in") - if "modulation.h.weight" in name or "modulation.h.bias" in name: - name = name.replace("modulation.h", "modulation.projection_context") - if "modulation.proj.weight" in name or "modulation.proj.bias" in name: - name = name.replace("modulation.proj", "modulation.projection_out") - - if name == "norm.weight": - name = "layernorm.weight" - if name == "norm.bias": - name = "layernorm.bias" - - if "head" in name: - name = name.replace("head", "classifier") - else: - name = "focalnet." 
+ name - - return name - - -def convert_focalnet_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - # fmt: off - model_name_to_url = { - "focalnet-tiny": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_tiny_srf.pth", - "focalnet-tiny-lrf": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_tiny_lrf.pth", - "focalnet-small": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_small_srf.pth", - "focalnet-small-lrf": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_small_lrf.pth", - "focalnet-base": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_base_srf.pth", - "focalnet-base-lrf": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_base_lrf.pth", - "focalnet-large-lrf-fl3": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_large_lrf_384.pth", - "focalnet-large-lrf-fl4": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_large_lrf_384_fl4.pth", - "focalnet-xlarge-lrf-fl3": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_xlarge_lrf_384.pth", - "focalnet-xlarge-lrf-fl4": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_xlarge_lrf_384_fl4.pth", - } - # fmt: on - - checkpoint_url = model_name_to_url[model_name] - print("Checkpoint URL: ", checkpoint_url) - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] - - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - - config = get_focalnet_config(model_name) - model = FocalNetForImageClassification(config) - model.eval() - - # load state dict - model.load_state_dict(state_dict) - - # verify conversion - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - - processor = BitImageProcessor( - do_resize=True, - size={"shortest_edge": 256}, - resample=PILImageResampling.BILINEAR, - do_center_crop=True, - crop_size=224, - do_normalize=True, - image_mean=IMAGENET_DEFAULT_MEAN, - image_std=IMAGENET_DEFAULT_STD, - ) - image = Image.open(requests.get(url, stream=True).raw) - inputs = processor(images=image, return_tensors="pt") - - image_transforms = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ] - ) - - original_pixel_values = image_transforms(image).unsqueeze(0) - - # verify pixel_values - assert torch.allclose(inputs.pixel_values, original_pixel_values, atol=1e-4) - - outputs = model(**inputs) - - predicted_class_idx = outputs.logits.argmax(-1).item() - print("Predicted class:", model.config.id2label[predicted_class_idx]) - - print("First values of logits:", outputs.logits[0, :3]) - - if model_name == "focalnet-tiny": - expected_slice = torch.tensor([0.2166, -0.4368, 0.2191]) - elif model_name == "focalnet-tiny-lrf": - expected_slice = torch.tensor([1.1669, 0.0125, -0.1695]) - elif model_name == "focalnet-small": - expected_slice = torch.tensor([0.4917, -0.0430, 0.1341]) - elif model_name == "focalnet-small-lrf": - expected_slice = torch.tensor([-0.2588, -0.5342, -0.2331]) - elif model_name == "focalnet-base": - expected_slice = torch.tensor([-0.1655, -0.4090, -0.1730]) - elif model_name == "focalnet-base-lrf": - expected_slice = 
torch.tensor([0.5306, -0.0483, -0.3928]) - assert torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor of {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and processor of {model_name} to the hub...") - model.push_to_hub(f"{model_name}") - processor.push_to_hub(f"{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="focalnet-tiny", - type=str, - help="Name of the FocalNet model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model and processor to the hub.", - ) - - args = parser.parse_args() - convert_focalnet_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100755 index ef2764f0ed10..000000000000 --- a/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,280 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Note: if you intend to run this script make sure you look under scripts/fsmt/ -# to locate the appropriate script to do the work correctly. 
There is a set of scripts to: -# - download and prepare data and run the conversion script -# - perform eval to get the best hparam into the config -# - generate model_cards - useful if you have multiple models from the same paper - -import argparse -import json -import os -import re -from collections import OrderedDict -from os.path import basename, dirname - -import fairseq -import torch -from fairseq import hub_utils -from fairseq.data.dictionary import Dictionary - -from transformers import FSMTConfig, FSMTForConditionalGeneration -from transformers.models.fsmt.tokenization_fsmt import VOCAB_FILES_NAMES -from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE -from transformers.utils import WEIGHTS_NAME, logging - - -logging.set_verbosity_warning() - -json_indent = 2 - -# based on the results of a search on a range of `num_beams`, `length_penalty` and `early_stopping` -# values against wmt19 test data to obtain the best BLEU scores, we will use the following defaults: -# -# * `num_beams`: 5 (higher scores better, but requires more memory/is slower, can be adjusted by users) -# * `early_stopping`: `False` consistently scored better -# * `length_penalty` varied, so will assign the best one depending on the model -best_score_hparams = { - # fairseq: - "wmt19-ru-en": {"length_penalty": 1.1}, - "wmt19-en-ru": {"length_penalty": 1.15}, - "wmt19-en-de": {"length_penalty": 1.0}, - "wmt19-de-en": {"length_penalty": 1.1}, - # allenai: - "wmt16-en-de-dist-12-1": {"length_penalty": 0.6}, - "wmt16-en-de-dist-6-1": {"length_penalty": 0.6}, - "wmt16-en-de-12-1": {"length_penalty": 0.8}, - "wmt19-de-en-6-6-base": {"length_penalty": 0.6}, - "wmt19-de-en-6-6-big": {"length_penalty": 0.6}, -} - -# this remaps the different models to their organization names -org_names = {} -for m in ["wmt19-ru-en", "wmt19-en-ru", "wmt19-en-de", "wmt19-de-en"]: - org_names[m] = "facebook" -for m in [ - "wmt16-en-de-dist-12-1", - "wmt16-en-de-dist-6-1", - "wmt16-en-de-12-1", - "wmt19-de-en-6-6-base", - "wmt19-de-en-6-6-big", -]: - org_names[m] = "allenai" - - -def rewrite_dict_keys(d): - # (1) remove word breaking symbol, (2) add word ending symbol where the word is not broken up, - # e.g.: d = {'le@@': 5, 'tt@@': 6, 'er': 7} => {'le': 5, 'tt': 6, 'er': 7} - d2 = dict((re.sub(r"@@$", "", k), v) if k.endswith("@@") else (re.sub(r"$", "", k), v) for k, v in d.items()) - keep_keys = " ".split() - # restore the special tokens - for k in keep_keys: - del d2[f"{k}"] - d2[k] = d[k] # restore - return d2 - - -def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder_path): - # prep - assert os.path.exists(fsmt_checkpoint_path) - os.makedirs(pytorch_dump_folder_path, exist_ok=True) - print(f"Writing results to {pytorch_dump_folder_path}") - - # handle various types of models - - checkpoint_file = basename(fsmt_checkpoint_path) - fsmt_folder_path = dirname(fsmt_checkpoint_path) - - cls = fairseq.model_parallel.models.transformer.ModelParallelTransformerModel - models = cls.hub_models() - kwargs = {"bpe": "fastbpe", "tokenizer": "moses"} - data_name_or_path = "." - # note: since the model dump is old, fairseq has upgraded its model some - # time later, and it does a whole lot of rewrites and splits on the saved - # weights, therefore we can't use torch.load() directly on the model file. 
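The `rewrite_dict_keys` helper above converts a fairseq/fastBPE dictionary into the vocabulary format the FSMT tokenizer expects: continuation pieces lose their `@@` marker, complete words gain a word-ending marker, and special tokens are carried over unchanged. A compact illustration of that rewrite; the `</w>` marker and the special-token list follow the usual fastBPE convention and are assumptions here, not text taken from this file:

```py
def rewrite_bpe_vocab(vocab, special_tokens=("<s>", "<pad>", "</s>", "<unk>")):
    # e.g. {"le@@": 5, "tt@@": 6, "er": 7} -> {"le": 5, "tt": 6, "er</w>": 7}
    rewritten = {}
    for token, index in vocab.items():
        if token in special_tokens:
            rewritten[token] = index           # keep special tokens as-is
        elif token.endswith("@@"):
            rewritten[token[:-2]] = index      # drop the word-continuation marker
        else:
            rewritten[token + "</w>"] = index  # mark a complete word
    return rewritten
```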
- # see: upgrade_state_dict(state_dict) in fairseq_model.py - print(f"using checkpoint {checkpoint_file}") - chkpt = hub_utils.from_pretrained( - fsmt_folder_path, checkpoint_file, data_name_or_path, archive_map=models, **kwargs - ) - - args = vars(chkpt["args"]["model"]) - - src_lang = args["source_lang"] - tgt_lang = args["target_lang"] - - data_root = dirname(pytorch_dump_folder_path) - model_dir = basename(pytorch_dump_folder_path) - - # dicts - src_dict_file = os.path.join(fsmt_folder_path, f"dict.{src_lang}.txt") - tgt_dict_file = os.path.join(fsmt_folder_path, f"dict.{tgt_lang}.txt") - - src_dict = Dictionary.load(src_dict_file) - src_vocab = rewrite_dict_keys(src_dict.indices) - src_vocab_size = len(src_vocab) - src_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-src.json") - print(f"Generating {src_vocab_file} of {src_vocab_size} of {src_lang} records") - with open(src_vocab_file, "w", encoding="utf-8") as f: - f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent)) - - # detect whether this is a do_lower_case situation, which can be derived by checking whether we - # have at least one uppercase letter in the source vocab - do_lower_case = True - for k in src_vocab.keys(): - if not k.islower(): - do_lower_case = False - break - - tgt_dict = Dictionary.load(tgt_dict_file) - tgt_vocab = rewrite_dict_keys(tgt_dict.indices) - tgt_vocab_size = len(tgt_vocab) - tgt_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-tgt.json") - print(f"Generating {tgt_vocab_file} of {tgt_vocab_size} of {tgt_lang} records") - with open(tgt_vocab_file, "w", encoding="utf-8") as f: - f.write(json.dumps(tgt_vocab, ensure_ascii=False, indent=json_indent)) - - # merges_file (bpecodes) - merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"]) - for fn in ["bpecodes", "code"]: # older fairseq called the merges file "code" - fsmt_merges_file = os.path.join(fsmt_folder_path, fn) - if os.path.exists(fsmt_merges_file): - break - with open(fsmt_merges_file, encoding="utf-8") as fin: - merges = fin.read() - merges = re.sub(r" \d+$", "", merges, 0, re.M) # remove frequency number - print(f"Generating {merges_file}") - with open(merges_file, "w", encoding="utf-8") as fout: - fout.write(merges) - - # model config - fsmt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json") - - # validate bpe/tokenizer config, as currently it's hardcoded to moses+fastbpe - - # may have to modify the tokenizer if a different type is used by a future model - assert args["bpe"] == "fastbpe", f"need to extend tokenizer to support bpe={args['bpe']}" - assert args["tokenizer"] == "moses", f"need to extend tokenizer to support bpe={args['tokenizer']}" - - model_conf = { - "architectures": ["FSMTForConditionalGeneration"], - "model_type": "fsmt", - "activation_dropout": args["activation_dropout"], - "activation_function": "relu", - "attention_dropout": args["attention_dropout"], - "d_model": args["decoder_embed_dim"], - "dropout": args["dropout"], - "init_std": 0.02, - "max_position_embeddings": args["max_source_positions"], - "num_hidden_layers": args["encoder_layers"], - "src_vocab_size": src_vocab_size, - "tgt_vocab_size": tgt_vocab_size, - "langs": [src_lang, tgt_lang], - "encoder_attention_heads": args["encoder_attention_heads"], - "encoder_ffn_dim": args["encoder_ffn_embed_dim"], - "encoder_layerdrop": args["encoder_layerdrop"], - "encoder_layers": args["encoder_layers"], - "decoder_attention_heads": args["decoder_attention_heads"], - "decoder_ffn_dim": 
args["decoder_ffn_embed_dim"], - "decoder_layerdrop": args["decoder_layerdrop"], - "decoder_layers": args["decoder_layers"], - "bos_token_id": 0, - "pad_token_id": 1, - "eos_token_id": 2, - "is_encoder_decoder": True, - "scale_embedding": not args["no_scale_embedding"], - "tie_word_embeddings": args["share_all_embeddings"], - } - - # good hparam defaults to start with - model_conf["num_beams"] = 5 - model_conf["early_stopping"] = False - if model_dir in best_score_hparams and "length_penalty" in best_score_hparams[model_dir]: - model_conf["length_penalty"] = best_score_hparams[model_dir]["length_penalty"] - else: - model_conf["length_penalty"] = 1.0 - - print(f"Generating {fsmt_model_config_file}") - with open(fsmt_model_config_file, "w", encoding="utf-8") as f: - f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent)) - - # tokenizer config - fsmt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE) - - tokenizer_conf = { - "langs": [src_lang, tgt_lang], - "model_max_length": 1024, - "do_lower_case": do_lower_case, - } - - print(f"Generating {fsmt_tokenizer_config_file}") - with open(fsmt_tokenizer_config_file, "w", encoding="utf-8") as f: - f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent)) - - # model - model = chkpt["models"][0] - model_state_dict = model.state_dict() - - # rename keys to start with 'model.' - model_state_dict = OrderedDict(("model." + k, v) for k, v in model_state_dict.items()) - - # remove unneeded keys - ignore_keys = [ - "model.model", - "model.encoder.version", - "model.decoder.version", - "model.encoder_embed_tokens.weight", - "model.decoder_embed_tokens.weight", - "model.encoder.embed_positions._float_tensor", - "model.decoder.embed_positions._float_tensor", - ] - for k in ignore_keys: - model_state_dict.pop(k, None) - - config = FSMTConfig.from_pretrained(pytorch_dump_folder_path) - model_new = FSMTForConditionalGeneration(config) - - # check that it loads ok - model_new.load_state_dict(model_state_dict, strict=False) - - # save - pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) - print(f"Generating {pytorch_weights_dump_path}") - torch.save(model_state_dict, pytorch_weights_dump_path) - - print("Conversion is done!") - print("\nLast step is to upload the files to s3") - print(f"cd {data_root}") - print(f"transformers-cli upload {model_dir}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--fsmt_checkpoint_path", - default=None, - type=str, - required=True, - help=( - "Path to the official PyTorch checkpoint file which is expected to reside in the dump dir with dicts," - " bpecodes, etc." - ), - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_fsmt_checkpoint_to_pytorch(args.fsmt_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index 37f71c0d233e..000000000000 --- a/src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,67 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Funnel checkpoint.""" - -import argparse - -import torch - -from transformers import FunnelBaseModel, FunnelConfig, FunnelModel, load_tf_weights_in_funnel -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, base_model): - # Initialise PyTorch model - config = FunnelConfig.from_json_file(config_file) - print(f"Building PyTorch model from configuration: {config}") - model = FunnelBaseModel(config) if base_model else FunnelModel(config) - - # Load weights from tf checkpoint - load_tf_weights_in_funnel(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help="The config json file corresponding to the pre-trained model. \nThis specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--base_model", action="store_true", help="Whether you want just the base model (no decoder) or not." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch( - args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.base_model - ) - - -__all__ = [] diff --git a/src/transformers/models/fuyu/convert_fuyu_model_weights_to_hf.py b/src/transformers/models/fuyu/convert_fuyu_model_weights_to_hf.py deleted file mode 100644 index 6d029c0d13ab..000000000000 --- a/src/transformers/models/fuyu/convert_fuyu_model_weights_to_hf.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -import sys -import warnings - -import flatdict -import torch - -from transformers import FuyuConfig, FuyuForCausalLM, LlamaTokenizer - - -try: - from transformers import LlamaTokenizerFast - - tokenizer_class = LlamaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. 
To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - tokenizer_class = LlamaTokenizer - -""" -Sample usage: # TODO fix clone links from persimmon to fuyu -``` -git clone https://github.com/adept-ai-labs/adept-inference -wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_base_model_release.tar -wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_chat_model_release.tar -python src/transformers/models/fuyu/convert_fuyu_weights_to_hf.py --input_dir /path/to/downloaded/fuyu/weights/ --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import FuyuForCausalLM, FuyuTokenizer - -model = FuyuForCausalLM.from_pretrained("/output/path") -tokenizer = FuyuTokenizer.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). -""" - - -KEYS_TO_MODIFY_MAPPING = { - "self_attention": "self_attn", - "language_model.encoder": "language_model.model", - "word_embeddings_for_head": "language_model.lm_head", - "language_model.embedding.word_embeddings": "language_model.model.embed_tokens", - "vit_encoder.linear_encoder": "vision_embed_tokens", -} - -KEYS_TO_REMOVE = { - "rotary_emb.inv_freq", - "image_patch_projection", - "image_patch_projection.weight", - "image_patch_projection.bias", -} - - -def rename_state_dict(state_dict): - model_state_dict = {} - for key, value in state_dict.items(): - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - # if KEYS_TO_REMOVE in key: - if key in KEYS_TO_REMOVE: - continue - model_state_dict[key] = value - return model_state_dict - - -def convert_fuyu_checkpoint(pytorch_dump_folder_path, ada_lib_path, pt_model_path, safe_serialization=False): - sys.path.insert(0, ada_lib_path) - model_state_dict_base = torch.load(pt_model_path, map_location="cpu") - state_dict = flatdict.FlatDict(model_state_dict_base["model"], ".") - state_dict = rename_state_dict(state_dict) - - transformers_config = FuyuConfig() - model = FuyuForCausalLM(transformers_config).to(torch.bfloat16) - model.load_state_dict(state_dict) - model.save_pretrained(pytorch_dump_folder_path, safe_serialization=safe_serialization) - transformers_config.save_pretrained(pytorch_dump_folder_path) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - help="Location of Fuyu weights, which contains tokenizer.model and model folders", - ) - parser.add_argument( - "--pt_model_path", - help="Location of Fuyu `model_optim_rng.pt`", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--ada_lib_path", - help="Location of original source code from adept to deserialize .pt checkpoint", - ) - parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.") - args = parser.parse_args() - spm_path = os.path.join(args.input_dir, "adept_vocab.model") - - convert_fuyu_checkpoint( - pytorch_dump_folder_path=args.output_dir, - pt_model_path=args.pt_model_path, - safe_serialization=args.safe_serialization, - ada_lib_path=args.ada_lib_path, - ) - tokenizer = tokenizer_class(spm_path, 
bos_token="|ENDOFTEXT|", eos_token="|ENDOFTEXT|") - tokenizer.save_pretrained(args.output_dir) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/gemma/convert_gemma_weights_to_hf.py b/src/transformers/models/gemma/convert_gemma_weights_to_hf.py deleted file mode 100644 index 9b71be35bfa1..000000000000 --- a/src/transformers/models/gemma/convert_gemma_weights_to_hf.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -import warnings - -import torch -from accelerate import init_empty_weights - -from transformers import GemmaConfig, GemmaForCausalLM, GemmaTokenizer - - -try: - from transformers import GemmaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - GemmaTokenizerFast = None - -""" -Sample usage: - -``` -python src/transformers/models/gemma/convert_gemma_weights_to_hf.py \ - --input_dir /path/to/downloaded/gemma/weights --model_size 7B --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import GemmaForCausalLM, GemmaTokenizerFast - -model = GemmaForCausalLM.from_pretrained("/output/path") -tokenizer = GemmaTokenizerFast.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). -""" - -gemma_2b_config = GemmaConfig( - num_hidden_layers=18, - num_attention_heads=8, - num_key_value_heads=1, - hidden_size=2048, - intermediate_size=16384, -) - -gemma_7b_config = GemmaConfig() - -CONFIG_MAPPING = {"2B": gemma_2b_config, "7B": gemma_7b_config} -LAYER_NAME_MAPPING = {"embedder.weight": "model.embed_tokens.weight"} - - -def write_model(save_path, input_base_path, config, safe_serialization=True, push_to_hub=False, dtype=torch.float32): - num_attn_heads = config.num_attention_heads - hidden_size = config.hidden_size - num_kv_heads = config.num_key_value_heads - head_dim = config.head_dim - - print(f"Fetching all parameters from the checkpoint at '{input_base_path}'") - model_state_dict = torch.load(input_base_path, map_location="cpu")["model_state_dict"] - model_state_dict.pop("freqs_cis") - - state_dict = {} - for k, v in model_state_dict.items(): - if "qkv_proj" in k: - if num_kv_heads == 1: - v = v.reshape(num_attn_heads + num_kv_heads * 2, head_dim, hidden_size) - q_proj = v[:num_attn_heads, ...] 
- k_proj = v[num_attn_heads : num_attn_heads + num_kv_heads, ...].repeat(num_kv_heads, 1, 1) - v_proj = v[-num_kv_heads:, ...].repeat(num_kv_heads, 1, 1) - - state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( - num_attn_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "v_proj")] = v_proj[0].clone() - else: - q_proj, k_proj, v_proj = torch.split(v, v.shape[0] // 3, 0) - state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( - num_attn_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "v_proj")] = v_proj.clone() - - elif k == "embedder.weight": - state_dict[LAYER_NAME_MAPPING[k]] = v - state_dict["lm_head.weight"] = v - else: - state_dict[k] = v - - torch.set_default_dtype(dtype) - - print("Loading the checkpoint in a Gemma model.") - with init_empty_weights(): - model = GemmaForCausalLM(config) - model.load_state_dict(state_dict, assign=True, strict=False) - - model.config.torch_dtype = torch.float32 - del model.config._name_or_path - print("Saving in the Transformers format.") - - if push_to_hub: - print(f"pushing the model to {save_path}") - model.push_to_hub(save_path, safe_serialization=safe_serialization, private=True) - else: - model.save_pretrained(save_path, safe_serialization=safe_serialization) - - -def write_tokenizer(input_tokenizer_path, save_path, push_to_hub=False): - # Initialize the tokenizer based on the `spm` model - tokenizer_class = GemmaTokenizer if GemmaTokenizerFast is None else GemmaTokenizerFast - print(f"Saving a {tokenizer_class.__name__} to {save_path}.") - tokenizer = tokenizer_class(input_tokenizer_path) - if push_to_hub: - tokenizer.push_to_hub(save_path) - else: - tokenizer.save_pretrained(save_path) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_checkpoint", - help="Absolute path to the target Gemma weights.", - required=True, - ) - parser.add_argument( - "--tokenizer_checkpoint", - help="Location of Gemma tokenizer model", - ) - parser.add_argument( - "--model_size", - default="7B", - choices=["2B", "7B", "tokenizer_only"], - help="'f' models correspond to the finetuned versions, and are specific to the Gemma2 official release. 
For more details on Gemma2, checkout the original repo: https://huggingface.co/google/gemma-7b", - ) - parser.add_argument( - "--output_dir", - default="google/gemma-7b", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--pickle_serialization", - help="Whether or not to save using `safetensors`.", - action="store_true", - default=False, - ) - parser.add_argument( - "--convert_tokenizer", - help="Whether or not to convert the tokenizer as well.", - action="store_true", - default=False, - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally.", - action="store_true", - default=False, - ) - parser.add_argument( - "--dtype", - default="float32", - help="Target dtype of the converted model", - ) - args = parser.parse_args() - - if args.convert_tokenizer: - if args.tokenizer_checkpoint is None: - raise ValueError("Path to the tokenizer is required when passing --convert_tokenizer") - - spm_path = os.path.join(args.tokenizer_checkpoint) - write_tokenizer(spm_path, args.output_dir, args.push_to_hub) - - config = CONFIG_MAPPING[args.model_size] - dtype = getattr(torch, args.dtype) - write_model( - config=config, - input_base_path=args.input_checkpoint, - save_path=args.output_dir, - safe_serialization=not args.pickle_serialization, - push_to_hub=args.push_to_hub, - dtype=dtype, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py b/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py deleted file mode 100644 index 1ad7d23c3c3e..000000000000 --- a/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -import warnings - -import torch -from accelerate import init_empty_weights - -from transformers import Gemma2Config, Gemma2ForCausalLM, GemmaTokenizer - - -try: - from transformers import GemmaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - GemmaTokenizerFast = None - -""" -Sample usage: - -``` -python src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py \ - --input_dir /path/to/downloaded/gemma/weights --model_size 9B --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import Gemma2ForCausalLM, GemmaTokenizerFast - -model = Gemma2ForCausalLM.from_pretrained("/output/path") -tokenizer = GemmaTokenizerFast.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). 
-""" - -gemma_9b_config = Gemma2Config( - num_hidden_layers=42, - num_attention_heads=16, - num_key_value_heads=8, - hidden_size=3584, - intermediate_size=14336, - final_logit_softcapping=30.0, - attn_logit_softcapping=50.0, - head_dim=256, - sliding_window=4096, - query_pre_attn_scalar=224, -) - -gemma_27b_config = Gemma2Config( - num_hidden_layers=46, - num_attention_heads=32, - num_key_value_heads=16, - hidden_size=4608, - intermediate_size=36864, - final_logit_softcapping=30.0, - attn_logit_softcapping=50.0, - head_dim=128, - sliding_window=4096, - query_pre_attn_scalar=144, -) - -CONFIG_MAPPING = {"9B": gemma_9b_config, "27B": gemma_27b_config} -LAYER_NAME_MAPPING = {"embedder.weight": "model.embed_tokens.weight"} - - -def write_model(save_path, input_base_path, config, safe_serialization=True, push_to_hub=False, dtype=torch.float32): - num_attn_heads = config.num_attention_heads - hidden_size = config.hidden_size - num_kv_heads = config.num_key_value_heads - head_dim = config.head_dim - - print(f"Fetching all parameters from the checkpoint at '{input_base_path}'") - - if os.path.isdir(input_base_path): - print("Model seems sharded") - - model_state_dict = {} - files = [file for file in os.listdir(input_base_path) if file.endswith(".bin")] - - for file in files: - print(file) - loaded_state_dict = torch.load(os.path.join(input_base_path, file), map_location="cpu") - model_state_dict.update(loaded_state_dict) - else: - print("Model does not seem to be sharded") - model_state_dict = torch.load(input_base_path, map_location="cpu")["model_state_dict"] - model_state_dict.pop("freqs_cis") - - state_dict = {} - for k, v in model_state_dict.items(): - if "qkv_proj" in k: - if num_kv_heads == 1: - v = v.reshape(num_attn_heads + num_kv_heads * 2, head_dim, hidden_size) - q_proj = v[:num_attn_heads, ...] 
- k_proj = v[num_attn_heads : num_attn_heads + num_kv_heads, ...].repeat(num_kv_heads, 1, 1) - v_proj = v[-num_kv_heads:, ...].repeat(num_kv_heads, 1, 1) - - state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( - num_attn_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "v_proj")] = v_proj[0].clone() - else: - q_proj, k_proj, v_proj = torch.split( - v, [num_attn_heads * head_dim, num_kv_heads * head_dim, num_kv_heads * head_dim], 0 - ) - state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( - num_attn_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "v_proj")] = v_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - - elif k == "embedder.weight": - state_dict[LAYER_NAME_MAPPING[k]] = v - state_dict["lm_head.weight"] = v - else: - state_dict[k] = v - - torch.set_default_dtype(dtype) - - print("Loading the checkpoint in a Gemma2 model.") - with init_empty_weights(): - model = Gemma2ForCausalLM(config) - model.load_state_dict(state_dict, assign=True, strict=False) - - model.config.torch_dtype = torch.float32 - del model.config._name_or_path - print("Saving in the Transformers format.") - - if push_to_hub: - print(f"pushing the model to {save_path}") - model.push_to_hub(save_path, safe_serialization=safe_serialization, private=True) - else: - model.save_pretrained(save_path, safe_serialization=safe_serialization) - - -def write_tokenizer(input_tokenizer_path, save_path, push_to_hub=False): - # Initialize the tokenizer based on the `spm` model - tokenizer_class = GemmaTokenizer if GemmaTokenizerFast is None else GemmaTokenizerFast - print(f"Saving a {tokenizer_class.__name__} to {save_path}.") - tokenizer = tokenizer_class(input_tokenizer_path) - if push_to_hub: - tokenizer.push_to_hub(save_path) - else: - tokenizer.save_pretrained(save_path) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_checkpoint", - help="Absolute path to the target Gemma2 weights.", - required=True, - ) - parser.add_argument( - "--tokenizer_checkpoint", - help="Location of Gemma2 tokenizer model", - ) - parser.add_argument( - "--model_size", - default="9B", - choices=["9B", "27B", "tokenizer_only"], - help="'f' models correspond to the finetuned versions, and are specific to the Gemma22 official release. 
For more details on Gemma2, checkout the original repo: https://huggingface.co/google/gemma-7b", - ) - parser.add_argument( - "--output_dir", - default="google/gemma-9b", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--pickle_serialization", - help="Whether or not to save using `safetensors`.", - action="store_true", - default=False, - ) - parser.add_argument( - "--convert_tokenizer", - help="Whether or not to convert the tokenizer as well.", - action="store_true", - default=False, - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally.", - action="store_true", - default=False, - ) - parser.add_argument( - "--dtype", - default="float32", - help="Target dtype of the converted model", - ) - args = parser.parse_args() - - if args.convert_tokenizer: - if args.tokenizer_checkpoint is None: - raise ValueError("Path to the tokenizer is required when passing --convert_tokenizer") - - spm_path = os.path.join(args.tokenizer_checkpoint) - write_tokenizer(spm_path, args.output_dir, args.push_to_hub) - if not args.model_size == "tokenizer_only": - config = CONFIG_MAPPING[args.model_size] - dtype = getattr(torch, args.dtype) - write_model( - config=config, - input_base_path=args.input_checkpoint, - save_path=args.output_dir, - safe_serialization=not args.pickle_serialization, - push_to_hub=args.push_to_hub, - dtype=dtype, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/git/convert_git_to_pytorch.py b/src/transformers/models/git/convert_git_to_pytorch.py deleted file mode 100644 index 2f93a6b03a65..000000000000 --- a/src/transformers/models/git/convert_git_to_pytorch.py +++ /dev/null @@ -1,448 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert GIT checkpoints from the original repository. 
- -URL: https://github.com/microsoft/GenerativeImage2Text/tree/main""" - -import argparse -from pathlib import Path - -import av -import numpy as np -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor - -from transformers import ( - AutoTokenizer, - CLIPImageProcessor, - GitConfig, - GitForCausalLM, - GitProcessor, - GitVisionConfig, - VideoMAEImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_git_config(model_name): - if "base" in model_name and "vqa" in model_name: - image_size = 480 - elif "large" in model_name and "vqa" in model_name: - image_size = 420 - else: - image_size = 224 - - vision_config = GitVisionConfig(image_size=image_size) - - if "large" in model_name: - vision_config.patch_size = 14 - vision_config.hidden_size = 1024 - vision_config.intermediate_size = 4096 - vision_config.num_hidden_layers = 24 - vision_config.num_attention_heads = 16 - - is_video = "vatex" in model_name or "msrvtt" in model_name - num_image_with_embedding = 6 if is_video else None - config = GitConfig(vision_config=vision_config.to_dict(), num_image_with_embedding=num_image_with_embedding) - - return config, image_size, is_video - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, prefix=""): - rename_keys = [] - - # image encoder - # ftm: off - rename_keys.append( - (f"{prefix}image_encoder.class_embedding", "git.image_encoder.vision_model.embeddings.class_embedding") - ) - rename_keys.append( - ( - f"{prefix}image_encoder.positional_embedding", - "git.image_encoder.vision_model.embeddings.position_embedding.weight", - ) - ) - rename_keys.append( - (f"{prefix}image_encoder.conv1.weight", "git.image_encoder.vision_model.embeddings.patch_embedding.weight") - ) - rename_keys.append((f"{prefix}image_encoder.ln_pre.weight", "git.image_encoder.vision_model.pre_layrnorm.weight")) - rename_keys.append((f"{prefix}image_encoder.ln_pre.bias", "git.image_encoder.vision_model.pre_layrnorm.bias")) - rename_keys.append( - (f"{prefix}image_encoder.ln_post.weight", "git.image_encoder.vision_model.post_layernorm.weight") - ) - rename_keys.append((f"{prefix}image_encoder.ln_post.bias", "git.image_encoder.vision_model.post_layernorm.bias")) - # fmt: on - rename_keys.append((f"{prefix}image_encoder.proj", "git.image_encoder.visual_projection.weight")) - - # fmt: off - for i in range(config.vision_config.num_hidden_layers): - # image encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.attn.out_proj.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.out_proj.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.attn.out_proj.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.out_proj.bias")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_1.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm1.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_1.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm1.bias")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_fc.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc1.weight")) - 
rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_fc.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc1.bias")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_proj.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc2.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_proj.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc2.bias")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_2.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm2.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_2.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm2.bias")) - # fmt: on - - # text decoder - # fmt: off - rename_keys.append((f"{prefix}textual.embedding.words.weight", "git.embeddings.word_embeddings.weight")) - rename_keys.append((f"{prefix}textual.embedding.positions.weight", "git.embeddings.position_embeddings.weight")) - rename_keys.append((f"{prefix}textual.visual_projection.0.weight", "git.visual_projection.visual_projection.0.weight")) - rename_keys.append((f"{prefix}textual.visual_projection.0.bias", "git.visual_projection.visual_projection.0.bias")) - rename_keys.append((f"{prefix}textual.visual_projection.1.weight", "git.visual_projection.visual_projection.1.weight")) - rename_keys.append((f"{prefix}textual.visual_projection.1.bias", "git.visual_projection.visual_projection.1.bias")) - - rename_keys.append((f"{prefix}textual.embedding.layer_norm.weight", "git.embeddings.LayerNorm.weight")) - rename_keys.append((f"{prefix}textual.embedding.layer_norm.bias", "git.embeddings.LayerNorm.bias")) - rename_keys.append((f"{prefix}textual.output.weight", "output.weight")) - rename_keys.append((f"{prefix}textual.output.bias", "output.bias")) - for i in range(config.num_hidden_layers): - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.query.weight", f"git.encoder.layer.{i}.attention.self.query.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.query.bias", f"git.encoder.layer.{i}.attention.self.query.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.key.weight", f"git.encoder.layer.{i}.attention.self.key.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.key.bias", f"git.encoder.layer.{i}.attention.self.key.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.value.weight", f"git.encoder.layer.{i}.attention.self.value.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.value.bias", f"git.encoder.layer.{i}.attention.self.value.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.dense.weight", f"git.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.dense.bias", f"git.encoder.layer.{i}.attention.output.dense.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.LayerNorm.weight", f"git.encoder.layer.{i}.attention.output.LayerNorm.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.LayerNorm.bias", f"git.encoder.layer.{i}.attention.output.LayerNorm.bias")) - 
rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.intermediate.dense.weight", f"git.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.intermediate.dense.bias", f"git.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.dense.weight", f"git.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.dense.bias", f"git.encoder.layer.{i}.output.dense.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.LayerNorm.weight", f"git.encoder.layer.{i}.output.LayerNorm.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.LayerNorm.bias", f"git.encoder.layer.{i}.output.LayerNorm.bias")) - # fmt: on - - if config.num_image_with_embedding is not None: - rename_keys.append(("img_temperal_embedding.0", "git.img_temperal_embedding.0")) - rename_keys.append(("img_temperal_embedding.1", "git.img_temperal_embedding.1")) - rename_keys.append(("img_temperal_embedding.2", "git.img_temperal_embedding.2")) - rename_keys.append(("img_temperal_embedding.3", "git.img_temperal_embedding.3")) - rename_keys.append(("img_temperal_embedding.4", "git.img_temperal_embedding.4")) - rename_keys.append(("img_temperal_embedding.5", "git.img_temperal_embedding.5")) - - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val.T if "image_encoder.visual_projection" in new else val - - -# we split up the matrix of each CLIP encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, prefix=""): - dim = config.vision_config.hidden_size - for i in range(config.vision_config.num_hidden_layers): - # read in weights + bias of input projection layer (in the original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}image_encoder.transformer.resblocks.{i}.attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}image_encoder.transformer.resblocks.{i}.attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[ - :dim, : - ] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:dim] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[ - dim : dim * 2, : - ] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[ - dim : dim * 2 - ] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[ - -dim:, : - ] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-dim:] - - -# We will verify our results on an image -def prepare_img(model_name): - if "textvqa" in model_name: - filepath = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset") - image = Image.open(filepath).convert("RGB") - else: - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - return image - - -def prepare_video(): - def read_video_pyav(container, indices): - """ - Decode the video with PyAV decoder. - - Args: - container (`av.container.input.InputContainer`): PyAV container. 
- indices (`List[int]`): List of frame indices to decode. - - Returns: - result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3). - """ - frames = [] - container.seek(0) - start_index = indices[0] - end_index = indices[-1] - for i, frame in enumerate(container.decode(video=0)): - if i > end_index: - break - if i >= start_index and i in indices: - frames.append(frame) - return np.stack([x.to_ndarray(format="rgb24") for x in frames]) - - def sample_frame_indices(clip_len, frame_sample_rate, seg_len): - """ - Sample a given number of frame indices from the video. - - Args: - clip_len (`int`): Total number of frames to sample. - frame_sample_rate (`int`): Sample every n-th frame. - seg_len (`int`): Maximum allowed index of sample's last frame. - - Returns: - indices (`List[int]`): List of sampled frame indices - """ - converted_len = int(clip_len * frame_sample_rate) - end_idx = np.random.randint(converted_len, seg_len) - start_idx = end_idx - converted_len - indices = np.linspace(start_idx, end_idx, num=clip_len) - indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) - return indices - - # set seed for reproducibility - np.random.seed(0) - - file_path = hf_hub_download(repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset") - with av.open(file_path) as container: - # sample 6 frames - num_frames = 6 - indices = sample_frame_indices( - clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames - ) - frames = read_video_pyav(container, indices) - - return frames - - -@torch.no_grad() -def convert_git_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our GIT structure. - """ - - model_name_to_url = { - "git-base": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE/snapshot/model.pt", - "git-base-coco": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_COCO/snapshot/model.pt", - "git-base-textcaps": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_TEXTCAPS/snapshot/model.pt", - "git-base-vqav2": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_VQAv2/snapshot/model.pt", - "git-base-textvqa": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_TEXTVQA/snapshot/model.pt", # todo - "git-base-vatex": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_VATEX/snapshot/model.pt", - "git-base-msrvtt-qa": ( - "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_MSRVTT_QA/snapshot/model.pt" - ), - "git-large": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE/snapshot/model.pt", - "git-large-coco": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_COCO/snapshot/model.pt", - "git-large-textcaps": ( - "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_TEXTCAPS/snapshot/model.pt" - ), - "git-large-vqav2": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_VQAv2/snapshot/model.pt", - "git-large-textvqa": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_TEXTVQA/snapshot/model.pt", - "git-large-vatex": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_VATEX/snapshot/model.pt", - "git-large-msrvtt-qa": ( - "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_MSRVTT_QA/snapshot/model.pt" - ), - "git-large-r": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_R/snapshot/model.pt", - "git-large-r-coco": 
"https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_R_COCO/snapshot/model.pt", - "git-large-r-textcaps": ( - "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_R_TEXTCAPS/snapshot/model.pt" - ), - } - - model_name_to_path = { - "git-large": "/Users/nielsrogge/Documents/GIT/git_large_model.pt", - "git-large-coco": "/Users/nielsrogge/Documents/GIT/git_large_coco_model.pt", - "git-large-textcaps": "/Users/nielsrogge/Documents/GIT/git_large_textcaps_model.pt", - "git-large-vqav2": "/Users/nielsrogge/Documents/GIT/git_large_vqav2_model.pt", - "git-large-textvqa": "/Users/nielsrogge/Documents/GIT/git_large_textvqa_model.pt", - } - - # define GIT configuration based on model name - config, image_size, is_video = get_git_config(model_name) - if "large" in model_name and not is_video and "large-r" not in model_name: - # large checkpoints take way too long to download - checkpoint_path = model_name_to_path[model_name] - state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] - else: - checkpoint_url = model_name_to_url[model_name] - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", file_name=model_name)[ - "model" - ] - # rename keys - prefix = "module." if model_name == "git-base" else "" - rename_keys = create_rename_keys(config, prefix=prefix) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, prefix=prefix) - - # load HuggingFace model - model = GitForCausalLM(config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - model.eval() - - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - - assert missing_keys == ["git.embeddings.position_ids", "git.image_encoder.vision_model.embeddings.position_ids"] - assert unexpected_keys == ["git.image_encoder.visual_projection.weight"] - - # verify results - image_processor = ( - VideoMAEImageProcessor( - size={"shortest_edge": image_size}, crop_size={"height": image_size, "width": image_size} - ) - if is_video - else CLIPImageProcessor( - size={"shortest_edge": image_size}, crop_size={"height": image_size, "width": image_size} - ) - ) - tokenizer = AutoTokenizer.from_pretrained( - "google-bert/bert-base-uncased", model_input_names=["input_ids", "attention_mask"] - ) - processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) - - if is_video: - video = prepare_video() - pixel_values = processor(images=list(video), return_tensors="pt").pixel_values - else: - image = prepare_img(model_name) - image_transforms = Compose( - [ - Resize(image_size, interpolation=Image.BICUBIC), - CenterCrop(image_size), - ToTensor(), - Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), - ] - ) - original_pixel_values = image_transforms(image).unsqueeze(0) - pixel_values = processor(images=image, return_tensors="pt").pixel_values - - assert torch.allclose(pixel_values, original_pixel_values) - - input_ids = torch.tensor([[101]]) - outputs = model(input_ids, pixel_values=pixel_values) - logits = outputs.logits - print("Logits:", logits[0, -1, :3]) - - if model_name == "git-base": - expected_slice_logits = torch.tensor([-1.2832, -1.2835, -1.2840]) - elif model_name == "git-base-coco": - expected_slice_logits = torch.tensor([-0.9925, -0.9930, -0.9935]) - elif model_name == "git-base-textcaps": - expected_slice_logits = torch.tensor([-1.2980, -1.2983, -1.2985]) - elif model_name == "git-base-vqav2": - expected_slice_logits = 
torch.tensor([-0.8570, -0.8568, -0.8561]) - elif model_name == "git-base-textvqa": - expected_slice_logits = torch.tensor([-1.4085, -1.4083, -1.4082]) - elif model_name == "git-base-vatex": - expected_slice_logits = torch.tensor([-1.3451, -1.3447, -1.3447]) - elif model_name == "git-base-msrvtt-qa": - expected_slice_logits = torch.tensor([-0.8554, -0.8550, -0.8540]) - elif model_name == "git-large": - expected_slice_logits = torch.tensor([-1.1708, -1.1707, -1.1705]) - elif model_name == "git-large-coco": - expected_slice_logits = torch.tensor([-1.0425, -1.0423, -1.0422]) - elif model_name == "git-large-textcaps": - expected_slice_logits = torch.tensor([-1.2705, -1.2708, -1.2706]) - elif model_name == "git-large-vqav2": - expected_slice_logits = torch.tensor([-0.7042, -0.7043, -0.7043]) - elif model_name == "git-large-textvqa": - expected_slice_logits = torch.tensor([-0.8590, -0.8592, -0.8590]) - elif model_name == "git-large-vatex": - expected_slice_logits = torch.tensor([-1.0113, -1.0114, -1.0113]) - elif model_name == "git-large-msrvtt-qa": - expected_slice_logits = torch.tensor([0.0130, 0.0134, 0.0131]) - elif model_name == "git-large-r": - expected_slice_logits = torch.tensor([-1.1283, -1.1285, -1.1286]) - elif model_name == "git-large-r-coco": - expected_slice_logits = torch.tensor([-0.9641, -0.9641, -0.9641]) - elif model_name == "git-large-r-textcaps": - expected_slice_logits = torch.tensor([-1.1121, -1.1120, -1.1124]) - - assert torch.allclose(logits[0, -1, :3], expected_slice_logits, atol=1e-4) - print("Looks ok!") - - prompt = "" - if "textvqa" in model_name: - prompt = "what does the front of the bus say at the top?" - elif "msrvtt-qa" in model_name: - prompt = "what does the woman eat?" - elif "vqa" in model_name: - prompt = "what are the cats doing?" 
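For context, the generation check below mirrors how a converted checkpoint is exercised once it is available on the Hub. A minimal captioning sketch, assuming the converted weights have already been pushed under microsoft/git-base and using any RGB test image:

import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

processor = AutoProcessor.from_pretrained("microsoft/git-base")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")

# any RGB image works; this is the standard COCO test image
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
pixel_values = processor(images=image, return_tensors="pt").pixel_values

with torch.no_grad():
    generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])

For VQA-style checkpoints the question is tokenized and prepended (as done right below), so the model completes the answer after the prompt.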
- input_ids = tokenizer(prompt, add_special_tokens=False).input_ids - input_ids = [processor.tokenizer.cls_token_id] + input_ids - input_ids = torch.tensor(input_ids).unsqueeze(0) - print("Generating caption...") - generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50) - print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True)) - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor of {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and processor of {model_name} to the hub...") - model.push_to_hub(f"microsoft/{model_name}") - processor.push_to_hub(f"microsoft/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="git-base", - type=str, - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub.", - ) - - args = parser.parse_args() - convert_git_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py deleted file mode 100644 index 1053f984d7f0..000000000000 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ /dev/null @@ -1,195 +0,0 @@ -import argparse -import json -import os -import re - -import torch -from safetensors.torch import load_file -from tokenizers import processors - -from transformers import GlmConfig, GlmForCausalLM, PreTrainedTokenizerFast - - -# fmt: off -# `None` means we drop the key -STATE_DICT_MAPPING = { - # CausalLM keys - r"transformer.output_layer.weight": r"lm_head.weight", - - # Model keys - r"transformer.embedding.word_embeddings.weight": r"model.embed_tokens.weight", - r"transformer.rotary_pos_emb.inv_freq": None, - r"transformer.encoder.final_layernorm.weight": r"model.norm.weight", - - # Layers keys - r"transformer.encoder.layers.(\d+).input_layernorm.weight": r"model.layers.\1.input_layernorm.weight", - r"transformer.encoder.layers.(\d+).post_attention_layernorm.weight": r"model.layers.\1.post_attention_layernorm.weight", - - # Attention keys - r"transformer.encoder.layers.(\d+).self_attention.dense.weight": r"model.layers.\1.self_attn.o_proj.weight", - # qkv_proj will later be split in q|k|v|_proj - r"transformer.encoder.layers.(\d+).self_attention.query_key_value.(weight|bias)": r"model.layers.\1.self_attn.qkv_proj.\2", - - # MLP keys - r"transformer.encoder.layers.(\d+).mlp.dense_h_to_4h.weight": r"model.layers.\1.mlp.gate_up_proj.weight", - r"transformer.encoder.layers.(\d+).mlp.dense_4h_to_h.weight": r"model.layers.\1.mlp.down_proj.weight", -} -# fmt: on - - -def load_weights(input_dir: str): - safetensor_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")] - bin_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".bin")] - - all_weights = {} - - if safetensor_files: - safetensor_files = sorted(safetensor_files, key=lambda x: int(x.rsplit("-", 3)[1])) - for file in 
safetensor_files: - tensors = load_file(file) - all_weights.update(tensors) - return all_weights - - elif bin_files: - bin_files = sorted(bin_files, key=lambda x: int(x.rsplit("-", 3)[1])) - for file in bin_files: - tensors = torch.load(file, map_location="cpu") - all_weights.update(tensors) - return all_weights - - else: - raise ValueError("No .safetensors or .bin files found in the specified directory.") - - -def map_old_key_to_new(old_key): - for pattern, replacement in STATE_DICT_MAPPING.items(): - if replacement is None: - if re.fullmatch(pattern, old_key): - return None - else: - new_key, n_replace = re.subn(pattern, replacement, old_key) - # Early exit of the loop - if n_replace > 0: - return new_key - - raise ValueError(f"Key: {old_key} could not be mapped (check the mapping).") - - -def convert_state_dict(original_state_dict: dict, config: GlmConfig): - new_dict = {} - - head_dim = config.hidden_size // config.num_attention_heads - query_size = config.num_attention_heads * head_dim - kv_size = config.num_key_value_heads * head_dim - - for old_key, value in original_state_dict.items(): - new_key = map_old_key_to_new(old_key) - if new_key is None: - continue - - if "qkv_proj." in new_key: - q_proj, k_proj, v_proj = ( - value[:query_size, ...], - value[query_size : query_size + kv_size, ...], - value[query_size + kv_size :, ...], - ) - new_dict[new_key.replace("qkv_proj.", "q_proj.")] = q_proj - new_dict[new_key.replace("qkv_proj.", "k_proj.")] = k_proj - new_dict[new_key.replace("qkv_proj.", "v_proj.")] = v_proj - else: - new_dict[new_key] = value - return new_dict - - -def convert_config(original_config: dict): - key_mapping = { - "vocab_size": "padded_vocab_size", - "intermediate_size": "ffn_hidden_size", - "num_hidden_layers": "num_layers", - "max_position_embeddings": "seq_length", - "rms_norm_eps": "layernorm_epsilon", - "head_dim": "kv_channels", - "attention_bias": "add_qkv_bias", - } - similar_keys_to_keep = [ - "num_attention_heads", - "hidden_size", - "attention_dropout", - "use_cache", - "eos_token_id", - "pad_token_id", - "tie_word_embeddings", - ] - new_config_kwargs = {k: original_config[v] for k, v in key_mapping.items()} - new_config_kwargs.update({k: v for k, v in original_config.items() if k in similar_keys_to_keep}) - new_config_kwargs["num_key_value_heads"] = ( - new_config_kwargs["num_attention_heads"] - if not original_config["multi_query_attention"] - else original_config["multi_query_group_num"] - ) - new_config_kwargs["rope_theta"] = 10000.0 * getattr(original_config, "rope_ratio", 1) - - new_config = GlmConfig(**new_config_kwargs) - return new_config - - -def convert_glm_tokenizer(input_dir, use_post_processor=False): - fast_tok = PreTrainedTokenizerFast.from_pretrained(input_dir, model_input_names=["input_ids", "attention_mask"]) - if use_post_processor: - fast_tok._tokenizer.post_processor = processors.Sequence( - [ - processors.ByteLevel(trim_offsets=False), - processors.TemplateProcessing( - single="[gMASK]:0 :0 $A:0", - pair="[gMASK]:0 :0 $A:0 $B:1", - special_tokens=[("[gMASK]", 151331), ("", 151333)], - ), - ], - ) - else: - fast_tok._tokenizer.post_processor = processors.Sequence( - [processors.ByteLevel(trim_offsets=False)], - ) - return fast_tok - - -def convert_glm_model(input_dir, output_dir, use_post_processor=False): - # Load and convert config - with open(os.path.join(input_dir, "config.json")) as f: - original_config = json.load(f) - config = convert_config(original_config) - config.save_pretrained(output_dir) - - # Load and convert weights 
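# load_weights above gathers the sharded *.safetensors (or *.bin) shards into a single dict;
# convert_state_dict then remaps each key through the STATE_DICT_MAPPING regex table and
# splits the fused query_key_value projection into separate q_proj / k_proj / v_proj tensors.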
- original_state_dict = load_weights(input_dir) - new_dict = convert_state_dict(original_state_dict, config) - with torch.device("meta"): - model = GlmForCausalLM(config) - model.load_state_dict(new_dict, strict=True, assign=True) - model.save_pretrained(output_dir) - - # Load and convert tokenizer - tokenizer = convert_glm_tokenizer(input_dir, use_post_processor) - tokenizer.save_pretrained(output_dir) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "input_dir", - type=str, - help="Location of the local folder copied from the Hub.", - ) - parser.add_argument( - "output_dir", - type=str, - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--use_post_processor", - action="store_true", - help="Whether to apply post processor with special tokens", - ) - - args = parser.parse_args() - convert_glm_model(args.input_dir, args.output_dir, args.use_post_processor) diff --git a/src/transformers/models/glpn/convert_glpn_to_pytorch.py b/src/transformers/models/glpn/convert_glpn_to_pytorch.py deleted file mode 100644 index e19ee9381980..000000000000 --- a/src/transformers/models/glpn/convert_glpn_to_pytorch.py +++ /dev/null @@ -1,218 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
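Before the GLPN converter that follows, a toy illustration of the only non-trivial tensor surgery in the GLM script above: convert_state_dict slices the fused query_key_value projection into separate q/k/v tensors. All dimensions here are hypothetical and only chosen to keep the shapes readable:

import torch

# hypothetical GLM-style sizes: 4 query heads, 2 key/value heads, head_dim 8
num_attention_heads, num_key_value_heads, head_dim, hidden = 4, 2, 8, 32
query_size = num_attention_heads * head_dim   # 32
kv_size = num_key_value_heads * head_dim      # 16

# fused [q; k; v] projection weight, as stored in the original checkpoint
fused = torch.randn(query_size + 2 * kv_size, hidden)
q_proj = fused[:query_size, :]
k_proj = fused[query_size : query_size + kv_size, :]
v_proj = fused[query_size + kv_size :, :]
assert q_proj.shape == (32, 32) and k_proj.shape == (16, 32) and v_proj.shape == (16, 32)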
-"""Convert GLPN checkpoints.""" - -import argparse -from collections import OrderedDict -from pathlib import Path - -import requests -import torch -from PIL import Image - -from transformers import GLPNConfig, GLPNForDepthEstimation, GLPNImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def rename_keys(state_dict): - new_state_dict = OrderedDict() - for key, value in state_dict.items(): - if key.startswith("module.encoder"): - key = key.replace("module.encoder", "glpn.encoder") - if key.startswith("module.decoder"): - key = key.replace("module.decoder", "decoder.stages") - if "patch_embed" in key: - # replace for example patch_embed1 by patch_embeddings.0 - idx = key[key.find("patch_embed") + len("patch_embed")] - key = key.replace(f"patch_embed{idx}", f"patch_embeddings.{int(idx)-1}") - if "norm" in key: - key = key.replace("norm", "layer_norm") - if "glpn.encoder.layer_norm" in key: - # replace for example layer_norm1 by layer_norm.0 - idx = key[key.find("glpn.encoder.layer_norm") + len("glpn.encoder.layer_norm")] - key = key.replace(f"layer_norm{idx}", f"layer_norm.{int(idx)-1}") - if "layer_norm1" in key: - key = key.replace("layer_norm1", "layer_norm_1") - if "layer_norm2" in key: - key = key.replace("layer_norm2", "layer_norm_2") - if "block" in key: - # replace for example block1 by block.0 - idx = key[key.find("block") + len("block")] - key = key.replace(f"block{idx}", f"block.{int(idx)-1}") - if "attn.q" in key: - key = key.replace("attn.q", "attention.self.query") - if "attn.proj" in key: - key = key.replace("attn.proj", "attention.output.dense") - if "attn" in key: - key = key.replace("attn", "attention.self") - if "fc1" in key: - key = key.replace("fc1", "dense1") - if "fc2" in key: - key = key.replace("fc2", "dense2") - if "linear_pred" in key: - key = key.replace("linear_pred", "classifier") - if "linear_fuse" in key: - key = key.replace("linear_fuse.conv", "linear_fuse") - key = key.replace("linear_fuse.bn", "batch_norm") - if "linear_c" in key: - # replace for example linear_c4 by linear_c.3 - idx = key[key.find("linear_c") + len("linear_c")] - key = key.replace(f"linear_c{idx}", f"linear_c.{int(idx)-1}") - if "bot_conv" in key: - key = key.replace("bot_conv", "0.convolution") - if "skip_conv1" in key: - key = key.replace("skip_conv1", "1.convolution") - if "skip_conv2" in key: - key = key.replace("skip_conv2", "2.convolution") - if "fusion1" in key: - key = key.replace("fusion1", "1.fusion") - if "fusion2" in key: - key = key.replace("fusion2", "2.fusion") - if "fusion3" in key: - key = key.replace("fusion3", "3.fusion") - if "fusion" in key and "conv" in key: - key = key.replace("conv", "convolutional_layer") - if key.startswith("module.last_layer_depth"): - key = key.replace("module.last_layer_depth", "head.head") - new_state_dict[key] = value - - return new_state_dict - - -def read_in_k_v(state_dict, config): - # for each of the encoder blocks: - for i in range(config.num_encoder_blocks): - for j in range(config.depths[i]): - # read in weights + bias of keys and values (which is a single matrix in the original implementation) - kv_weight = state_dict.pop(f"glpn.encoder.block.{i}.{j}.attention.self.kv.weight") - kv_bias = state_dict.pop(f"glpn.encoder.block.{i}.{j}.attention.self.kv.bias") - # next, add keys and values (in that order) to the state dict - state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.key.weight"] = kv_weight[ - : config.hidden_sizes[i], : - ] - 
state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.key.bias"] = kv_bias[: config.hidden_sizes[i]] - state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.value.weight"] = kv_weight[ - config.hidden_sizes[i] :, : - ] - state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.value.bias"] = kv_bias[config.hidden_sizes[i] :] - - -# We will verify our results on a COCO image -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - return image - - -@torch.no_grad() -def convert_glpn_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_hub=False, model_name=None): - """ - Copy/paste/tweak model's weights to our GLPN structure. - """ - - # load GLPN configuration (Segformer-B4 size) - config = GLPNConfig(hidden_sizes=[64, 128, 320, 512], decoder_hidden_size=64, depths=[3, 8, 27, 3]) - - # load image processor (only resize + rescale) - image_processor = GLPNImageProcessor() - - # prepare image - image = prepare_img() - pixel_values = image_processor(images=image, return_tensors="pt").pixel_values - - logger.info("Converting model...") - - # load original state dict - state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu")) - - # rename keys - state_dict = rename_keys(state_dict) - - # key and value matrices need special treatment - read_in_k_v(state_dict, config) - - # create HuggingFace model and load state dict - model = GLPNForDepthEstimation(config) - model.load_state_dict(state_dict) - model.eval() - - # forward pass - outputs = model(pixel_values) - predicted_depth = outputs.predicted_depth - - # verify output - if model_name is not None: - if "nyu" in model_name: - expected_slice = torch.tensor( - [[4.4147, 4.0873, 4.0673], [3.7890, 3.2881, 3.1525], [3.7674, 3.5423, 3.4913]] - ) - elif "kitti" in model_name: - expected_slice = torch.tensor( - [[3.4291, 2.7865, 2.5151], [3.2841, 2.7021, 2.3502], [3.1147, 2.4625, 2.2481]] - ) - else: - raise ValueError(f"Unknown model name: {model_name}") - - expected_shape = torch.Size([1, 480, 640]) - - assert predicted_depth.shape == expected_shape - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - # finally, push to hub if required - if push_to_hub: - logger.info("Pushing model and image processor to the hub...") - model.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add model", - use_temp_dir=True, - ) - image_processor.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add image processor", - use_temp_dir=True, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", - default=None, - type=str, - help="Path to the original PyTorch checkpoint (.pth file).", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether to upload the model to the HuggingFace hub." 
- ) - parser.add_argument( - "--model_name", - default="glpn-kitti", - type=str, - help="Name of the model in case you're pushing to the hub.", - ) - args = parser.parse_args() - convert_glpn_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name) diff --git a/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index 33f9dabed07f..000000000000 --- a/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert OpenAI GPT checkpoint.""" - -import argparse - -import torch - -from transformers import GPT2Config, GPT2Model, load_tf_weights_in_gpt2 -from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging - - -logging.set_verbosity_info() - - -def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): - # Construct model - if gpt2_config_file == "": - config = GPT2Config() - else: - config = GPT2Config.from_json_file(gpt2_config_file) - model = GPT2Model(config) - - # Load weights from numpy - load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) - - # Save pytorch-model - pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME - print(f"Save PyTorch model to {pytorch_weights_dump_path}") - torch.save(model.state_dict(), pytorch_weights_dump_path) - print(f"Save configuration file to {pytorch_config_dump_path}") - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--gpt2_config_file", - default="", - type=str, - help=( - "An optional config json file corresponding to the pre-trained OpenAI model. \n" - "This specifies the model architecture." - ), - ) - args = parser.parse_args() - convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py b/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py deleted file mode 100644 index 3db22857293c..000000000000 --- a/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Eleuther AI and HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert GPT Neo checkpoint.""" - -import argparse -import json - -from transformers import GPTNeoConfig, GPTNeoForCausalLM, load_tf_weights_in_gpt_neo -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): - # Initialise PyTorch model - config_json = json.load(open(config_file, "r")) - config = GPTNeoConfig( - hidden_size=config_json["n_embd"], - num_layers=config_json["n_layer"], - num_heads=config_json["n_head"], - attention_types=config_json["attention_types"], - max_position_embeddings=config_json["n_positions"], - resid_dropout=config_json["res_dropout"], - embed_dropout=config_json["embed_dropout"], - attention_dropout=config_json["attn_dropout"], - ) - print(f"Building PyTorch model from configuration: {config}") - model = GPTNeoForCausalLM(config) - - # Load weights from tf checkpoint - load_tf_weights_in_gpt_neo(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - model.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained mesh-tf model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py b/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py deleted file mode 100644 index 2625701c1a75..000000000000 --- a/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright 2022 The HuggingFace Inc. team and the AI-Sweden team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
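After running the GPT Neo converter above, a quick smoke test is to reload the dump and generate a few tokens. A sketch, where the dump path is hypothetical and a public GPT Neo repo is used only as a stand-in source for the GPT-2 style tokenizer:

import torch
from transformers import AutoTokenizer, GPTNeoForCausalLM

model = GPTNeoForCausalLM.from_pretrained("path/to/pytorch_dump")        # hypothetical output path
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")     # stand-in tokenizer source

inputs = tokenizer("Hello, my name is", return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(out[0], skip_special_tokens=True))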
-"""Convert GPT-SW3 megatron checkpoints to pytorch""" - -import argparse -import os -from os.path import isfile - -import torch - -from transformers import GPT2Config - - -def recursive_print(name, val, spaces=0): - # Format the message. - if name is None: - msg = None - else: - fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}" - msg = fmt.format(name) - - # Print and recurse (if needed). - if isinstance(val, dict): - if msg is not None: - print(msg) - for k in val.keys(): - recursive_print(k, val[k], spaces + 2) - elif isinstance(val, torch.Tensor): - print(msg, ":", val.size()) - else: - print(msg, ":", val) - - -def fix_query_key_value_ordering(param, num_splits, num_heads, hidden_size): - # Permutes layout of param tensor to [num_splits * num_heads * hidden_size, :] - # for compatibility with later versions of NVIDIA Megatron-LM. - # The inverse operation is performed inside Megatron-LM to read checkpoints: - # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/checkpointing.py#L209 - # If param is the weight tensor of the self-attention block, the returned tensor - # will have to be transposed one more time to be read by HuggingFace GPT2. - input_shape = param.size() - # other versions store [num_heads * num_splits * hidden_size, :] - saved_shape = (num_heads, num_splits, hidden_size) + input_shape[1:] - param = param.view(*saved_shape) - param = param.transpose(0, 1).contiguous() - param = param.view(*input_shape) - return param - - -def convert_megatron_checkpoint(sd_megatron, config): - """ - Converts a Megatron checkpoint to a HuggingFace GPT-SW3 checkpoint. - """ - n_positions = config.n_positions - layers = config.n_layer - vocab_size = config.vocab_size - heads = config.n_head - hidden_size_per_head = config.n_embd // config.n_head - - word_embeddings = sd_megatron["model.language_model.embedding.word_embeddings.weight"][:vocab_size, :] - sd_hf = { - "transformer.wte.weight": word_embeddings, - "transformer.wpe.weight": sd_megatron["model.language_model.embedding.position_embeddings.weight"], - "transformer.ln_f.weight": sd_megatron["model.language_model.encoder.final_layernorm.weight"], - "transformer.ln_f.bias": sd_megatron["model.language_model.encoder.final_layernorm.bias"], - } - - pf = "model.language_model.encoder.layers." 
- for i in range(layers): - causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.bool)) - causal_mask = causal_mask.view(1, 1, n_positions, n_positions) - sd_hf[f"transformer.h.{i}.attn.bias"] = causal_mask - sd_hf[f"transformer.h.{i}.attn.masked_bias"] = torch.tensor(-1e4, dtype=torch.bfloat16) - - sd_hf[f"transformer.h.{i}.ln_1.weight"] = sd_megatron[f"{pf}{i}.input_layernorm.weight"] - sd_hf[f"transformer.h.{i}.ln_1.bias"] = sd_megatron[f"{pf}{i}.input_layernorm.bias"] - - val1 = sd_megatron[f"{pf}{i}.self_attention.query_key_value.weight"] - val1 = fix_query_key_value_ordering(val1, 3, heads, hidden_size_per_head) - sd_hf[f"transformer.h.{i}.attn.c_attn.weight"] = val1.transpose(0, 1).contiguous() - - val2 = sd_megatron[f"{pf}{i}.self_attention.query_key_value.bias"] - val2 = fix_query_key_value_ordering(val2, 3, heads, hidden_size_per_head) - sd_hf[f"transformer.h.{i}.attn.c_attn.bias"] = val2 - - sd_hf[f"transformer.h.{i}.attn.c_proj.weight"] = sd_megatron[f"{pf}{i}.self_attention.dense.weight"].transpose( - 0, 1 - ) - sd_hf[f"transformer.h.{i}.attn.c_proj.bias"] = sd_megatron[f"{pf}{i}.self_attention.dense.bias"] - sd_hf[f"transformer.h.{i}.ln_2.weight"] = sd_megatron[f"{pf}{i}.post_attention_layernorm.weight"] - sd_hf[f"transformer.h.{i}.ln_2.bias"] = sd_megatron[f"{pf}{i}.post_attention_layernorm.bias"] - sd_hf[f"transformer.h.{i}.mlp.c_fc.weight"] = sd_megatron[f"{pf}{i}.mlp.dense_h_to_4h.weight"].transpose(0, 1) - sd_hf[f"transformer.h.{i}.mlp.c_fc.bias"] = sd_megatron[f"{pf}{i}.mlp.dense_h_to_4h.bias"] - sd_hf[f"transformer.h.{i}.mlp.c_proj.weight"] = sd_megatron[f"{pf}{i}.mlp.dense_4h_to_h.weight"].transpose( - 0, 1 - ) - sd_hf[f"transformer.h.{i}.mlp.c_proj.bias"] = sd_megatron[f"{pf}{i}.mlp.dense_4h_to_h.bias"] - - # For LM head, transformers' wants the matrix to weight embeddings. - sd_hf["lm_head.weight"] = word_embeddings - - return sd_hf - - -def copy_config(config_hf, config_megatron): - """Copy the config from Megatron to hf.""" - config_hf.vocab_size = 64000 - config_hf.n_positions = config_megatron["encoder_seq_length"] - config_hf.n_embd = config_megatron["hidden_size"] - config_hf.n_layer = config_megatron["num_layers"] - config_hf.n_head = config_megatron["num_attention_heads"] - config_hf.n_inner = config_megatron["ffn_hidden_size"] - config_hf.activation_function = "gelu" - config_hf.resid_pdrop = 0.1 - config_hf.embd_pdrop = 0.1 - config_hf.attn_pdrop = 0.1 - config_hf.layer_norm_epsilon = config_megatron["layernorm_epsilon"] # 1e-5 - config_hf.initializer_range = config_megatron["init_method_std"] # 0.02 - config_hf.apply_query_key_layer_scaling = config_megatron["apply_query_key_layer_scaling"] # True - config_hf.normalize_attention_scores = True - config_hf.use_cache = True - - # This identifies the 6.7B (7B) model which uses a different tokenizer - if config_megatron["hidden_size"] == 4096: - config_hf.bos_token_id = 1 # <|endoftext|> - config_hf.eos_token_id = 1 # <|endoftext|> - config_hf.pad_token_id = 0 # - else: - config_hf.bos_token_id = 2 # - config_hf.eos_token_id = 3 # <|endoftext|> - config_hf.pad_token_id = 0 # - - return config_hf - - -def main(args): - print(args) - - checkpoint_path = args.checkpoint_path - save_path = args.save_path - if isfile(checkpoint_path): - raise FileNotFoundError(f"ERROR! could not find file {checkpoint_path}") - - # Load the model. - checkpoint = torch.load(checkpoint_path, map_location="cpu") - - # Load the config. 
- config_megatron = checkpoint["hyper_parameters"]["cfg"] - config_hf = GPT2Config() - config_hf = copy_config(config_hf=config_hf, config_megatron=config_megatron) - config_hf.architectures = ["GPT2LMHeadModel"] - - sd_megatron = checkpoint["state_dict"] - - # Convert. - print("Converting") - sd_hf = convert_megatron_checkpoint(sd_megatron, config_hf) - - # Print the structure of converted state dict. - if args.print_checkpoint_structure: - recursive_print(None, sd_hf) - - config_hf.tokenizer_class = "GPTSw3Tokenizer" - - # Store the config to file. - print("Saving config") - config_hf.save_pretrained(save_path) - - # Store the state_dict to file. - output_checkpoint_file = os.path.join(save_path, "pytorch_model.bin") - print(f'Saving checkpoint to "{output_checkpoint_file}"') - torch.save(sd_hf, output_checkpoint_file) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--checkpoint_path", - type=str, - required=True, - help="e.g. megatron_gpt--val_loss=2.42-step=38000-consumed_samples=54720000", - ) - parser.add_argument("--save_path", type=str, required=True, help="e.g. /home/user/gpt-sw3/hf") - parser.add_argument("--print-checkpoint-structure", action="store_true") - _args = parser.parse_args() - main(_args) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py deleted file mode 100644 index ac8e82bfd825..000000000000 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ /dev/null @@ -1,491 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Grounding DINO checkpoints from the original repository. 
- -URL: https://github.com/IDEA-Research/GroundingDINO""" - -import argparse - -import requests -import torch -from PIL import Image -from torchvision import transforms as T - -from transformers import ( - AutoTokenizer, - GroundingDinoConfig, - GroundingDinoForObjectDetection, - GroundingDinoImageProcessor, - GroundingDinoProcessor, - SwinConfig, -) - - -IMAGENET_MEAN = [0.485, 0.456, 0.406] -IMAGENET_STD = [0.229, 0.224, 0.225] - - -def get_grounding_dino_config(model_name): - if "tiny" in model_name: - window_size = 7 - embed_dim = 96 - depths = (2, 2, 6, 2) - num_heads = (3, 6, 12, 24) - image_size = 224 - elif "base" in model_name: - window_size = 12 - embed_dim = 128 - depths = (2, 2, 18, 2) - num_heads = (4, 8, 16, 32) - image_size = 384 - else: - raise ValueError("Model not supported, only supports base and large variants") - - backbone_config = SwinConfig( - window_size=window_size, - image_size=image_size, - embed_dim=embed_dim, - depths=depths, - num_heads=num_heads, - out_indices=[2, 3, 4], - ) - - config = GroundingDinoConfig(backbone_config=backbone_config) - - return config - - -def create_rename_keys(state_dict, config): - rename_keys = [] - # fmt: off - ########################################## VISION BACKBONE - START - # patch embedding layer - rename_keys.append(("backbone.0.patch_embed.proj.weight", - "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("backbone.0.patch_embed.proj.bias", - "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("backbone.0.patch_embed.norm.weight", - "model.backbone.conv_encoder.model.embeddings.norm.weight")) - rename_keys.append(("backbone.0.patch_embed.norm.bias", - "model.backbone.conv_encoder.model.embeddings.norm.bias")) - - for layer, depth in enumerate(config.backbone_config.depths): - for block in range(depth): - # layernorms - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) - - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) - # attention - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) - # intermediate - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", - 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) - - # output - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) - - # downsample - if layer!=len(config.backbone_config.depths)-1: - rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) - - for out_indice in config.backbone_config.out_indices: - # Grounding DINO implementation of out_indices isn't aligned with transformers - rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", - f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) - rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", - f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) - - ########################################## VISION BACKBONE - END - - ########################################## ENCODER - START - deformable_key_mappings = { - 'self_attn.sampling_offsets.weight': 'deformable_layer.self_attn.sampling_offsets.weight', - 'self_attn.sampling_offsets.bias': 'deformable_layer.self_attn.sampling_offsets.bias', - 'self_attn.attention_weights.weight': 'deformable_layer.self_attn.attention_weights.weight', - 'self_attn.attention_weights.bias': 'deformable_layer.self_attn.attention_weights.bias', - 'self_attn.value_proj.weight': 'deformable_layer.self_attn.value_proj.weight', - 'self_attn.value_proj.bias': 'deformable_layer.self_attn.value_proj.bias', - 'self_attn.output_proj.weight': 'deformable_layer.self_attn.output_proj.weight', - 'self_attn.output_proj.bias': 'deformable_layer.self_attn.output_proj.bias', - 'norm1.weight': 'deformable_layer.self_attn_layer_norm.weight', - 'norm1.bias': 'deformable_layer.self_attn_layer_norm.bias', - 'linear1.weight': 'deformable_layer.fc1.weight', - 'linear1.bias': 'deformable_layer.fc1.bias', - 'linear2.weight': 'deformable_layer.fc2.weight', - 'linear2.bias': 'deformable_layer.fc2.bias', - 'norm2.weight': 'deformable_layer.final_layer_norm.weight', - 'norm2.bias': 'deformable_layer.final_layer_norm.bias', - } - text_enhancer_key_mappings = { - 'self_attn.in_proj_weight': 'text_enhancer_layer.self_attn.in_proj_weight', - 'self_attn.in_proj_bias': 'text_enhancer_layer.self_attn.in_proj_bias', - 'self_attn.out_proj.weight': 'text_enhancer_layer.self_attn.out_proj.weight', - 'self_attn.out_proj.bias': 'text_enhancer_layer.self_attn.out_proj.bias', - 'linear1.weight': 'text_enhancer_layer.fc1.weight', - 'linear1.bias': 'text_enhancer_layer.fc1.bias', - 'linear2.weight': 'text_enhancer_layer.fc2.weight', - 'linear2.bias': 'text_enhancer_layer.fc2.bias', - 'norm1.weight': 'text_enhancer_layer.layer_norm_before.weight', - 'norm1.bias': 'text_enhancer_layer.layer_norm_before.bias', - 'norm2.weight': 'text_enhancer_layer.layer_norm_after.weight', - 'norm2.bias': 
'text_enhancer_layer.layer_norm_after.bias', - } - fusion_key_mappings = { - 'gamma_v': 'fusion_layer.vision_param', - 'gamma_l': 'fusion_layer.text_param', - 'layer_norm_v.weight': 'fusion_layer.layer_norm_vision.weight', - 'layer_norm_v.bias': 'fusion_layer.layer_norm_vision.bias', - 'layer_norm_l.weight': 'fusion_layer.layer_norm_text.weight', - 'layer_norm_l.bias': 'fusion_layer.layer_norm_text.bias', - 'attn.v_proj.weight': 'fusion_layer.attn.vision_proj.weight', - 'attn.v_proj.bias': 'fusion_layer.attn.vision_proj.bias', - 'attn.l_proj.weight': 'fusion_layer.attn.text_proj.weight', - 'attn.l_proj.bias': 'fusion_layer.attn.text_proj.bias', - 'attn.values_v_proj.weight': 'fusion_layer.attn.values_vision_proj.weight', - 'attn.values_v_proj.bias': 'fusion_layer.attn.values_vision_proj.bias', - 'attn.values_l_proj.weight': 'fusion_layer.attn.values_text_proj.weight', - 'attn.values_l_proj.bias': 'fusion_layer.attn.values_text_proj.bias', - 'attn.out_v_proj.weight': 'fusion_layer.attn.out_vision_proj.weight', - 'attn.out_v_proj.bias': 'fusion_layer.attn.out_vision_proj.bias', - 'attn.out_l_proj.weight': 'fusion_layer.attn.out_text_proj.weight', - 'attn.out_l_proj.bias': 'fusion_layer.attn.out_text_proj.bias', - } - for layer in range(config.encoder_layers): - # deformable - for src, dest in deformable_key_mappings.items(): - rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", - f"model.encoder.layers.{layer}.{dest}")) - # text enhance - for src, dest in text_enhancer_key_mappings.items(): - rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", - f"model.encoder.layers.{layer}.{dest}")) - # fusion layers - for src, dest in fusion_key_mappings.items(): - rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", - f"model.encoder.layers.{layer}.{dest}")) - ########################################## ENCODER - END - - ########################################## DECODER - START - key_mappings_decoder = { - 'cross_attn.sampling_offsets.weight': 'encoder_attn.sampling_offsets.weight', - 'cross_attn.sampling_offsets.bias': 'encoder_attn.sampling_offsets.bias', - 'cross_attn.attention_weights.weight': 'encoder_attn.attention_weights.weight', - 'cross_attn.attention_weights.bias': 'encoder_attn.attention_weights.bias', - 'cross_attn.value_proj.weight': 'encoder_attn.value_proj.weight', - 'cross_attn.value_proj.bias': 'encoder_attn.value_proj.bias', - 'cross_attn.output_proj.weight': 'encoder_attn.output_proj.weight', - 'cross_attn.output_proj.bias': 'encoder_attn.output_proj.bias', - 'norm1.weight': 'encoder_attn_layer_norm.weight', - 'norm1.bias': 'encoder_attn_layer_norm.bias', - 'ca_text.in_proj_weight': 'encoder_attn_text.in_proj_weight', - 'ca_text.in_proj_bias': 'encoder_attn_text.in_proj_bias', - 'ca_text.out_proj.weight': 'encoder_attn_text.out_proj.weight', - 'ca_text.out_proj.bias': 'encoder_attn_text.out_proj.bias', - 'catext_norm.weight': 'encoder_attn_text_layer_norm.weight', - 'catext_norm.bias': 'encoder_attn_text_layer_norm.bias', - 'self_attn.in_proj_weight': 'self_attn.in_proj_weight', - 'self_attn.in_proj_bias': 'self_attn.in_proj_bias', - 'self_attn.out_proj.weight': 'self_attn.out_proj.weight', - 'self_attn.out_proj.bias': 'self_attn.out_proj.bias', - 'norm2.weight': 'self_attn_layer_norm.weight', - 'norm2.bias': 'self_attn_layer_norm.bias', - 'linear1.weight': 'fc1.weight', - 'linear1.bias': 'fc1.bias', - 'linear2.weight': 'fc2.weight', - 'linear2.bias': 'fc2.bias', - 'norm3.weight': 'final_layer_norm.weight', - 'norm3.bias': 
'final_layer_norm.bias', - } - for layer_num in range(config.decoder_layers): - source_prefix_decoder = f'transformer.decoder.layers.{layer_num}.' - target_prefix_decoder = f'model.decoder.layers.{layer_num}.' - - for source_name, target_name in key_mappings_decoder.items(): - rename_keys.append((source_prefix_decoder + source_name, - target_prefix_decoder + target_name)) - ########################################## DECODER - END - - ########################################## Additional - START - for layer_name, params in state_dict.items(): - #### TEXT BACKBONE - if "bert" in layer_name: - rename_keys.append((layer_name, layer_name.replace("bert", "model.text_backbone"))) - #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE - if "input_proj" in layer_name: - rename_keys.append((layer_name, layer_name.replace("input_proj", "model.input_proj_vision"))) - #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE - if "feat_map" in layer_name: - rename_keys.append((layer_name, layer_name.replace("feat_map", "model.text_projection"))) - #### DECODER REFERENCE POINT HEAD - if "transformer.decoder.ref_point_head" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", - "model.decoder.reference_points_head"))) - #### DECODER BBOX EMBED - if "transformer.decoder.bbox_embed" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", - "model.decoder.bbox_embed"))) - if "transformer.enc_output" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer", "model"))) - - if "transformer.enc_out_bbox_embed" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", - "model.encoder_output_bbox_embed"))) - - rename_keys.append(("transformer.level_embed", "model.level_embed")) - rename_keys.append(("transformer.decoder.norm.weight", "model.decoder.layer_norm.weight")) - rename_keys.append(("transformer.decoder.norm.bias", "model.decoder.layer_norm.bias")) - rename_keys.append(("transformer.tgt_embed.weight", "model.query_position_embeddings.weight")) - ########################################## Additional - END - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v_encoder(state_dict, config): - ########################################## VISION BACKBONE - START - embed_dim = config.backbone_config.embed_dim - for layer, depth in enumerate(config.backbone_config.depths): - hidden_size = embed_dim * 2**layer - for block in range(depth): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight" - ] = in_proj_weight[:hidden_size, :] - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias" - ] = in_proj_bias[:hidden_size] - - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight" - ] = in_proj_weight[hidden_size : hidden_size * 2, :] - state_dict[ - 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias" - ] = in_proj_bias[hidden_size : hidden_size * 2] - - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight" - ] = in_proj_weight[-hidden_size:, :] - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias" - ] = in_proj_bias[-hidden_size:] - ########################################## VISION BACKBONE - END - - -def read_in_q_k_v_text_enhancer(state_dict, config): - hidden_size = config.hidden_size - for idx in range(config.encoder_layers): - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.query.weight"] = in_proj_weight[ - :hidden_size, : - ] - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.query.bias"] = in_proj_bias[:hidden_size] - - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.key.bias"] = in_proj_bias[ - hidden_size : hidden_size * 2 - ] - - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.value.weight"] = in_proj_weight[ - -hidden_size:, : - ] - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.value.bias"] = in_proj_bias[ - -hidden_size: - ] - - -def read_in_q_k_v_decoder(state_dict, config): - hidden_size = config.hidden_size - for idx in range(config.decoder_layers): - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"model.decoder.layers.{idx}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"model.decoder.layers.{idx}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.decoder.layers.{idx}.self_attn.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"model.decoder.layers.{idx}.self_attn.query.bias"] = in_proj_bias[:hidden_size] - - state_dict[f"model.decoder.layers.{idx}.self_attn.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.decoder.layers.{idx}.self_attn.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - - state_dict[f"model.decoder.layers.{idx}.self_attn.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"model.decoder.layers.{idx}.self_attn.value.bias"] = in_proj_bias[-hidden_size:] - - # read in weights + bias of cross-attention - in_proj_weight = state_dict.pop(f"model.decoder.layers.{idx}.encoder_attn_text.in_proj_weight") - in_proj_bias = state_dict.pop(f"model.decoder.layers.{idx}.encoder_attn_text.in_proj_bias") - - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.query.bias"] = in_proj_bias[:hidden_size] - - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.key.weight"] = 
in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.key.bias"] = in_proj_bias[ - hidden_size : hidden_size * 2 - ] - - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.value.bias"] = in_proj_bias[-hidden_size:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image - - -def preprocess_caption(caption: str) -> str: - result = caption.lower().strip() - if result.endswith("."): - return result - return result + "." - - -@torch.no_grad() -def convert_grounding_dino_checkpoint(args): - model_name = args.model_name - pytorch_dump_folder_path = args.pytorch_dump_folder_path - push_to_hub = args.push_to_hub - verify_logits = args.verify_logits - - checkpoint_mapping = { - "grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swint_ogc.pth", - "grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swinb_cogcoor.pth", - } - # Define default GroundingDino configuation - config = get_grounding_dino_config(model_name) - - # Load original checkpoint - checkpoint_url = checkpoint_mapping[model_name] - original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] - original_state_dict = {k.replace("module.", ""): v for k, v in original_state_dict.items()} - - for name, param in original_state_dict.items(): - print(name, param.shape) - - # Rename keys - new_state_dict = original_state_dict.copy() - rename_keys = create_rename_keys(original_state_dict, config) - - for src, dest in rename_keys: - rename_key(new_state_dict, src, dest) - read_in_q_k_v_encoder(new_state_dict, config) - read_in_q_k_v_text_enhancer(new_state_dict, config) - read_in_q_k_v_decoder(new_state_dict, config) - - # Load HF model - model = GroundingDinoForObjectDetection(config) - model.eval() - missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - - # Load and process test image - image = prepare_img() - transforms = T.Compose([T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) - original_pixel_values = transforms(image).unsqueeze(0) - - image_processor = GroundingDinoImageProcessor() - tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") - processor = GroundingDinoProcessor(image_processor=image_processor, tokenizer=tokenizer) - - text = "a cat" - inputs = processor(images=image, text=preprocess_caption(text), return_tensors="pt") - - assert torch.allclose(original_pixel_values, inputs.pixel_values, atol=1e-4) - - if verify_logits: - # Running forward - with torch.no_grad(): - outputs = model(**inputs) - - print(outputs.logits[0, :3, :3]) - - expected_slice = torch.tensor( - [[-4.8913, -0.1900, -0.2161], [-4.9653, -0.3719, -0.3950], [-5.9599, -3.3765, -3.3104]] - ) - - assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model.push_to_hub(f"EduardoPacheco/{model_name}") - 
processor.push_to_hub(f"EduardoPacheco/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="grounding-dino-tiny", - type=str, - choices=["grounding-dino-tiny", "grounding-dino-base"], - help="Name of the GroundingDino model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - parser.add_argument( - "--verify_logits", action="store_false", help="Whether or not to verify logits after conversion." - ) - - args = parser.parse_args() - convert_grounding_dino_checkpoint(args) diff --git a/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py b/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py deleted file mode 100644 index 059f10f6129b..000000000000 --- a/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py +++ /dev/null @@ -1,217 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Convert GroupViT checkpoints from the original repository. 
- -URL: https://github.com/NVlabs/GroupViT -""" - -import argparse - -import requests -import torch -from PIL import Image - -from transformers import CLIPProcessor, GroupViTConfig, GroupViTModel - - -def rename_key(name): - # vision encoder - if "img_encoder.pos_embed" in name: - name = name.replace("img_encoder.pos_embed", "vision_model.embeddings.position_embeddings") - if "img_encoder.patch_embed.proj" in name: - name = name.replace("img_encoder.patch_embed.proj", "vision_model.embeddings.patch_embeddings.projection") - if "img_encoder.patch_embed.norm" in name: - name = name.replace("img_encoder.patch_embed.norm", "vision_model.embeddings.layernorm") - if "img_encoder.layers" in name: - name = name.replace("img_encoder.layers", "vision_model.encoder.stages") - if "blocks" in name and "res" not in name: - name = name.replace("blocks", "layers") - if "attn" in name and "pre_assign" not in name: - name = name.replace("attn", "self_attn") - if "proj" in name and "self_attn" in name and "text" not in name: - name = name.replace("proj", "out_proj") - if "pre_assign_attn.attn.proj" in name: - name = name.replace("pre_assign_attn.attn.proj", "pre_assign_attn.attn.out_proj") - if "norm1" in name: - name = name.replace("norm1", "layer_norm1") - if "norm2" in name and "pre_assign" not in name: - name = name.replace("norm2", "layer_norm2") - if "img_encoder.norm" in name: - name = name.replace("img_encoder.norm", "vision_model.layernorm") - # text encoder - if "text_encoder.token_embedding" in name: - name = name.replace("text_encoder.token_embedding", "text_model.embeddings.token_embedding") - if "text_encoder.positional_embedding" in name: - name = name.replace("text_encoder.positional_embedding", "text_model.embeddings.position_embedding.weight") - if "text_encoder.transformer.resblocks." in name: - name = name.replace("text_encoder.transformer.resblocks.", "text_model.encoder.layers.") - if "ln_1" in name: - name = name.replace("ln_1", "layer_norm1") - if "ln_2" in name: - name = name.replace("ln_2", "layer_norm2") - if "c_fc" in name: - name = name.replace("c_fc", "fc1") - if "c_proj" in name: - name = name.replace("c_proj", "fc2") - if "text_encoder" in name: - name = name.replace("text_encoder", "text_model") - if "ln_final" in name: - name = name.replace("ln_final", "final_layer_norm") - # projection layers - if "img_projector.linear_hidden." in name: - name = name.replace("img_projector.linear_hidden.", "visual_projection.") - if "img_projector.linear_out." 
in name: - name = name.replace("img_projector.linear_out.", "visual_projection.3.") - if "text_projector.linear_hidden" in name: - name = name.replace("text_projector.linear_hidden", "text_projection") - if "text_projector.linear_out" in name: - name = name.replace("text_projector.linear_out", "text_projection.3") - - return name - - -def convert_state_dict(orig_state_dict, config): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - # weights and biases of the key, value and query projections of vision encoder's attention layers require special treatment: - # we need to split them up into separate matrices/vectors - key_split = key.split(".") - stage_num, layer_num = int(key_split[2]), int(key_split[4]) - dim = config.vision_config.hidden_size - if "weight" in key: - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.q_proj.weight" - ] = val[:dim, :] - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.k_proj.weight" - ] = val[dim : dim * 2, :] - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.v_proj.weight" - ] = val[-dim:, :] - else: - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.q_proj.bias" - ] = val[:dim] - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.k_proj.bias" - ] = val[dim : dim * 2] - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.v_proj.bias" - ] = val[-dim:] - elif "in_proj" in key: - # weights and biases of the key, value and query projections of text encoder's attention layers require special treatment: - # we need to split them up into separate matrices/vectors - key_split = key.split(".") - layer_num = int(key_split[3]) - dim = config.text_config.hidden_size - if "weight" in key: - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ - dim : dim * 2, : - ] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] - else: - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] - else: - new_name = rename_key(key) - # squeeze if necessary - if ( - "text_projection.0" in new_name - or "text_projection.3" in new_name - or "visual_projection.0" in new_name - or "visual_projection.3" in new_name - ): - orig_state_dict[new_name] = val.squeeze_() - else: - orig_state_dict[new_name] = val - - return orig_state_dict - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_groupvit_checkpoint( - checkpoint_path, pytorch_dump_folder_path, model_name="groupvit-gcc-yfcc", push_to_hub=False -): - """ - Copy/paste/tweak model's weights to the Transformers design. 
-    """
-    config = GroupViTConfig()
-    model = GroupViTModel(config).eval()
-
-    state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
-    new_state_dict = convert_state_dict(state_dict, config)
-    missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
-    assert missing_keys == ["text_model.embeddings.position_ids"]
-    assert (unexpected_keys == ["multi_label_logit_scale"]) or (len(unexpected_keys) == 0)
-
-    # verify result
-    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
-    image = prepare_img()
-    inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt")
-
-    with torch.no_grad():
-        outputs = model(**inputs)
-
-    if model_name == "groupvit-gcc-yfcc":
-        expected_logits = torch.tensor([[13.3523, 6.3629]])
-    elif model_name == "groupvit-gcc-redcaps":
-        expected_logits = torch.tensor([[16.1873, 8.6230]])
-    else:
-        raise ValueError(f"Model name {model_name} not supported.")
-    assert torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)
-
-    processor.save_pretrained(pytorch_dump_folder_path)
-    model.save_pretrained(pytorch_dump_folder_path)
-    print("Successfully saved processor and model to", pytorch_dump_folder_path)
-
-    if push_to_hub:
-        print("Pushing to the hub...")
-        processor.push_to_hub(model_name, organization="nielsr")
-        model.push_to_hub(model_name, organization="nielsr")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--pytorch_dump_folder_path", default=None, type=str, help="Path to dump the processor and PyTorch model."
-    )
-    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to GroupViT checkpoint")
-    parser.add_argument(
-        "--model_name",
-        default="groupvit-gcc-yfcc",
-        type=str,
-        help="Name of the model. Expecting either 'groupvit-gcc-yfcc' or 'groupvit-gcc-redcaps'",
-    )
-    parser.add_argument(
-        "--push_to_hub",
-        action="store_true",
-        help="Whether or not to push the converted model and processor to the đŸ€— hub using the provided `model_name`.",
-    )
-    args = parser.parse_args()
-
-    convert_groupvit_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.model_name, args.push_to_hub)
diff --git a/src/transformers/models/hiera/convert_hiera_to_hf.py b/src/transformers/models/hiera/convert_hiera_to_hf.py
deleted file mode 100644
index eed27645b344..000000000000
--- a/src/transformers/models/hiera/convert_hiera_to_hf.py
+++ /dev/null
@@ -1,369 +0,0 @@
-# coding=utf-8
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert Hiera checkpoints from the original repository.
- -URL: https://github.com/facebookresearch/hiera -""" - -import argparse -import json -import math -from typing import Dict, Tuple - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import BitImageProcessor, HieraConfig, HieraForImageClassification, HieraForPreTraining, HieraModel -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config: HieraConfig, base_model: bool, mae_model: bool): - rename_keys = [] - # fmt: off - num_stages = len(config.depths) - # embedding dimensions for input and stages - dims = [config.embed_dim] + [int(config.embed_dim * config.embed_dim_multiplier**i) for i in range(num_stages)] - - global_layer_idx = 0 - for stage_idx in range(num_stages): - dim_in = dims[stage_idx] - dim_out = dims[stage_idx + 1] - for layer_idx in range(config.depths[stage_idx]): - rename_keys.append((f"blocks.{global_layer_idx}.norm1.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_before.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.norm1.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_before.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.attn.qkv.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.qkv.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.attn.qkv.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.qkv.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.attn.proj.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.proj.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.attn.proj.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.proj.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.norm2.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_after.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.norm2.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_after.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc1.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc1.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc2.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc2.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc2.bias")) - - # projection layer only for the first layer of each stage boundary (except the first stage) - if dim_out != dim_in and layer_idx == 0: - rename_keys.append((f"blocks.{global_layer_idx}.proj.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.proj.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.proj.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.proj.bias")) - - global_layer_idx += 1 - - # projection layer + position embeddings - rename_keys.extend( - [ - ("patch_embed.proj.weight", "hiera.embeddings.patch_embeddings.projection.weight"), - ("patch_embed.proj.bias", "hiera.embeddings.patch_embeddings.projection.bias") - ] - ) - - rename_keys.append(("pos_embed", 
"hiera.embeddings.position_embeddings")) - - if base_model: - # layernorm + pooler - rename_keys.extend([("norm.weight", "pooler.layernorm.weight"), ("norm.bias", "pooler.layernorm.bias")]) - # if just the base model, we should remove "hiera" from all keys that start with "hiera" - rename_keys = [(pair[0], pair[1][6:]) if pair[1].startswith("hiera") else pair for pair in rename_keys] - elif mae_model: - rename_keys.extend( - [ - ("encoder_norm.weight", "encoder_norm.weight"), - ("encoder_norm.bias", "encoder_norm.bias"), - ("mask_token", "decoder.mask_token"), - ("decoder_pos_embed", "decoder.decoder_position_embeddings"), - ("decoder_norm.weight", "decoder.decoder_norm.weight"), - ("decoder_norm.bias", "decoder.decoder_norm.bias"), - ("decoder_pred.weight", "decoder.decoder_pred.weight"), - ("decoder_pred.bias", "decoder.decoder_pred.bias"), - ("decoder_embed.weight", "decoder.decoder_embeddings.weight"), - ("decoder_embed.bias", "decoder.decoder_embeddings.bias") - ] - ) - for i in range(config.decoder_depth): - rename_keys.extend( - [ - (f"decoder_blocks.{i}.norm1.weight", f"decoder.decoder_block.layers.{i}.layernorm_before.weight"), - (f"decoder_blocks.{i}.norm1.bias", f"decoder.decoder_block.layers.{i}.layernorm_before.bias"), - (f"decoder_blocks.{i}.attn.qkv.weight", f"decoder.decoder_block.layers.{i}.attn.qkv.weight"), - (f"decoder_blocks.{i}.attn.qkv.bias", f"decoder.decoder_block.layers.{i}.attn.qkv.bias"), - (f"decoder_blocks.{i}.attn.proj.weight", f"decoder.decoder_block.layers.{i}.attn.proj.weight"), - (f"decoder_blocks.{i}.attn.proj.bias", f"decoder.decoder_block.layers.{i}.attn.proj.bias"), - (f"decoder_blocks.{i}.norm2.weight", f"decoder.decoder_block.layers.{i}.layernorm_after.weight"), - (f"decoder_blocks.{i}.norm2.bias", f"decoder.decoder_block.layers.{i}.layernorm_after.bias"), - (f"decoder_blocks.{i}.mlp.fc1.weight", f"decoder.decoder_block.layers.{i}.mlp.fc1.weight"), - (f"decoder_blocks.{i}.mlp.fc1.bias", f"decoder.decoder_block.layers.{i}.mlp.fc1.bias"), - (f"decoder_blocks.{i}.mlp.fc2.weight", f"decoder.decoder_block.layers.{i}.mlp.fc2.weight"), - (f"decoder_blocks.{i}.mlp.fc2.bias", f"decoder.decoder_block.layers.{i}.mlp.fc2.bias"), - ] - ) - for i in range(config.num_query_pool): - rename_keys.extend( - [ - (f"multi_scale_fusion_heads.{i}.weight", f"multiscale_fusion.multi_scale_fusion_heads.{i}.weight"), - (f"multi_scale_fusion_heads.{i}.bias", f"multiscale_fusion.multi_scale_fusion_heads.{i}.bias") - ] - ) - else: - # layernorm + classification head - rename_keys.extend( - [ - ("norm.weight", "hiera.pooler.layernorm.weight"), - ("norm.bias", "hiera.pooler.layernorm.bias"), - ("head.projection.weight", "classifier.weight"), - ("head.projection.bias", "classifier.bias"), - ] - ) - # fmt: on - return rename_keys - - -def remove_classification_head_(state_dict): - ignore_keys = ["head.projection.weight", "head.projection.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def get_labels_for_classifier(model_name: str) -> Tuple[Dict[int, str], Dict[str, int], int]: - repo_id = "huggingface/label-files" - - filename = "imagenet-1k-id2label.json" - - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in 
id2label.items()} - label2id = {v: k for k, v in id2label.items()} - num_labels = len(id2label) - - return id2label, label2id, num_labels - - -def get_hiera_config(model_name: str, base_model: bool, mae_model: bool) -> HieraConfig: - if model_name == "hiera-tiny-224": - config = HieraConfig(depths=[1, 2, 7, 2]) - elif model_name == "hiera-small-224": - config = HieraConfig(depths=[1, 2, 11, 2]) - elif model_name == "hiera-base-224": - config = HieraConfig() - elif model_name == "hiera-base-plus-224": - config = HieraConfig(embed_dim=112, num_heads=[2, 4, 8, 16]) - elif model_name == "hiera-large-224": - config = HieraConfig(embed_dim=144, num_heads=[2, 4, 8, 16], depths=[2, 6, 36, 4]) - elif model_name == "hiera-huge-224": - config = HieraConfig(embed_dim=256, num_heads=[4, 8, 16, 32], depths=[2, 6, 36, 4]) - else: - raise ValueError(f"Unrecognized model name: {model_name}") - - if base_model: - pass - elif mae_model: - config.num_query_pool = 2 - config.decoder_hidden_size = 512 - config.decoder_depth = 8 - config.decoder_num_heads = 16 - # Table 3b from Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles - config.mask_ratio = 0.6 - else: - id2label, label2id, num_labels = get_labels_for_classifier(model_name) - config.id2label = id2label - config.label2id = label2id - config.num_labels = num_labels - - return config - - -@torch.no_grad() -def convert_hiera_checkpoint(args): - model_name = args.model_name - base_model = args.base_model - pytorch_dump_folder_path = args.pytorch_dump_folder_path - push_to_hub = args.push_to_hub - mae_model = args.mae_model - - config = get_hiera_config(model_name, base_model, mae_model) - - # Load original hiera model - original_model_name = model_name.replace("-", "_") - original_model_name = f"mae_{original_model_name}" if mae_model else original_model_name - - original_checkpoint_name = "mae_in1k_ft_in1k" if not (base_model or mae_model) else "mae_in1k" - - original_model = torch.hub.load( - "facebookresearch/hiera", - model=original_model_name, - pretrained=True, - checkpoint=original_checkpoint_name, - ) - - original_model.eval() - original_state_dict = original_model.state_dict() - # Don't need to remove head for MAE because original implementation doesn't have it on MAE - if base_model: - remove_classification_head_(original_state_dict) - - # # Rename keys - new_state_dict = original_state_dict.copy() - rename_keys = create_rename_keys(config, base_model, mae_model) - - for src, dest in rename_keys: - rename_key(new_state_dict, src, dest) - - # Load HF hiera model - if base_model: - model = HieraModel(config) - elif mae_model: - model = HieraForPreTraining(config) - else: - model = HieraForImageClassification(config) - - model.eval() - - missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - - input_image = prepare_img() - - original_image_preprocessor = transforms.Compose( - [ - transforms.Resize(int((256 / 224) * 224), interpolation=transforms.functional.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ] - ) - - image_processor = BitImageProcessor( - image_mean=IMAGENET_DEFAULT_MEAN, image_std=IMAGENET_DEFAULT_STD, size={"shortest_edge": 256} - ) - inputs = image_processor(images=input_image, return_tensors="pt") - - expected_pixel_values = original_image_preprocessor(input_image).unsqueeze(0) - - input_image = 
prepare_img() - - inputs = image_processor(images=input_image, return_tensors="pt") - expected_pixel_values = original_image_preprocessor(input_image).unsqueeze(0) - assert torch.allclose(inputs.pixel_values, expected_pixel_values, atol=1e-4) - print("Pixel values look good!") - print(f"{inputs.pixel_values[0, :3, :3, :3]=}") - - # If is MAE we pass a noise to generate a random mask - mask_spatial_shape = [ - i // s // ms for i, s, ms in zip(config.image_size, config.patch_stride, config.masked_unit_size) - ] - num_windows = math.prod(mask_spatial_shape) - torch.manual_seed(2) - noise = torch.rand(1, num_windows) - outputs = model(**inputs) if not mae_model else model(noise=noise, **inputs) - # original implementation returns logits.softmax(dim=-1) - - if base_model: - expected_prob, expected_intermediates = original_model(expected_pixel_values, return_intermediates=True) - expected_last_hidden = expected_intermediates[-1] - batch_size, _, _, hidden_dim = expected_last_hidden.shape - expected_last_hidden = expected_last_hidden.reshape(batch_size, -1, hidden_dim) - assert torch.allclose(outputs.last_hidden_state, expected_last_hidden, atol=1e-3) - print("Base Model looks good as hidden states match original implementation!") - print(f"{outputs.last_hidden_state[0, :3, :3]=}") - elif mae_model: - # get mask from noise to be able to compare outputs - mask, _ = model.hiera.embeddings.patch_embeddings.random_masking(expected_pixel_values, noise) - expected_loss, _, _, _ = original_model(expected_pixel_values, mask=mask.bool()) - assert torch.allclose(outputs.loss, expected_loss, atol=1e-3) - print("MAE Model looks good as loss matches original implementation!") - else: - expected_prob = original_model(expected_pixel_values) - assert torch.allclose(outputs.logits.softmax(dim=-1), expected_prob, atol=1e-3) - print("Classifier looks good as probs match original implementation") - print(f"{outputs.logits[:, :5]=}") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor for {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - hub_name = model_name - if base_model: - hub_name = model_name - elif mae_model: - hub_name = f"{model_name}-mae" - else: - hub_name = f"{model_name}-in1k" - repo_id = f"EduardoPacheco/{hub_name}" - print(f"Pushing model and processor for {model_name} to hub at {repo_id}") - model.push_to_hub(repo_id) - image_processor.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model-name", - default="hiera-tiny-224", - type=str, - choices=[ - "hiera-tiny-224", - "hiera-small-224", - "hiera-base-224", - "hiera-base-plus-224", - "hiera-large-224", - "hiera-huge-224", - ], - help="Name of the Hiera model you'd like to convert.", - ) - parser.add_argument( - "--pytorch-dump-folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--verify-logits", - action="store_true", - help="Whether or not to verify the logits against the original implementation.", - ) - parser.add_argument( - "--push-to-hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." 
- ) - parser.add_argument( - "--base-model", - action="store_true", - help="Whether to only convert the base model (no projection head weights).", - ) - parser.add_argument( - "--mae-model", action="store_true", help="Whether to convert to MAE checkpoint to HieraForPreTraining." - ) - - args = parser.parse_args() - convert_hiera_checkpoint(args) diff --git a/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py deleted file mode 100644 index f5914f35c546..000000000000 --- a/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py +++ /dev/null @@ -1,222 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Hubert checkpoint.""" - -import argparse - -import torch -from s3prl.hub import distilhubert - -from transformers import HubertConfig, HubertModel, Wav2Vec2FeatureExtractor, logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -MAPPING = { - "post_extract_proj": "feature_projection.projection", - "encoder.pos_conv.0": "encoder.pos_conv_embed.conv", - "self_attn.k_proj": "encoder.layers.*.attention.k_proj", - "self_attn.v_proj": "encoder.layers.*.attention.v_proj", - "self_attn.q_proj": "encoder.layers.*.attention.q_proj", - "self_attn.out_proj": "encoder.layers.*.attention.out_proj", - "self_attn_layer_norm": "encoder.layers.*.layer_norm", - "fc1": "encoder.layers.*.feed_forward.intermediate_dense", - "fc2": "encoder.layers.*.feed_forward.output_dense", - "final_layer_norm": "encoder.layers.*.final_layer_norm", - "encoder.layer_norm": "encoder.layer_norm", - "mask_emb": "masked_spec_embed", -} - - -def set_recursively(hf_pointer, key, value, full_name, weight_type): - for attribute in key.split("."): - hf_pointer = getattr(hf_pointer, attribute) - - if weight_type is not None: - hf_shape = getattr(hf_pointer, weight_type).shape - else: - hf_shape = hf_pointer.shape - - assert hf_shape == value.shape, ( - f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be" - f" {value.shape} for {full_name}" - ) - - if weight_type == "weight": - hf_pointer.weight.data = value - elif weight_type == "weight_g": - hf_pointer.weight_g.data = value - elif weight_type == "weight_v": - hf_pointer.weight_v.data = value - elif weight_type == "bias": - hf_pointer.bias.data = value - else: - hf_pointer.data = value - - logger.info(f"{key + '.' 
+ weight_type if weight_type is not None else ''} was initialized from {full_name}.") - - -def recursively_load_weights(fairseq_model, hf_model): - unused_weights = [] - fairseq_dict = fairseq_model.state_dict() - - feature_extractor = hf_model.feature_extractor - - for name, value in fairseq_dict.items(): - is_used = False - if "conv_layers" in name: - load_conv_layer( - name, - value, - feature_extractor, - unused_weights, - hf_model.config.feat_extract_norm == "group", - ) - is_used = True - else: - for key, mapped_key in MAPPING.items(): - mapped_key = mapped_key - - if key in name: - is_used = True - if "*" in mapped_key: - layer_index = name.split(key)[0].split(".")[-2] - mapped_key = mapped_key.replace("*", layer_index) - if "weight_g" in name: - weight_type = "weight_g" - elif "weight_v" in name: - weight_type = "weight_v" - elif "weight" in name: - weight_type = "weight" - elif "bias" in name: - weight_type = "bias" - else: - weight_type = None - set_recursively(hf_model, mapped_key, value, name, weight_type) - continue - if not is_used: - unused_weights.append(name) - - logger.warning(f"Unused weights: {unused_weights}") - - -def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm): - name = full_name.split("conv_layers.")[-1] - items = name.split(".") - layer_id = int(items[0]) - type_id = int(items[1]) - - if type_id == 0: - if "bias" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.bias.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.weight.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm): - if "bias" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, ( - f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was" - " found." - ) - feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found." 
- ) - feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - else: - unused_weights.append(full_name) - - -def convert_config(model): - config = HubertConfig() - fs_config = model.config - - config.activation_dropout = fs_config.activation_dropout - config.apply_spec_augment = False - config.attention_dropout = fs_config.attention_dropout - config.conv_bias = False - conv_layers = eval(fs_config.extractor_conv_feature_layers) - config.conv_dim = [x[0] for x in conv_layers] - config.conv_kernel = [x[1] for x in conv_layers] - config.conv_stride = [x[2] for x in conv_layers] - config.feat_extract_activation = "gelu" - config.feat_extract_norm = "layer" if fs_config.extractor_mode == "layer_norm" else "group" - config.feat_proj_layer_norm = False - config.feat_proj_dropout = 0.0 - config.final_dropout = 0.0 - config.hidden_act = fs_config.activation_fn - config.hidden_dropout = fs_config.dropout - config.hidden_size = fs_config.encoder_embed_dim - config.initializer_range = 0.02 - config.intermediate_size = fs_config.encoder_ffn_embed_dim - config.layer_norm_eps = 1e-5 - config.layerdrop = 0.0 - config.num_attention_heads = fs_config.encoder_attention_heads - config.num_conv_pos_embedding_groups = fs_config.conv_pos_groups - config.num_conv_pos_embeddings = fs_config.conv_pos - config.num_feat_extract_layers = len(conv_layers) - config.num_hidden_layers = fs_config.encoder_layers - - return config - - -@torch.no_grad() -def convert_hubert_checkpoint(pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. - """ - model = distilhubert().model.model - - if config_path is not None: - config = HubertConfig.from_pretrained(config_path) - else: - config = convert_config(model) - model = model.eval() - - feature_extractor = Wav2Vec2FeatureExtractor( - feature_size=1, - sampling_rate=16000, - padding_value=0, - do_normalize=False, - return_attention_mask=False, - ) - hf_model = HubertModel(config) - - recursively_load_weights(model, hf_model) - - feature_extractor.save_pretrained(pytorch_dump_folder_path) - hf_model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - convert_hubert_checkpoint(args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 4966340493f3..000000000000 --- a/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,261 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Hubert checkpoint.""" - -import argparse -import json -import os - -import fairseq -import torch -from fairseq.data import Dictionary - -from transformers import ( - HubertConfig, - HubertForCTC, - HubertModel, - Wav2Vec2CTCTokenizer, - Wav2Vec2FeatureExtractor, - Wav2Vec2Processor, - logging, -) - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -MAPPING = { - "post_extract_proj": "feature_projection.projection", - "encoder.pos_conv.0": "encoder.pos_conv_embed.batch_norm", - "encoder.pos_conv.1": "encoder.pos_conv_embed.conv", - "self_attn.k_proj": "encoder.layers.*.attention.k_proj", - "self_attn.v_proj": "encoder.layers.*.attention.v_proj", - "self_attn.q_proj": "encoder.layers.*.attention.q_proj", - "self_attn.out_proj": "encoder.layers.*.attention.out_proj", - "self_attn_layer_norm": "encoder.layers.*.layer_norm", - "fc1": "encoder.layers.*.feed_forward.intermediate_dense", - "fc2": "encoder.layers.*.feed_forward.output_dense", - "final_layer_norm": "encoder.layers.*.final_layer_norm", - "encoder.layer_norm": "encoder.layer_norm", - "w2v_model.layer_norm": "feature_projection.layer_norm", - "w2v_encoder.proj": "lm_head", - "mask_emb": "masked_spec_embed", -} - - -def set_recursively(hf_pointer, key, value, full_name, weight_type): - for attribute in key.split("."): - hf_pointer = getattr(hf_pointer, attribute) - - if weight_type is not None: - hf_shape = getattr(hf_pointer, weight_type).shape - else: - hf_shape = hf_pointer.shape - - assert hf_shape == value.shape, ( - f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be" - f" {value.shape} for {full_name}" - ) - - if weight_type == "weight": - hf_pointer.weight.data = value - elif weight_type == "weight_g": - hf_pointer.weight_g.data = value - elif weight_type == "weight_v": - hf_pointer.weight_v.data = value - elif weight_type == "bias": - hf_pointer.bias.data = value - elif weight_type == "running_mean": - hf_pointer.running_mean.data = value - elif weight_type == "running_var": - hf_pointer.running_var.data = value - elif weight_type == "num_batches_tracked": - hf_pointer.num_batches_tracked.data = value - else: - hf_pointer.data = value - - logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.") - - -def recursively_load_weights(fairseq_model, hf_model, is_finetuned): - unused_weights = [] - fairseq_dict = fairseq_model.state_dict() - - feature_extractor = hf_model.hubert.feature_extractor if is_finetuned else hf_model.feature_extractor - - for name, value in fairseq_dict.items(): - is_used = False - if "conv_layers" in name: - load_conv_layer( - name, - value, - feature_extractor, - unused_weights, - hf_model.config.feat_extract_norm == "group", - ) - is_used = True - else: - for key, mapped_key in MAPPING.items(): - mapped_key = "hubert." 
+ mapped_key if (is_finetuned and mapped_key != "lm_head") else mapped_key - - if key in name or (key.split("w2v_model.")[-1] == name.split(".")[0] and not is_finetuned): - is_used = True - if "*" in mapped_key: - layer_index = name.split(key)[0].split(".")[-2] - mapped_key = mapped_key.replace("*", layer_index) - if "weight_g" in name: - weight_type = "weight_g" - elif "weight_v" in name: - weight_type = "weight_v" - elif "weight" in name: - weight_type = "weight" - elif "bias" in name: - weight_type = "bias" - elif "running_mean" in name: - weight_type = "running_mean" - elif "running_var" in name: - weight_type = "running_var" - elif "num_batches_tracked" in name: - weight_type = "num_batches_tracked" - else: - weight_type = None - set_recursively(hf_model, mapped_key, value, name, weight_type) - continue - if not is_used: - unused_weights.append(name) - - logger.warning(f"Unused weights: {unused_weights}") - - -def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm): - name = full_name.split("conv_layers.")[-1] - items = name.split(".") - layer_id = int(items[0]) - type_id = int(items[1]) - - if type_id == 0: - if "bias" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.bias.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.weight.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm): - if "bias" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, ( - f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was" - " found." - ) - feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - else: - unused_weights.append(full_name) - - -@torch.no_grad() -def convert_hubert_checkpoint( - checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True -): - """ - Copy/paste/tweak model's weights to transformers design. 
-    """
-    if config_path is not None:
-        config = HubertConfig.from_pretrained(config_path)
-    else:
-        config = HubertConfig()
-
-    if is_finetuned:
-        if dict_path:
-            target_dict = Dictionary.load(dict_path)
-
-            # important change bos & pad token id since CTC symbol is <pad> and
-            # not <s> as in fairseq
-            config.bos_token_id = target_dict.pad_index
-            config.pad_token_id = target_dict.bos_index
-            config.eos_token_id = target_dict.eos_index
-            config.vocab_size = len(target_dict.symbols)
-            vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
-            if not os.path.isdir(pytorch_dump_folder_path):
-                logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
-                return
-            os.makedirs(pytorch_dump_folder_path, exist_ok=True)
-            with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
-                json.dump(target_dict.indices, vocab_handle)
-            tokenizer = Wav2Vec2CTCTokenizer(
-                vocab_path,
-                unk_token=target_dict.unk_word,
-                pad_token=target_dict.pad_word,
-                bos_token=target_dict.bos_word,
-                eos_token=target_dict.eos_word,
-                word_delimiter_token="|",
-                do_lower_case=False,
-            )
-            return_attention_mask = True if config.feat_extract_norm == "layer" else False
-            feature_extractor = Wav2Vec2FeatureExtractor(
-                feature_size=1,
-                sampling_rate=16000,
-                padding_value=0,
-                do_normalize=True,
-                return_attention_mask=return_attention_mask,
-            )
-            processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
-            processor.save_pretrained(pytorch_dump_folder_path)
-
-        hf_wav2vec = HubertForCTC(config)
-    else:
-        hf_wav2vec = HubertModel(config)
-
-    if is_finetuned:
-        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
-            [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
-        )
-    else:
-        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])
-
-    model = model[0].eval()
-
-    recursively_load_weights(model, hf_wav2vec, is_finetuned)
-
-    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
-    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
-    parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
-    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
-    parser.add_argument(
-        "--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
-    )
-    args = parser.parse_args()
-    convert_hubert_checkpoint(
-        args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
-    )
diff --git a/src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py
deleted file mode 100644
index ff15b90088af..000000000000
--- a/src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert Hubert checkpoint."""
-
-import argparse
-
-import torch
-
-from transformers import HubertConfig, HubertForSequenceClassification, Wav2Vec2FeatureExtractor, logging
-
-
-logging.set_verbosity_info()
-logger = logging.get_logger(__name__)
-
-SUPPORTED_MODELS = ["UtteranceLevel"]
-
-
-@torch.no_grad()
-def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path, model_dump_path):
-    """
-    Copy/paste/tweak model's weights to transformers design.
-    """
-    checkpoint = torch.load(checkpoint_path, map_location="cpu")
-    if checkpoint["Config"]["downstream_expert"]["modelrc"]["select"] not in SUPPORTED_MODELS:
-        raise NotImplementedError(f"The supported s3prl models are {SUPPORTED_MODELS}")
-
-    downstream_dict = checkpoint["Downstream"]
-
-    hf_config = HubertConfig.from_pretrained(config_path)
-    hf_model = HubertForSequenceClassification.from_pretrained(base_model_name, config=hf_config)
-    hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
-        base_model_name, return_attention_mask=True, do_normalize=False
-    )
-
-    if hf_config.use_weighted_layer_sum:
-        hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"]
-
-    hf_model.projector.weight.data = downstream_dict["projector.weight"]
-    hf_model.projector.bias.data = downstream_dict["projector.bias"]
-    hf_model.classifier.weight.data = downstream_dict["model.post_net.linear.weight"]
-    hf_model.classifier.bias.data = downstream_dict["model.post_net.linear.bias"]
-
-    hf_feature_extractor.save_pretrained(model_dump_path)
-    hf_model.save_pretrained(model_dump_path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--base_model_name", default=None, type=str, help="Name of the huggingface pretrained base model."
-    )
-    parser.add_argument("--config_path", default=None, type=str, help="Path to the huggingface classifier config.")
-    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to the s3prl checkpoint.")
-    parser.add_argument("--model_dump_path", default=None, type=str, help="Path to the final converted model.")
-    args = parser.parse_args()
-    convert_s3prl_checkpoint(args.base_model_name, args.config_path, args.checkpoint_path, args.model_dump_path)
diff --git a/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py b/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py
deleted file mode 100644
index ea44ee11e58c..000000000000
--- a/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import copy - -import torch -from accelerate import init_empty_weights - -from transformers import ( - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - Idefics2Config, - Idefics2ForConditionalGeneration, - Idefics2ImageProcessor, - Idefics2Processor, - MistralConfig, -) - - -EPILOG_TXT = """Example: - python transformers/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py --original_model_id HuggingFaceM4/idefics2-8b --output_hub_path org/idefics2 -""" - - -KEYS_TO_MODIFY_MAPPING = { - "lm_head.weight": "lm_head.linear.weight", - "model.layers": "model.text_model.layers", - "model.norm": "model.text_model.norm", - "model.perceiver_resampler": "model.connector.perceiver_resampler", - "model.modality_projection": "model.connector.modality_projection", -} - - -WEIGHTS_TO_MERGE_MAPPING = ( - # (weights to merge in merging order), (new weight name) - ( - ("model.embed_tokens.weight", "model.embed_tokens.additional_embedding.weight"), - "model.text_model.embed_tokens.weight", - ), - (("lm_head.linear.weight", "additional_fc.weight"), "lm_head.weight"), -) - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value - return new_state_dict - - -def merge_weights(state_dict): - new_state_dict = copy.deepcopy(state_dict) - - # Merge the weights - for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: - for weight in weights_to_merge: - assert weight in state_dict, f"Weight {weight} is missing in the state dict" - if new_weight_name not in new_state_dict: - new_state_dict[new_weight_name] = [state_dict[weight]] - else: - new_state_dict[new_weight_name].append(state_dict[weight]) - new_state_dict[new_weight_name] = torch.cat(new_state_dict[new_weight_name], dim=0) - - # Remove the weights that were merged - for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: - for weight in weights_to_merge: - if weight in new_state_dict and weight != new_weight_name: - new_state_dict.pop(weight) - - return new_state_dict - - -def get_config(checkpoint): - if checkpoint == "HuggingFaceM4/idefics2": - # We load the config then recreate to use the text_config - config = AutoConfig.from_pretrained(checkpoint) - text_config = MistralConfig( - vocab_size=config.vocab_size + config.additional_vocab_size, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - num_hidden_layers=config.num_hidden_layers, - num_attention_heads=config.num_attention_heads, - num_key_value_heads=config.num_key_value_heads, - hidden_act=config.hidden_act, - max_position_embeddings=config.max_position_embeddings, - initializer_range=config.initializer_range, - rms_norm_eps=config.rms_norm_eps, - tie_word_embeddings=config.tie_word_embeddings, - rope_theta=config.rope_theta, - sliding_window=config.sliding_window, - attention_dropout=config.attention_dropout, - pad_token_id=config.pad_token_id, - bos_token_id=config.bos_token_id, - eos_token_id=config.eos_token_id, - ) - perceiver_config = config.perceiver_config.to_dict() - config = Idefics2Config( - text_config=text_config.to_dict(), - vision_config=config.vision_config, - perceiver_config=perceiver_config, - use_cache=config.use_cache, - 
image_token_id=config.image_token_id, - tie_word_embeddings=config.tie_word_embeddings, - ) - return config - - return AutoConfig.from_pretrained(checkpoint) - - -def convert_idefics2_hub_to_hf(original_model_id, output_hub_path, push_to_hub): - # The original model maps to AutoModelForCausalLM, converted we map to Idefics2ForConditionalGeneration - original_model = AutoModelForCausalLM.from_pretrained(original_model_id, trust_remote_code=True) - # The original model doesn't use the idefics2 processing objects - image_seq_len = original_model.config.perceiver_config.resampler_n_latents - image_processor = Idefics2ImageProcessor() - tokenizer = AutoTokenizer.from_pretrained(original_model_id) - processor = Idefics2Processor( - image_processor=image_processor, - tokenizer=tokenizer, - image_seq_len=image_seq_len, - ) - state_dict = original_model.state_dict() - state_dict = convert_state_dict_to_hf(state_dict) - - # Merge weights - state_dict = merge_weights(state_dict) - - config = get_config(original_model_id) - - with init_empty_weights(): - model = Idefics2ForConditionalGeneration(config) - - model.load_state_dict(state_dict, strict=True, assign=True) - - model.save_pretrained(output_hub_path) - processor.save_pretrained(output_hub_path) - - if push_to_hub: - model.push_to_hub(output_hub_path, private=True) - processor.push_to_hub(output_hub_path, private=True) - - -def main(): - parser = argparse.ArgumentParser( - epilog=EPILOG_TXT, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--original_model_id", - help="Hub location of the text model", - ) - parser.add_argument( - "--output_hub_path", - help="Location on the hub of the converted model", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="If set, the model will be pushed to the hub after conversion.", - ) - args = parser.parse_args() - convert_idefics2_hub_to_hf(args.original_model_id, args.output_hub_path, args.push_to_hub) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py b/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py deleted file mode 100644 index 204104a58b30..000000000000 --- a/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
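Both Idefics converters here follow the same two-step pattern: rename checkpoint keys by substring replacement, then merge the extra embedding rows into the base embedding with a single concatenation. A toy sketch of that pattern, with invented shapes and key names rather than the real Idefics2/Idefics3 tensors:

```python
import torch

# hypothetical mappings, mirroring the shape of KEYS_TO_MODIFY_MAPPING / WEIGHTS_TO_MERGE_MAPPING
KEYS_TO_MODIFY = {"model.layers": "model.text_model.layers"}
WEIGHTS_TO_MERGE = (
    (
        ("model.embed_tokens.weight", "model.embed_tokens.additional_embedding.weight"),
        "model.text_model.embed_tokens.weight",
    ),
)

state_dict = {
    "model.layers.0.mlp.weight": torch.randn(4, 4),
    "model.embed_tokens.weight": torch.randn(32000, 8),                    # base vocab rows
    "model.embed_tokens.additional_embedding.weight": torch.randn(64, 8),  # extra special-token rows
}

# step 1: rename keys by substring replacement
renamed = {}
for key, value in state_dict.items():
    for old, new in KEYS_TO_MODIFY.items():
        if old in key:
            key = key.replace(old, new)
    renamed[key] = value

# step 2: merge each listed group by concatenating along dim 0, then drop the parts
for parts, merged_name in WEIGHTS_TO_MERGE:
    renamed[merged_name] = torch.cat([renamed.pop(p) for p in parts], dim=0)

print(renamed["model.text_model.embed_tokens.weight"].shape)  # torch.Size([32064, 8])
```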
- -import argparse -import json - -import torch -from accelerate import init_empty_weights -from huggingface_hub import hf_hub_download - -from transformers import ( - AutoModelForCausalLM, - AutoTokenizer, - Idefics3Config, - Idefics3ForConditionalGeneration, - Idefics3ImageProcessor, - Idefics3Processor, - LlamaConfig, -) - - -EPILOG_TXT = """Example: - python transformers/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py --original_model_id HuggingFaceM4/Idefics3-8B-Llama3 --output_hub_path org/idefics3 -""" - - -KEYS_TO_MODIFY_MAPPING = { - "lm_head.weight": "lm_head.linear.weight", - "model.layers": "model.text_model.layers", - "model.norm": "model.text_model.norm", - "model.modality_projection": "model.connector.modality_projection", -} - - -WEIGHTS_TO_MERGE_MAPPING = ( - # (weights to merge in merging order), (new weight name) - ( - ("model.embed_tokens.weight", "model.embed_tokens.additional_embedding.weight"), - "model.text_model.embed_tokens.weight", - ), - (("lm_head.linear.weight", "additional_fc.weight"), "lm_head.weight"), -) - -WEIGHTS_TO_DROP = ( - # The original model had a vision head, but this is never used - "model.vision_model.head", -) - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - old_state_dict_keys = set(state_dict.keys()) - - # Flattened list of weights to merge. We keep these in the original state dict to merge them later - original_weights_to_merge = [w for weights in WEIGHTS_TO_MERGE_MAPPING for w in weights[0]] - - # for key, value in state_dict.items(): - for old_key in old_state_dict_keys: - if old_key.endswith(".inv_freq") or any(w in old_key for w in WEIGHTS_TO_DROP): - state_dict.pop(old_key) - continue - - key = old_key - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - weight = state_dict.pop(old_key) - if key in original_weights_to_merge: - new_state_dict[key] = weight - # Bit of a hack - we need to keep the original weights to merge them later - state_dict[key] = weight - else: - new_state_dict[key] = weight - - return new_state_dict - - -def merge_weights(state_dict, new_state_dict): - old_weight_names = set(state_dict.keys()) - - # Merge the weights - for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: - for weight_to_merge in weights_to_merge: - print(weight_to_merge) - assert weight_to_merge in state_dict, f"Weight {weight_to_merge} is missing in the state dict" - - weight = state_dict.pop(weight_to_merge) - if new_weight_name not in new_state_dict: - new_state_dict[new_weight_name] = [weight] - else: - new_state_dict[new_weight_name].append(weight) - - old_weight_names.remove(weight_to_merge) - - new_state_dict[new_weight_name] = torch.cat(new_state_dict[new_weight_name], dim=0) - - # Remove the weights that were merged - for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: - for weight in weights_to_merge: - if weight in new_state_dict and weight != new_weight_name: - new_state_dict.pop(weight) - - return new_state_dict - - -def get_config(checkpoint): - # We load the config then recreate to use the text_config - - # download the config file - filepath = hf_hub_download(repo_id=checkpoint, filename="config.json") - with open(filepath, "r") as f: - config_json = json.load(f) - - # Setup the vision config - vision_config = config_json.pop("vision_config") - vision_config.pop("vision_model_name", None) - if "embed_dim" in vision_config: - vision_config["hidden_size"] = 
vision_config.pop("embed_dim") - - config_json["vocab_size"] = config_json.pop("vocab_size") + config_json.pop("additional_vocab_size") - - image_token_id = config_json.pop("image_token_id", config_json["vocab_size"] - 2) - use_cache = config_json.pop("use_cache", True) - tie_word_embeddings = config_json.pop("tie_word_embeddings", True) - scale_factor = config_json.pop("scale_factor", 2) - vocab_size = config_json.pop("vocab_size", 100000) - - # Remove "freeze" params from the config - config_json = {k: v for k, v in config_json.items() if not k.startswith("freeze_")} - text_config = LlamaConfig(**config_json) - - config = Idefics3Config( - text_config=text_config, - vision_config=vision_config, - use_cache=use_cache, - image_token_id=image_token_id, - tie_word_embeddings=tie_word_embeddings, - scale_factor=scale_factor, - vocab_size=vocab_size, - ) - return config - - -def convert_idefics3_hub_to_hf(original_model_id, output_hub_path, push_to_hub): - # The original model maps to AutoModelForCausalLM, converted we map to Idefics3ForConditionalGeneration - original_model = AutoModelForCausalLM.from_pretrained( - original_model_id, trust_remote_code=True, torch_dtype=torch.bfloat16 - ) - # The original model doesn't use the Idefics3 processing objects - image_processor = Idefics3ImageProcessor() - tokenizer = AutoTokenizer.from_pretrained(original_model_id) - processor = Idefics3Processor( - image_processor=image_processor, - tokenizer=tokenizer, - ) - state_dict = original_model.state_dict() - new_state_dict = convert_state_dict_to_hf(state_dict) - - # Merge weights - new_state_dict = merge_weights(state_dict, new_state_dict) - del state_dict - - config = get_config(original_model_id) - print(config) - - with init_empty_weights(): - model = Idefics3ForConditionalGeneration(config) - - model.load_state_dict(new_state_dict, strict=True, assign=True) - - model.save_pretrained(output_hub_path) - processor.save_pretrained(output_hub_path) - - if push_to_hub: - model.push_to_hub(output_hub_path, private=True) - processor.push_to_hub(output_hub_path, private=True) - - -def main(): - parser = argparse.ArgumentParser( - epilog=EPILOG_TXT, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--original_model_id", - help="Hub location of the text model", - ) - parser.add_argument( - "--output_hub_path", - help="Location on the hub of the converted model", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="If set, the model will be pushed to the hub after conversion.", - ) - args = parser.parse_args() - convert_idefics3_hub_to_hf(args.original_model_id, args.output_hub_path, args.push_to_hub) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/ijepa/convert_ijepa_to_hf.py b/src/transformers/models/ijepa/convert_ijepa_to_hf.py deleted file mode 100644 index 5c15a72ff888..000000000000 --- a/src/transformers/models/ijepa/convert_ijepa_to_hf.py +++ /dev/null @@ -1,267 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert IJEPA checkpoints from the original repository. - -URL: https://github.com/facebookresearch/ijepa -""" - -import argparse -import gc -import re -from pathlib import Path - -import requests -import torch -from PIL import Image - -from transformers import ( - IJepaConfig, - IJepaModel, - ViTImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -# fmt: off -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # Projection layer + position embeddings - r"pos_embed": r"embeddings.position_embeddings", - r"patch_embed.proj.weight": r"embeddings.patch_embeddings.projection.weight", - r"patch_embed.proj.bias": r"embeddings.patch_embeddings.projection.bias", - - # Encoder layers: Layernorms, Attention, Feedforward layers - r"blocks.(\d+).norm1.weight": r"encoder.layer.\1.layernorm_before.weight", - r"blocks.(\d+).norm1.bias": r"encoder.layer.\1.layernorm_before.bias", - r"blocks.(\d+).attn.proj.weight": r"encoder.layer.\1.attention.output.dense.weight", - r"blocks.(\d+).attn.proj.bias": r"encoder.layer.\1.attention.output.dense.bias", - r"blocks.(\d+).norm2.weight": r"encoder.layer.\1.layernorm_after.weight", - r"blocks.(\d+).norm2.bias": r"encoder.layer.\1.layernorm_after.bias", - r"blocks.(\d+).mlp.fc1.weight": r"encoder.layer.\1.intermediate.dense.weight", - r"blocks.(\d+).mlp.fc1.bias": r"encoder.layer.\1.intermediate.dense.bias", - r"blocks.(\d+).mlp.fc2.weight": r"encoder.layer.\1.output.dense.weight", - r"blocks.(\d+).mlp.fc2.bias": r"encoder.layer.\1.output.dense.bias", - - # Layernorm + pooler - r"norm.weight": r"layernorm.weight", - r"norm.bias": r"layernorm.bias", -} -# fmt: on - - -def convert_old_keys_to_new_keys(state_dict_keys: dict = None): - """ - Converts old keys to new keys using the mapping and dynamically removes the 'ijepa.' prefix if necessary. - - Args: - state_dict_keys (dict): The keys from the state_dict to convert. - - Returns: - dict: A mapping from old keys to new keys. 
- """ - output_dict = {} - if state_dict_keys is not None: - old_text = "\n".join(state_dict_keys) - new_text = old_text - - # Apply regex-based mapping - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - if replacement is None: - new_text = re.sub(pattern, "", new_text) # Skip the key - continue - new_text = re.sub(pattern, replacement, new_text) - - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - - return output_dict - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :] - state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def get_ijepa_config(model_name): - patch_size = int(model_name.split("_")[1][4:]) - config = IJepaConfig(patch_size=patch_size) - if "vith" in model_name: - config.hidden_size = 1280 - config.num_hidden_layers = 32 - config.num_attention_heads = 16 - config.layer_norm_eps = 1e-6 - config.mlp_ratio = 4 - config.intermediate_size = 5120 - if model_name == "ijepa_vith16_1k": - config.image_size = 448 - elif "vitg" in model_name: - config.hidden_size = 1408 - config.num_hidden_layers = 40 - config.num_attention_heads = 16 - config.layer_norm_eps = 1e-6 - config.mlp_ratio = 48 / 11 - config.intermediate_size = 6144 - else: - raise ValueError("Model not supported, only supports huge and giant models.") - return config - - -@torch.no_grad() -def write_model(model_name, output_dir, safe_serialization, push_to_hub, verify_logits): - """ - Copy/paste/tweak model's weights to our IJEPA structure. 
- """ - - # define default IJEPA configuration - config = get_ijepa_config(model_name) - - checkpoint_mapping = { - "ijepa_vith14_1k": "https://dl.fbaipublicfiles.com/ijepa/IN1K-vit.h.14-300e.pth.tar", - "ijepa_vith14_22k": "https://dl.fbaipublicfiles.com/ijepa/IN22K-vit.h.14-900e.pth.tar", - "ijepa_vith16_1k": "https://dl.fbaipublicfiles.com/ijepa/IN1K-vit.h.16-448px-300e.pth.tar", - "ijepa_vitg16_22k": "https://dl.fbaipublicfiles.com/ijepa/IN22K-vit.g.16-600e.pth.tar", - } - - # Load original checkpoint - checkpoint_url = checkpoint_mapping[model_name] - original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["encoder"] - original_state_dict = {k.replace("module.", ""): v for k, v in original_state_dict.items()} - - # Rename keys - state_dict = original_state_dict.copy() - new_keys = convert_old_keys_to_new_keys(state_dict.keys()) - for old_key, new_key in new_keys.items(): - rename_key(state_dict, old_key, new_key) - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = IJepaModel(config, add_pooling_layer=False).eval() - model.load_state_dict(state_dict) - size = {"height": config.image_size, "width": config.image_size} - image_processor = ViTImageProcessor(size=size) - - if verify_logits: - # Check outputs on an image, prepared by ViTImageProcessor - encoding = image_processor(images=prepare_img(), return_tensors="pt") - pixel_values = encoding["pixel_values"] - with torch.no_grad(): - outputs = model(pixel_values) - - expected_slices = { - "ijepa_vith14_1k": torch.Tensor( - [[-0.0621, -0.0054, -2.7513], [-0.1952, 0.0909, -3.9536], [0.0942, -0.0331, -1.2833]] - ), - "ijepa_vith14_22k": torch.Tensor( - [[0.0358, -0.0045, -0.2154], [0.0418, -0.0246, 0.0108], [0.2529, -0.0345, -0.0246]] - ), - "ijepa_vith16_1k": torch.Tensor( - [[0.5145, -0.1259, 0.0615], [0.1132, 0.0028, -0.0496], [1.1586, -0.0056, -0.0387]] - ), - "ijepa_vitg16_22k": torch.Tensor( - [[0.0512, -0.0510, -0.0649], [0.1972, 0.0380, -0.0790], [0.1667, -0.0834, -0.1240]] - ), - } - - assert torch.allclose( - expected_slices[model_name], - outputs.last_hidden_state[0, :3, :3], - atol=1e-4, - ) - - if output_dir: - Path(output_dir).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {output_dir}") - image_processor.save_pretrained(output_dir, safe_serialization=safe_serialization) - model.save_pretrained(output_dir, safe_serialization=safe_serialization) - - if push_to_hub: - image_processor.push_to_hub(repo_id=f"jmtzt/{model_name}", safe_serialization=safe_serialization) - model.push_to_hub(repo_id=f"jmtzt/{model_name}", safe_serialization=safe_serialization) - - if output_dir: - del model, state_dict - gc.collect() - print("Reloading the model to check if it's saved correctly.") - IJepaModel.from_pretrained(output_dir, device_map="auto") - print("Model reloaded successfully.") - - -def main(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="ijepa_vith14_1k", - type=str, - choices=[ - "ijepa_vith14_1k", - "ijepa_vith14_22k", - "ijepa_vith16_1k", - "ijepa_vitg16_22k", - ], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--output_dir", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`." 
- ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether or not to push the model to the đŸ€— Hub.", - ) - parser.add_argument( - "--verify_logits", action="store_false", help="Whether or not to verify logits after conversion." - ) - - parser.set_defaults() - args = parser.parse_args() - write_model(args.model_name, args.output_dir, args.safe_serialization, args.push_to_hub, args.verify_logits) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py b/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py deleted file mode 100644 index 182d66b9af28..000000000000 --- a/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert OpenAI Image GPT checkpoints.""" - -import argparse - -import torch - -from transformers import ImageGPTConfig, ImageGPTForCausalLM, load_tf_weights_in_imagegpt -from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging - - -logging.set_verbosity_info() - - -def convert_imagegpt_checkpoint_to_pytorch(imagegpt_checkpoint_path, model_size, pytorch_dump_folder_path): - # Construct configuration depending on size - MODELS = {"small": (512, 8, 24), "medium": (1024, 8, 36), "large": (1536, 16, 48)} - n_embd, n_head, n_layer = MODELS[model_size] # set model hyperparameters - config = ImageGPTConfig(n_embd=n_embd, n_layer=n_layer, n_head=n_head) - model = ImageGPTForCausalLM(config) - - # Load weights from numpy - load_tf_weights_in_imagegpt(model, config, imagegpt_checkpoint_path) - - # Save pytorch-model - pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME - print(f"Save PyTorch model to {pytorch_weights_dump_path}") - torch.save(model.state_dict(), pytorch_weights_dump_path) - print(f"Save configuration file to {pytorch_config_dump_path}") - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--imagegpt_checkpoint_path", - default=None, - type=str, - required=True, - help="Path to the TensorFlow checkpoint path.", - ) - parser.add_argument( - "--model_size", - default=None, - type=str, - required=True, - help="Size of the model (can be either 'small', 'medium' or 'large').", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 
- ) - args = parser.parse_args() - convert_imagegpt_checkpoint_to_pytorch( - args.imagegpt_checkpoint_path, args.model_size, args.pytorch_dump_folder_path - ) diff --git a/src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py b/src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py deleted file mode 100644 index f8b9c86cfddc..000000000000 --- a/src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py +++ /dev/null @@ -1,303 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert InstructBLIP checkpoints from the original repository. - -URL: https://github.com/salesforce/LAVIS/tree/main/projects/instructblip -""" - -import argparse - -import requests -import torch - -# pip3 install salesforce-lavis -# I'm actually installing a slightly modified version: pip3 install git+https://github.com/nielsrogge/LAVIS.git@fix_lavis_float32 (there's also the fix_lavis branch) -# also note: to convert Vicuna checkpoints, we had to include /home/niels/python_projects/checkpoints/FastChat/vicuna-7b in lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml -# same for Vicuna-13b -from lavis.models import load_model_and_preprocess -from PIL import Image - -from transformers import ( - AutoTokenizer, - BlipImageProcessor, - InstructBlipConfig, - InstructBlipForConditionalGeneration, - InstructBlipProcessor, - InstructBlipQFormerConfig, - InstructBlipVisionConfig, - LlamaConfig, - LlamaTokenizerFast, - T5Config, - T5TokenizerFast, -) -from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD - - -def load_demo_image(): - url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - - return image - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - # fmt: off - - # vision encoder - rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) - rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) - rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) - rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) - rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) - rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) - - for i in range(config.vision_config.num_hidden_layers): - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", 
f"vision_model.encoder.layers.{i}.layer_norm2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) - - # QFormer - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.embeddings.layernorm.weight")) - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.embeddings.layernorm.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def read_in_q_v_bias(state_dict, config): - for i in range(config.vision_config.num_hidden_layers): - # read in original q and v biases - q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") - - # next, set bias in the state dict - qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) - state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias - - -def get_blip2_config(model_name): - image_size = 364 if "coco" in model_name else 224 - vision_config = InstructBlipVisionConfig(image_size=image_size).to_dict() - - # make sure the models have proper bos_token_id and eos_token_id set (important for generation) - # seems like flan-T5 models don't have bos_token_id properly set? - if "t5-xl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "t5-xxl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "vicuna-7b" in model_name: - text_config = LlamaConfig.from_pretrained("decapoda-research/llama-7b-hf", vocab_size=32001).to_dict() - elif "vicuna-13b" in model_name: - text_config = LlamaConfig.from_pretrained("decapoda-research/llama-13b-hf", vocab_size=32001).to_dict() - else: - raise ValueError("Model name not supported") - - # the authors add one special "[DEC]" token to the vocab of Q-Former, hence vocab size = 30522 + 1 - qformer_config = InstructBlipQFormerConfig(vocab_size=30523).to_dict() - config = InstructBlipConfig(vision_config=vision_config, text_config=text_config, qformer_config=qformer_config) - - return config, image_size - - -@torch.no_grad() -def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - """ - Copy/paste/tweak model's weights to Transformers design. 
- """ - qformer_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", truncation_side="left") - qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"}) - - if "t5" in model_name: - tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-xl", truncation_side="left") - elif "vicuna" in model_name: - # the following was used in the original implementation: - # tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=False, truncation_side="left") - # tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - # tokenizer.add_special_tokens({"bos_token": ""}) - # tokenizer.add_special_tokens({"eos_token": ""}) - # tokenizer.add_special_tokens({"unk_token": ""}) - tokenizer = LlamaTokenizerFast.from_pretrained( - "huggyllama/llama-7b", truncation_side="left", bos_token="", unk_token="" - ) - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - - config, image_size = get_blip2_config(model_name) - hf_model = InstructBlipForConditionalGeneration(config).eval() - - model_name_to_original = { - "instructblip-vicuna-7b": ("blip2_vicuna_instruct", "vicuna7b"), - "instructblip-vicuna-13b": ("blip2_vicuna_instruct", "vicuna13b"), - "instructblip-flan-t5-xl": ("blip2_t5_instruct", "flant5xl"), - "instructblip-flan-t5-xxl": ("blip2_t5_instruct", "flant5xxl"), - } - - name, type = model_name_to_original[model_name] - - # load original model - print("Loading original model...") - hf_model_device = "cuda:1" if torch.cuda.is_available() else "cpu" - lavis_device = "cuda:2" if torch.cuda.is_available() else "cpu" - original_model, vis_processors, _ = load_model_and_preprocess( - name=name, model_type=type, is_eval=True, device=lavis_device - ) - original_model.eval() - print("Done!") - - # update state dict keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - - # some keys can be renamed efficiently - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if key.startswith("Qformer.bert"): - key = key.replace("Qformer.bert", "qformer") - if "attention.self" in key: - key = key.replace("self", "attention") - if "llm_proj" in key: - key = key.replace("llm_proj", "language_projection") - if "t5_proj" in key: - key = key.replace("t5_proj", "language_projection") - if key.startswith("llm_model"): - key = key.replace("llm_model", "language_model") - if key.startswith("t5"): - key = key.replace("t5", "language") - state_dict[key] = val - - # read in qv biases - read_in_q_v_bias(state_dict, config) - - # note: weights get loaded in torch.float32 by default - hf_model.load_state_dict(state_dict, strict=True) - - image = load_demo_image() - prompt = "What is unusual about this image?" 
- - # create processor - image_processor = BlipImageProcessor( - size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD - ) - processor = InstructBlipProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - qformer_tokenizer=qformer_tokenizer, - ) - inputs = processor(images=image, text=prompt, return_tensors="pt").to(hf_model_device) - - # make sure processor creates exact same pixel values - original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) - pixel_values = inputs.pixel_values - assert torch.allclose(original_pixel_values.to(pixel_values.device), pixel_values) - - original_model.to(lavis_device) - hf_model.to(hf_model_device) - with torch.no_grad(): - if "vicuna" in model_name: - original_logits = original_model({"image": original_pixel_values, "text_input": [prompt]}).logits - logits = hf_model(**inputs).logits - else: - original_logits = original_model( - {"image": original_pixel_values, "text_input": [prompt], "text_output": ["\n"]} - ).logits - label_input_ids = tokenizer("\n", return_tensors="pt").input_ids.to(hf_model_device) - labels = label_input_ids.masked_fill(label_input_ids == tokenizer.pad_token_id, -100) - logits = hf_model(**inputs, labels=labels).logits - - print("First values of original logits:", original_logits[0, :3, :3]) - print("First values of HF logits:", logits[0, :3, :3]) - - # assert values - assert original_logits.shape == logits.shape - atol = 1e-4 if "vicuna" in model_name else 1e-5 - assert torch.allclose(original_logits.to(logits.device), logits, atol=atol) - print("Looks ok!") - - print("Generating with original model...") - original_outputs = original_model.generate({"image": original_pixel_values, "prompt": prompt}, num_beams=5) - - # important: we need to cast the weights of the HF model to the appropriate type - print("Generating with HF model...") - outputs = hf_model.generate( - **inputs, - do_sample=False, - num_beams=5, - max_length=256, - min_length=1, - top_p=0.9, - repetition_penalty=1.5, - length_penalty=1.0, - temperature=1, - ) - if "vicuna" in model_name: - # convert output id 0 to 2 (eos_token_id) - # TODO add this in the generate method? 
- outputs[outputs == 0] = 2 - print("Original generation:", original_outputs) - output_text = processor.batch_decode(outputs, skip_special_tokens=True) - output_text = [text.strip() for text in output_text] - print("HF generation:", output_text) - - if pytorch_dump_folder_path is not None: - processor.save_pretrained(pytorch_dump_folder_path) - hf_model.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - processor.push_to_hub(f"Salesforce/{model_name}") - hf_model.push_to_hub(f"Salesforce/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - choices = [ - "instructblip-vicuna-7b", - "instructblip-vicuna-13b", - "instructblip-flan-t5-xl", - "instructblip-flan-t5-xxl", - ] - parser.add_argument( - "--model_name", - default="instructblip-flan-t5-xl", - choices=choices, - type=str, - help="Path to hf config.json of model to convert", - ) - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model and processor to the hub after converting", - ) - - args = parser.parse_args() - - convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py b/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py deleted file mode 100644 index 9b3d508db6ff..000000000000 --- a/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py +++ /dev/null @@ -1,305 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert InstructBlipVideo checkpoints from the original repository. 
- -URL: https://github.com/salesforce/LAVIS/tree/main/projects/instructblipvideo -""" - -import argparse - -import requests -import torch - -# pip3 install salesforce-lavis -# I'm actually installing a slightly modified version: pip3 install git+https://github.com/nielsrogge/LAVIS.git@fix_lavis_float32 (there's also the fix_lavis branch) -# also note: to convert Vicuna checkpoints, we had to include /home/niels/python_projects/checkpoints/FastChat/vicuna-7b in lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml -# same for Vicuna-13b -from lavis.models import load_model_and_preprocess -from PIL import Image - -from transformers import ( - AutoTokenizer, - BlipImageProcessor, - InstructBlipProcessor, - InstructBlipVideoConfig, - InstructBlipVideoForConditionalGeneration, - InstructBlipVideoQFormerConfig, - InstructBlipVideoVisionConfig, - LlamaConfig, - LlamaTokenizerFast, - T5Config, - T5TokenizerFast, -) -from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD - - -def load_demo_image(): - url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - - return image - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - # fmt: off - - # vision encoder - rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) - rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) - rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) - rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) - rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) - rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) - - for i in range(config.vision_config.num_hidden_layers): - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) - - # QFormer - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.embeddings.layernorm.weight")) - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", 
"qformer.embeddings.layernorm.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def read_in_q_v_bias(state_dict, config): - for i in range(config.vision_config.num_hidden_layers): - # read in original q and v biases - q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") - - # next, set bias in the state dict - qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) - state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias - - -def get_blip2_config(model_name): - image_size = 364 if "coco" in model_name else 224 - vision_config = InstructBlipVideoVisionConfig(image_size=image_size).to_dict() - - # make sure the models have proper bos_token_id and eos_token_id set (important for generation) - # seems like flan-T5 models don't have bos_token_id properly set? - if "t5-xl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "t5-xxl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "vicuna-7b" in model_name: - text_config = LlamaConfig.from_pretrained("decapoda-research/llama-7b-hf", vocab_size=32001).to_dict() - elif "vicuna-13b" in model_name: - text_config = LlamaConfig.from_pretrained("decapoda-research/llama-13b-hf", vocab_size=32001).to_dict() - else: - raise ValueError("Model name not supported") - - # the authors add one special "[DEC]" token to the vocab of Q-Former, hence vocab size = 30522 + 1 - qformer_config = InstructBlipVideoQFormerConfig(vocab_size=30523).to_dict() - config = InstructBlipVideoConfig( - vision_config=vision_config, text_config=text_config, qformer_config=qformer_config - ) - - return config, image_size - - -@torch.no_grad() -def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - """ - Copy/paste/tweak model's weights to Transformers design. 
- """ - qformer_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", truncation_side="left") - qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"}) - - if "t5" in model_name: - tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-xl", truncation_side="left") - elif "vicuna" in model_name: - # the following was used in the original implementation: - # tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=False, truncation_side="left") - # tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - # tokenizer.add_special_tokens({"bos_token": ""}) - # tokenizer.add_special_tokens({"eos_token": ""}) - # tokenizer.add_special_tokens({"unk_token": ""}) - tokenizer = LlamaTokenizerFast.from_pretrained( - "huggyllama/llama-7b", truncation_side="left", bos_token="", unk_token="" - ) - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - - config, image_size = get_blip2_config(model_name) - hf_model = InstructBlipVideoForConditionalGeneration(config).eval() - - model_name_to_original = { - "instructblipvideo-vicuna-7b": ("blip2_vicuna_instruct", "vicuna7b"), - "instructblipvideo-vicuna-13b": ("blip2_vicuna_instruct", "vicuna13b"), - "instructblipvideo-flan-t5-xl": ("blip2_t5_instruct", "flant5xl"), - "instructblipvideo-flan-t5-xxl": ("blip2_t5_instruct", "flant5xxl"), - } - - name, type = model_name_to_original[model_name] - - # load original model - print("Loading original model...") - hf_model_device = "cuda:1" if torch.cuda.is_available() else "cpu" - lavis_device = "cuda:2" if torch.cuda.is_available() else "cpu" - original_model, vis_processors, _ = load_model_and_preprocess( - name=name, model_type=type, is_eval=True, device=lavis_device - ) - original_model.eval() - print("Done!") - - # update state dict keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - - # some keys can be renamed efficiently - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if key.startswith("Qformer.bert"): - key = key.replace("Qformer.bert", "qformer") - if "attention.self" in key: - key = key.replace("self", "attention") - if "llm_proj" in key: - key = key.replace("llm_proj", "language_projection") - if "t5_proj" in key: - key = key.replace("t5_proj", "language_projection") - if key.startswith("llm_model"): - key = key.replace("llm_model", "language_model") - if key.startswith("t5"): - key = key.replace("t5", "language") - state_dict[key] = val - - # read in qv biases - read_in_q_v_bias(state_dict, config) - - # note: weights get loaded in torch.float32 by default - hf_model.load_state_dict(state_dict, strict=True) - - image = load_demo_image() - prompt = "What is unusual about this image?" 
- - # create processor - image_processor = BlipImageProcessor( - size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD - ) - processor = InstructBlipProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - qformer_tokenizer=qformer_tokenizer, - ) - inputs = processor(images=image, text=prompt, return_tensors="pt").to(hf_model_device) - - # make sure processor creates exact same pixel values - original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) - pixel_values = inputs.pixel_values - assert torch.allclose(original_pixel_values.to(pixel_values.device), pixel_values) - - original_model.to(lavis_device) - hf_model.to(hf_model_device) - with torch.no_grad(): - if "vicuna" in model_name: - original_logits = original_model({"image": original_pixel_values, "text_input": [prompt]}).logits - logits = hf_model(**inputs).logits - else: - original_logits = original_model( - {"image": original_pixel_values, "text_input": [prompt], "text_output": ["\n"]} - ).logits - label_input_ids = tokenizer("\n", return_tensors="pt").input_ids.to(hf_model_device) - labels = label_input_ids.masked_fill(label_input_ids == tokenizer.pad_token_id, -100) - logits = hf_model(**inputs, labels=labels).logits - - print("First values of original logits:", original_logits[0, :3, :3]) - print("First values of HF logits:", logits[0, :3, :3]) - - # assert values - assert original_logits.shape == logits.shape - atol = 1e-4 if "vicuna" in model_name else 1e-5 - assert torch.allclose(original_logits.to(logits.device), logits, atol=atol) - print("Looks ok!") - - print("Generating with original model...") - original_outputs = original_model.generate({"image": original_pixel_values, "prompt": prompt}, num_beams=5) - - # important: we need to cast the weights of the HF model to the appropriate type - print("Generating with HF model...") - outputs = hf_model.generate( - **inputs, - do_sample=False, - num_beams=5, - max_length=256, - min_length=1, - top_p=0.9, - repetition_penalty=1.5, - length_penalty=1.0, - temperature=1, - ) - if "vicuna" in model_name: - # convert output id 0 to 2 (eos_token_id) - # TODO add this in the generate method? 
- outputs[outputs == 0] = 2 - print("Original generation:", original_outputs) - output_text = processor.batch_decode(outputs, skip_special_tokens=True) - output_text = [text.strip() for text in output_text] - print("HF generation:", output_text) - - if pytorch_dump_folder_path is not None: - processor.save_pretrained(pytorch_dump_folder_path) - hf_model.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - processor.push_to_hub(f"Salesforce/{model_name}") - hf_model.push_to_hub(f"Salesforce/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - choices = [ - "instructblipvideo-vicuna-7b", - "instructblipvideo-vicuna-13b", - "instructblipvideo-flan-t5-xl", - "instructblipvideo-flan-t5-xxl", - ] - parser.add_argument( - "--model_name", - default="instructblipvideo-flan-t5-xl", - choices=choices, - type=str, - help="Path to hf config.json of model to convert", - ) - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model and processor to the hub after converting", - ) - - args = parser.parse_args() - - convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/kosmos2/convert_kosmos2_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/kosmos2/convert_kosmos2_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 04c7712aa846..000000000000 --- a/src/transformers/models/kosmos2/convert_kosmos2_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,77 +0,0 @@ -import argparse - -from fairseq.checkpoint_utils import load_checkpoint_to_cpu - -from transformers import Kosmos2Config, Kosmos2ForConditionalGeneration - - -KEYS_TO_MODIFY_MAPPING = { - "gpt_model.decoder.output_projection": "text_model.lm_head", - "gpt_model.decoder": "text_model.model", - "img_connector": "image_to_text_projection", - "img_model.visual.class_embedding": "vision_model.model.embeddings.class_embedding", - "img_model.visual.positional_embedding": "vision_model.model.embeddings.position_embedding.weight", - "img_model.visual.conv1": "vision_model.model.embeddings.patch_embedding", - "img_model.visual": "vision_model.model", - "ln_pre": "pre_layrnorm", - "ln_post": "post_layernorm", - "transformer.resblocks": "encoder.layers", - "ts_attn": "self_attn", - "ln_1": "layer_norm1", - "ln_2": "layer_norm2", - "c_fc": "fc1", - "c_proj": "fc2", -} - - -KEYS_TO_IGNORE = [ - # this buffer in the original code is only used to send weights to the desired device - "gpt_model.decoder.embed_positions._float_tensor", - # this weight is never used in the forward in the original KOSMOS-2) - "gpt_model.decoder.self_attn_sope.scale", -] - - -def rename_key(key): - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - return key - - -def convert_kosmos2_checkpoint_to_pytorch(checkpoint_path, pytorch_dump_folder_path): - state = load_checkpoint_to_cpu(checkpoint_path) - state_dict = state["model"] - state_dict_keys = list(state_dict.keys()) - - config = Kosmos2Config() - # This is necessary to match the results given by the original demo - config.text_config.no_repeat_ngram_size = 3 - model = Kosmos2ForConditionalGeneration(config) - - # convert (by renaming keys) - converted_state_dict = {} - for key in state_dict_keys: - if key in KEYS_TO_IGNORE: - continue 
- renamed_key = rename_key(key) - converted_state_dict[renamed_key] = state_dict[key] - - # check weight loading - model.load_state_dict(converted_state_dict, strict=True) - # save the result - model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--kosmos2_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_kosmos2_checkpoint_to_pytorch(args.kosmos2_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/levit/convert_levit_timm_to_pytorch.py b/src/transformers/models/levit/convert_levit_timm_to_pytorch.py deleted file mode 100644 index afef3f73de6c..000000000000 --- a/src/transformers/models/levit/convert_levit_timm_to_pytorch.py +++ /dev/null @@ -1,180 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert LeViT checkpoints from timm.""" - -import argparse -import json -from collections import OrderedDict -from functools import partial -from pathlib import Path - -import timm -import torch -from huggingface_hub import hf_hub_download - -from transformers import LevitConfig, LevitForImageClassificationWithTeacher, LevitImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger() - - -def convert_weight_and_push( - hidden_sizes: int, name: str, config: LevitConfig, save_directory: Path, push_to_hub: bool = True -): - print(f"Converting {name}...") - - with torch.no_grad(): - if hidden_sizes == 128: - if name[-1] == "S": - from_model = timm.create_model("levit_128s", pretrained=True) - else: - from_model = timm.create_model("levit_128", pretrained=True) - if hidden_sizes == 192: - from_model = timm.create_model("levit_192", pretrained=True) - if hidden_sizes == 256: - from_model = timm.create_model("levit_256", pretrained=True) - if hidden_sizes == 384: - from_model = timm.create_model("levit_384", pretrained=True) - - from_model.eval() - our_model = LevitForImageClassificationWithTeacher(config).eval() - huggingface_weights = OrderedDict() - - weights = from_model.state_dict() - og_keys = list(from_model.state_dict().keys()) - new_keys = list(our_model.state_dict().keys()) - print(len(og_keys), len(new_keys)) - for i in range(len(og_keys)): - huggingface_weights[new_keys[i]] = weights[og_keys[i]] - our_model.load_state_dict(huggingface_weights) - - x = torch.randn((2, 3, 224, 224)) - out1 = from_model(x) - out2 = our_model(x).logits - - assert torch.allclose(out1, out2), "The model logits don't match the original one." 
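The LeViT copy above works purely by position: it zips the timm state-dict keys with the HF state-dict keys and assumes both modules enumerate their parameters in the same order. A toy version of that ordered copy between two equivalent modules (hypothetical layer names, not LeViT):

```python
from collections import OrderedDict

import torch
from torch import nn

src = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))
dst = nn.Sequential(OrderedDict([("proj", nn.Linear(4, 4)), ("act", nn.ReLU()), ("head", nn.Linear(4, 2))]))

# copy by position: correct only because both modules list their parameters in the same order
new_state = OrderedDict(
    (dst_key, value) for (_, value), dst_key in zip(src.state_dict().items(), dst.state_dict().keys())
)
dst.load_state_dict(new_state)

x = torch.randn(2, 4)
assert torch.allclose(src(x), dst(x)), "outputs should match after the ordered copy"
```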
- - checkpoint_name = name - print(checkpoint_name) - - if push_to_hub: - our_model.save_pretrained(save_directory / checkpoint_name) - image_processor = LevitImageProcessor() - image_processor.save_pretrained(save_directory / checkpoint_name) - - print(f"Pushed {checkpoint_name}") - - -def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True): - filename = "imagenet-1k-id2label.json" - num_labels = 1000 - expected_shape = (1, num_labels) - - repo_id = "huggingface/label-files" - num_labels = num_labels - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - - id2label = id2label - label2id = {v: k for k, v in id2label.items()} - - ImageNetPreTrainedConfig = partial(LevitConfig, num_labels=num_labels, id2label=id2label, label2id=label2id) - - names_to_hidden_sizes = { - "levit-128S": 128, - "levit-128": 128, - "levit-192": 192, - "levit-256": 256, - "levit-384": 384, - } - - names_to_config = { - "levit-128S": ImageNetPreTrainedConfig( - hidden_sizes=[128, 256, 384], - num_attention_heads=[4, 6, 8], - depths=[2, 3, 4], - key_dim=[16, 16, 16], - drop_path_rate=0, - ), - "levit-128": ImageNetPreTrainedConfig( - hidden_sizes=[128, 256, 384], - num_attention_heads=[4, 8, 12], - depths=[4, 4, 4], - key_dim=[16, 16, 16], - drop_path_rate=0, - ), - "levit-192": ImageNetPreTrainedConfig( - hidden_sizes=[192, 288, 384], - num_attention_heads=[3, 5, 6], - depths=[4, 4, 4], - key_dim=[32, 32, 32], - drop_path_rate=0, - ), - "levit-256": ImageNetPreTrainedConfig( - hidden_sizes=[256, 384, 512], - num_attention_heads=[4, 6, 8], - depths=[4, 4, 4], - key_dim=[32, 32, 32], - drop_path_rate=0, - ), - "levit-384": ImageNetPreTrainedConfig( - hidden_sizes=[384, 512, 768], - num_attention_heads=[6, 9, 12], - depths=[4, 4, 4], - key_dim=[32, 32, 32], - drop_path_rate=0.1, - ), - } - - if model_name: - convert_weight_and_push( - names_to_hidden_sizes[model_name], model_name, names_to_config[model_name], save_directory, push_to_hub - ) - else: - for model_name, config in names_to_config.items(): - convert_weight_and_push(names_to_hidden_sizes[model_name], model_name, config, save_directory, push_to_hub) - return config, expected_shape - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default=None, - type=str, - help="The name of the model you wish to convert, it must be one of the supported Levit* architecture,", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="levit-dump-folder/", - type=Path, - required=False, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub") - parser.add_argument( - "--no-push_to_hub", - dest="push_to_hub", - action="store_false", - help="Do not push model and image processor to the hub", - ) - - args = parser.parse_args() - pytorch_dump_folder_path: Path = args.pytorch_dump_folder_path - pytorch_dump_folder_path.mkdir(exist_ok=True, parents=True) - convert_weights_and_push(pytorch_dump_folder_path, args.model_name, args.push_to_hub) diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py deleted file mode 100644 index eb2862eb203d..000000000000 --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py +++ /dev/null @@ -1,601 +0,0 @@ -# Copyright 2022 
EleutherAI and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import gc -import json -import os -import tempfile -import warnings -from typing import List - -import torch -from tokenizers import AddedToken, processors - -from transformers import GenerationConfig, LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast -from transformers.convert_slow_tokenizer import TikTokenConverter - - -try: - from transformers import LlamaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - LlamaTokenizerFast = None - -""" -Sample usage: - -``` -python src/transformers/models/llama/convert_llama_weights_to_hf.py \ - --input_dir /path/to/downloaded/llama/weights --model_size 1B --llama_version 3.2 --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import LlamaForCausalLM, LlamaTokenizer - -model = LlamaForCausalLM.from_pretrained("/output/path") -tokenizer = LlamaTokenizer.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). 
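As a rough illustration of why the full model must fit in RAM: each `consolidated.*.pth` shard holds a slice of every weight, so all shards are loaded before any tensor can be stitched back together. A minimal sketch of that loading step (the path is a placeholder and the snippet is not part of the script itself):

```py
import os
import torch

input_base_path = "/path/to/downloaded/llama/weights"  # placeholder
shard_files = sorted(f for f in os.listdir(input_base_path) if f.endswith(".pth"))
shards = [torch.load(os.path.join(input_base_path, f), map_location="cpu") for f in shard_files]

# A sharded projection such as wq is recovered by concatenating its slices,
# one per shard, along the output dimension (before the rotary permutation
# that the conversion applies afterwards).
full_wq = torch.cat([shard["layers.0.attention.wq.weight"] for shard in shards], dim=0)
```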
- -If you want your tokenizer to add a bos automatically you should update the tokenizer._tokenizers.post_processor: - -```py -from tokenizers import processors -bos = "<|begin_of_text|>" -tokenizer._tokenizers.post_processor = processors.Sequence( - [ - processors.ByteLevel(trim_offsets=False), - processors.TemplateProcessing( - single=f"{bos}:0 $A:0", - pair=f"{bos}:0 $A:0 {bos}:1 $B:1", - special_tokens=[ - (bos, tokenizer.encode(bos)), - ], - ), - ] -) -``` -""" - -NUM_SHARDS = { - "1B": 1, - "3B": 1, - "7B": 1, - "8B": 1, - "8Bf": 1, - "7Bf": 1, - "13B": 2, - "13Bf": 2, - "34B": 4, - "30B": 4, - "65B": 8, - "70B": 8, - "70Bf": 8, - "405B": 8, - "405B-MP16": 16, -} - -CONTEXT_LENGTH_FOR_VERSION = {"Guard-3": 131072, "3.2": 131072, "3.1": 131072, "3": 8192, "2": 4096, "1": 2048} - -BOS_ADDED_TOKEN = AddedToken( - "<|begin_of_text|>", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True -) -EOS_ADDED_TOKEN = AddedToken( - "<|end_of_text|>", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True -) -EOT_ADDED_TOKEN = AddedToken( - "<|eot_id|>", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True -) - -DEFAULT_LLAMA_SPECIAL_TOKENS = { - "3": [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|reserved_special_token_2|>", - "<|reserved_special_token_3|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|reserved_special_token_4|>", - "<|eot_id|>", # end of turn - ] - + [f"<|reserved_special_token_{i}|>" for i in range(5, 256 - 5)], - "3.1": [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|finetune_right_pad_id|>", - "<|reserved_special_token_2|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|eom_id|>", # end of message - "<|eot_id|>", # end of turn - "<|python_tag|>", - ] - + [f"<|reserved_special_token_{i}|>" for i in range(3, 256 - 8)], - "3.2": [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|finetune_right_pad_id|>", - "<|reserved_special_token_2|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|eom_id|>", # end of message - "<|eot_id|>", # end of turn - "<|python_tag|>", - ] - + [f"<|reserved_special_token_{i}|>" for i in range(3, 256 - 8)], - "Guard-3": [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|finetune_right_pad_id|>", - "<|reserved_special_token_2|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|eom_id|>", # end of message - "<|eot_id|>", # end of turn - "<|python_tag|>", - ] - + [f"<|reserved_special_token_{i}|>" for i in range(3, 256 - 8)], -} - - -def is_llama_3(version): - return version in ["3", "3.1", "3.2", "Guard-3"] - - -def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): - return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) - - -def read_json(path): - with open(path, "r") as f: - return json.load(f) - - -def write_json(text, path): - with open(path, "w") as f: - json.dump(text, f) - - -def write_model( - model_path, - input_base_path, - model_size=None, - safe_serialization=True, - llama_version="1", - vocab_size=None, - num_shards=None, - instruct=False, - push_to_hub=False, -): - print("Converting the model.") - params = read_json(os.path.join(input_base_path, "params.json")) - num_shards = NUM_SHARDS[model_size] if num_shards 
is None else num_shards - params = params.get("model", params) - n_layers = params["n_layers"] - n_heads = params["n_heads"] - n_heads_per_shard = n_heads // num_shards - dim = params["dim"] - dims_per_head = dim // n_heads - base = params.get("rope_theta", 10000.0) - inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) - if base > 10000.0 and not is_llama_3(llama_version): - max_position_embeddings = 16384 - else: - max_position_embeddings = CONTEXT_LENGTH_FOR_VERSION[llama_version] - - if params.get("n_kv_heads", None) is not None: - num_key_value_heads = params["n_kv_heads"] # for GQA / MQA - num_key_value_heads_per_shard = num_key_value_heads // num_shards - key_value_dim = dims_per_head * num_key_value_heads - else: # compatibility with other checkpoints - num_key_value_heads = n_heads - num_key_value_heads_per_shard = n_heads_per_shard - key_value_dim = dim - - # permute for sliced rotary - def permute(w, n_heads, dim1=dim, dim2=dim): - return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) - - with tempfile.TemporaryDirectory() as tmp_model_path: - print(f"Fetching all parameters from the checkpoint at {input_base_path}.") - # Load weights - if num_shards == 1: - # Not sharded - # (The sharded implementation would also work, but this is simpler.) - loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu") - else: - # Sharded - checkpoint_list = sorted([file for file in os.listdir(input_base_path) if file.endswith(".pth")]) - print("Loading in order:", checkpoint_list) - loaded = [torch.load(os.path.join(input_base_path, file), map_location="cpu") for file in checkpoint_list] - param_count = 0 - index_dict = {"weight_map": {}} - for layer_i in range(n_layers): - filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" - if num_shards == 1: - # Unsharded - state_dict = { - f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wq.weight"], n_heads=n_heads - ), - f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wk.weight"], - n_heads=num_key_value_heads, - dim1=key_value_dim, - ), - f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], - f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], - f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], - f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], - f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], - f"model.layers.{layer_i}.input_layernorm.weight": loaded[ - f"layers.{layer_i}.attention_norm.weight" - ], - f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[ - f"layers.{layer_i}.ffn_norm.weight" - ], - } - else: - # Sharded - # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share - # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is - # redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned. 
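For reference, the `permute` helper defined above reorders each attention head's rows from the interleaved rotary layout of the original checkpoints to the half-split layout expected by the HF Llama implementation. A toy example with assumed small sizes (illustrative only, not part of the conversion):

```py
import torch

def permute(w, n_heads, dim1, dim2):
    return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)

n_heads, head_dim = 2, 4
dim = n_heads * head_dim
w = torch.arange(dim * dim).reshape(dim, dim)
# Per head, the row order changes from [0, 1, 2, 3] to [0, 2, 1, 3]:
# even-indexed rows are grouped into the first half of the head, odd-indexed
# rows into the second half, matching the non-interleaved rotary used by HF.
print(permute(w, n_heads, dim, dim)[:4, 0])  # tensor([ 0, 16,  8, 24])
```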
- - state_dict = { - f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ - f"layers.{layer_i}.attention_norm.weight" - ].clone(), - f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ - f"layers.{layer_i}.ffn_norm.weight" - ].clone(), - } - state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wq.weight"].view( - n_heads_per_shard, dims_per_head, dim - ) - for i in range(len(loaded)) - ], - dim=0, - ).reshape(dim, dim), - n_heads=n_heads, - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( - num_key_value_heads_per_shard, dims_per_head, dim - ) - for i in range(len(loaded)) - ], - dim=0, - ).reshape(key_value_dim, dim), - num_key_value_heads, - key_value_dim, - dim, - ) - state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( - num_key_value_heads_per_shard, dims_per_head, dim - ) - for i in range(len(loaded)) - ], - dim=0, - ).reshape(key_value_dim, dim) - - state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(len(loaded))], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(len(loaded))], dim=0 - ) - state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(len(loaded))], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(len(loaded))], dim=0 - ) - - state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq - for k, v in state_dict.items(): - index_dict["weight_map"][k] = filename - param_count += v.numel() - torch.save(state_dict, os.path.join(tmp_model_path, filename)) - - filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" - if num_shards == 1: - # Unsharded - state_dict = { - "model.embed_tokens.weight": loaded["tok_embeddings.weight"], - "model.norm.weight": loaded["norm.weight"], - "lm_head.weight": loaded["output.weight"], - } - else: - concat_dim = 0 if is_llama_3(llama_version) else 1 - state_dict = { - "model.norm.weight": loaded[0]["norm.weight"], - "model.embed_tokens.weight": torch.cat( - [loaded[i]["tok_embeddings.weight"] for i in range(len(loaded))], dim=concat_dim - ), - "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(len(loaded))], dim=0), - } - - for k, v in state_dict.items(): - index_dict["weight_map"][k] = filename - param_count += v.numel() - torch.save(state_dict, os.path.join(tmp_model_path, filename)) - - # Write configs - index_dict["metadata"] = {"total_size": param_count * 2} - write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) - ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1 - multiple_of = params["multiple_of"] if "multiple_of" in params else 256 - - if is_llama_3(llama_version): - bos_token_id = 128000 - - if instruct: - eos_token_id = [128001, 128008, 128009] - else: - eos_token_id = 128001 - else: - bos_token_id = 1 - eos_token_id = 2 - - if llama_version in ["3.1", "3.2", "Guard-3"]: - rope_scaling = { - "factor": 32.0 if llama_version == "3.2" else 8.0, - 
"low_freq_factor": 1.0, - "high_freq_factor": 4.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3", - } - else: - rope_scaling = None - - config = LlamaConfig( - hidden_size=dim, - intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), - num_attention_heads=params["n_heads"], - num_hidden_layers=params["n_layers"], - rms_norm_eps=params["norm_eps"], - num_key_value_heads=num_key_value_heads, - vocab_size=vocab_size, - rope_theta=base, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=True if llama_version in ["3.2"] else False, - ) - - config.save_pretrained(tmp_model_path) - - generation_config = GenerationConfig( - do_sample=True, - temperature=0.6, - top_p=0.9, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - ) - generation_config.save_pretrained(tmp_model_path) - - # Make space so we can load the model properly now. - del state_dict - del loaded - gc.collect() - - print("Loading the checkpoint in a Llama model.") - model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True) - - # Avoid saving this as part of the config. - del model.config._name_or_path - model.config.torch_dtype = torch.float16 - - print("Saving in the Transformers format.") - if push_to_hub: - print("Pushing to the hub.") - model.push_to_hub(model_path, safe_serialization=safe_serialization, private=True, use_temp_dir=True) - else: - print("Saving to disk.") - model.save_pretrained(model_path, safe_serialization=safe_serialization) - - -class Llama3Converter(TikTokenConverter): - def __init__(self, vocab_file, special_tokens=None, instruct=False, llama_version="3.2", **kwargs): - super().__init__(vocab_file, additional_special_tokens=special_tokens, **kwargs) - tokenizer = self.converted() - - # References for chat templates in instruct models - templates_for_version = { - "2": ("meta-llama/Llama-2-7b-chat-hf", "f5db02db724555f92da89c216ac04704f23d4590"), - "3": ("meta-llama/Meta-Llama-3-8B-Instruct", "5f0b02c75b57c5855da9ae460ce51323ea669d8a"), - "3.1": ("meta-llama/Llama-3.1-8B-Instruct", "0e9e39f249a16976918f6564b8830bc894c89659"), - "3.2": ("meta-llama/Llama-3.2-1B-Instruct", "e9f8effbab1cbdc515c11ee6e098e3d5a9f51e14"), - "Guard-3": ("meta-llama/Llama-Guard-3-1B", "acf7aafa60f0410f8f42b1fa35e077d705892029"), - } - - # Add chat_template only if instruct is True. - # Prevents a null chat_template, which triggers - # a parsing warning in the Hub. 
- additional_kwargs = {} - if instruct or llama_version in ["Guard-3"]: - model_id, revision = templates_for_version.get(llama_version, (None, None)) - if model_id is not None: - from transformers import AutoTokenizer - - t = AutoTokenizer.from_pretrained(model_id, revision=revision) - additional_kwargs["chat_template"] = t.chat_template - - self.converted_tokenizer = PreTrainedTokenizerFast( - tokenizer_object=tokenizer, - bos_token="<|begin_of_text|>", - eos_token="<|end_of_text|>" if not instruct else "<|eot_id|>", - model_input_names=["input_ids", "attention_mask"], - model_max_length=CONTEXT_LENGTH_FOR_VERSION[llama_version], - clean_up_tokenization_spaces=True, - **additional_kwargs, - ) - self.update_post_processor(self.converted_tokenizer) - # finer special_tokens_map.json - self.converted_tokenizer._bos_token = BOS_ADDED_TOKEN - self.converted_tokenizer._eos_token = EOT_ADDED_TOKEN if instruct else EOS_ADDED_TOKEN - - # We can't do this while building the tokenizer because we have no easy access to the bos token id - def update_post_processor(self, tokenizer): - tokenizer._tokenizer.post_processor = processors.Sequence( - [ - processors.ByteLevel(trim_offsets=False), - processors.TemplateProcessing( - single="<|begin_of_text|> $A", - pair="<|begin_of_text|>:0 $A:0 <|begin_of_text|>:1 $B:1", - special_tokens=[ - ("<|begin_of_text|>", tokenizer.convert_tokens_to_ids("<|begin_of_text|>")), - ], - ), - ] - ) - - -def write_tokenizer( - tokenizer_path, input_tokenizer_path, llama_version="2", special_tokens=None, instruct=False, push_to_hub=False -): - print("Converting the tokenizer.") - tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast - if is_llama_3(llama_version): - tokenizer = Llama3Converter( - input_tokenizer_path, - special_tokens, - instruct, - llama_version, - ).converted_tokenizer - else: - try: - tokenizer = tokenizer_class(input_tokenizer_path) - except Exception: - raise ValueError( - "Failed to instantiate tokenizer. Please, make sure you have sentencepiece and protobuf installed." - ) - - if push_to_hub: - print(f"Pushing a {tokenizer_class.__name__} to the Hub repo - {tokenizer_path}.") - tokenizer.push_to_hub(tokenizer_path, private=True, use_temp_dir=True) - else: - print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") - tokenizer.save_pretrained(tokenizer_path) - return tokenizer - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - help="Location of Llama weights, which contains tokenizer.model and model folders", - ) - parser.add_argument( - "--model_size", - default=None, - help="'f' Deprecated in favor of `num_shards`: models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, checkout the original repo: https://huggingface.co/meta-llama", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally.", - action="store_true", - default=False, - ) - parser.add_argument( - "--safe_serialization", action="store_true", default=True, help="Whether or not to save using `safetensors`." - ) - # Different Llama versions used different default values for max_position_embeddings, hence the need to be able to specify which version is being used. 
- parser.add_argument( - "--llama_version", - choices=["1", "2", "3", "3.1", "3.2", "Guard-3"], - default="1", - type=str, - help="Version of the Llama model to convert. Currently supports Llama1 and Llama2. Controls the context size", - ) - parser.add_argument( - "--num_shards", - default=None, - type=int, - help="The number of individual shards used for the model. Does not have to be the same as the number of consolidated_xx.pth", - ) - parser.add_argument( - "--special_tokens", - default=None, - type=List[str], - help="The list of special tokens that should be added to the model.", - ) - parser.add_argument( - "--instruct", - action="store_true", - default=False, - help="Whether the model is an instruct model or not. Will affect special tokens and chat template.", - ) - args = parser.parse_args() - if args.model_size is None and args.num_shards is None: - raise ValueError("You have to set at least `num_shards` if you are not giving the `model_size`") - if args.special_tokens is None: - # no special tokens by default - args.special_tokens = DEFAULT_LLAMA_SPECIAL_TOKENS.get(str(args.llama_version), []) - - spm_path = os.path.join(args.input_dir, "tokenizer.model") - vocab_size = len( - write_tokenizer( - args.output_dir, - spm_path, - llama_version=args.llama_version, - special_tokens=args.special_tokens, - instruct=args.instruct, - push_to_hub=args.push_to_hub, - ) - ) - - if args.model_size != "tokenizer_only": - write_model( - model_path=args.output_dir, - input_base_path=args.input_dir, - model_size=args.model_size, - safe_serialization=args.safe_serialization, - llama_version=args.llama_version, - vocab_size=vocab_size, - num_shards=args.num_shards, - instruct=args.instruct, - push_to_hub=args.push_to_hub, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/llava/convert_llava_weights_to_hf.py b/src/transformers/models/llava/convert_llava_weights_to_hf.py deleted file mode 100644 index 3582b9772c9c..000000000000 --- a/src/transformers/models/llava/convert_llava_weights_to_hf.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
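The converter below (and the LLaVa-NeXT variants that follow) maps the original checkpoint keys to the HF layout by simple substring replacement. A condensed sketch of that pattern, using only a hypothetical subset of the `KEYS_TO_MODIFY_MAPPING` defined below and an illustrative helper name:

```py
import torch

# Illustrative subset of the real mapping defined in the converter below.
RENAMES = {
    "model.mm_projector": "multi_modal_projector",
    "lm_head": "language_model.lm_head",
}

def rename_keys(state_dict):
    new_state_dict = {}
    for key, value in state_dict.items():
        if key.endswith(".inv_freq"):
            continue  # rotary inv_freq buffers are dropped by the converter
        for old, new in RENAMES.items():
            if old in key:
                key = key.replace(old, new)
        new_state_dict[key] = value
    return new_state_dict

print(rename_keys({"model.mm_projector.0.weight": torch.zeros(1), "lm_head.weight": torch.zeros(1)}))
# {'multi_modal_projector.0.weight': tensor([0.]), 'language_model.lm_head.weight': tensor([0.])}
```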
-import argparse -import glob - -import torch -from huggingface_hub import file_exists, hf_hub_download, snapshot_download -from safetensors import safe_open - -from transformers import ( - AddedToken, - AutoConfig, - AutoImageProcessor, - AutoTokenizer, - LlavaConfig, - LlavaForConditionalGeneration, - LlavaProcessor, - SiglipVisionConfig, -) - - -EPILOG_TXT = """Example: - python transformers/src/transformers/models/llava/convert_llava_weights_to_hf.py --text_model_id lmsys/vicuna-7b-v1.5 --vision_model_id openai/clip-vit-large-patch14-336 --output_hub_path org/llava-v1.5-7b-conv --old_state_dict_id liuhaotian/llava-v1.5-7b - -Example for creating the old state dict file with Python: - - import torch - from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM - - # load model - kwargs = {"device_map": "auto", "torch_dtype": torch.float16} - model = LlavaLlamaForCausalLM.from_pretrained("liuhaotian/llava-v1.5-7b", low_cpu_mem_usage=True, **kwargs) - - # load vision tower - model.get_vision_tower().load_model() - - # Save state dict - torch.save(model.state_dict(), "tmp/hf_models/llava-v1.5-7b/model_state_dict.bin") -""" - -KEYS_TO_MODIFY_MAPPING = { - "model.vision_tower.": "", - ".vision_resampler": "", # all lmms-lab models do avg pooling, so no vision_resampler - "model.mm_projector": "multi_modal_projector", - "model": "model.model", - "vision_model.model": "vision_model", - "lm_head": "language_model.lm_head", - "model.model": "language_model.model", - "multi_modal_projector.0": "multi_modal_projector.linear_1", - "multi_modal_projector.2": "multi_modal_projector.linear_2", -} - - -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - # tied wieghts so lm.head is not saved. 
Let's clone to load state dict - if "lm_head.weight" not in original_state_dict: - original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone() - - if "model.image_newline" in original_state_dict: - # not used in the original implementation because "merge_type=flat" - del original_state_dict["model.image_newline"] - return original_state_dict - - -# used only for llava-interlave -# for ex: Qwen/Qwen1.5-0.5B-Chat google/siglip-so400m-patch14-384 lmms-lab/llava-next-interleave-qwen-0.5b -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value - return new_state_dict - - -def convert_llava_llama_to_hf(text_model_id, vision_model_id, output_hub_path, old_state_dict_id): - torch.set_default_dtype(torch.float16) - text_config = AutoConfig.from_pretrained(text_model_id) - - tokenizer = AutoTokenizer.from_pretrained(text_model_id) - tokenizer.add_tokens(AddedToken("", special=True, normalized=False), special_tokens=True) - if "Qwen" not in text_model_id: # qwen already has a pad token - tokenizer.add_special_tokens({"pad_token": ""}) - - image_processor = AutoImageProcessor.from_pretrained(vision_model_id) - processor = LlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) - - if "siglip" in vision_model_id: - vision_config = SiglipVisionConfig( - hidden_size=1152, - image_size=384, - intermediate_size=4304, - num_attention_heads=16, - num_hidden_layers=26, - patch_size=14, - vision_use_head=False, - ).to_dict() - else: - vision_config = None - - config = LlavaConfig( - text_config=text_config, - vision_config=vision_config, - ) - - # llms-lab interleeave models do not use any selection startegy except for last hidden state - if "Qwen" in text_model_id: - config.image_token_index = 151646 - if "siglip" in vision_model_id: - config.vision_feature_select_strategy = "full" - config.vision_feature_layer = -1 - else: - config.pad_token_id = 32001 - config.image_token_index = 32000 - - with torch.device("meta"): - model = LlavaForConditionalGeneration(config) - - # Some llava variants like microsoft/llava-med-v1.5-mistral-7b use safetensors to store weights - if file_exists(old_state_dict_id, "model_state_dict.bin"): - state_dict_path = hf_hub_download(old_state_dict_id, "model_state_dict.bin") - state_dict = torch.load(state_dict_path, map_location="cpu", weights_only=True) - else: - state_dict = load_original_state_dict(old_state_dict_id) - - state_dict = convert_state_dict_to_hf(state_dict) - model.load_state_dict(state_dict, strict=True, assign=True) - - pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data - mu = torch.mean(pre_expansion_embeddings, dim=0).float() - n = pre_expansion_embeddings.size()[0] - sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n - dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma) - - # We add an image token so we resize the model and pad to 64 for performance reasons - pad_shape = 64 - vocab_size = config.text_config.vocab_size - model.resize_token_embeddings(config.text_config.vocab_size + 2, pad_shape) - model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( - tuple( - (dist.sample() for _ in 
range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0])) - ), - dim=0, - ) - model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( - tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0]))), - dim=0, - ) - - model.push_to_hub(output_hub_path) - processor.push_to_hub(output_hub_path) - - -def main(): - parser = argparse.ArgumentParser( - epilog=EPILOG_TXT, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--text_model_id", - help="Hub location of the text model", - ) - parser.add_argument( - "--vision_model_id", - help="Hub location of the vision model", - ) - parser.add_argument( - "--output_hub_path", - help="Location on the hub of the converted model", - ) - parser.add_argument( - "--old_state_dict_id", - help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`", - ) - args = parser.parse_args() - convert_llava_llama_to_hf(args.text_model_id, args.vision_model_id, args.output_hub_path, args.old_state_dict_id) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py b/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py deleted file mode 100644 index 06edc5c9b1ad..000000000000 --- a/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py +++ /dev/null @@ -1,397 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert LLaVa-NeXT (LLaVa-1.6) checkpoints from the original repository. - -URL: https://github.com/haotian-liu/LLaVA/tree/main. - - -The command used to obtain original logits is the following: -python llava/eval/run_llava.py --model-path "liuhaotian/llava-v1.6-mistral-7b" --image-file "images/llava_v1_5_radar.jpg" --query "What is shown in this image?" --max_new_tokens 100 --temperature 0 - -Note: logits are tested with torch==2.1.2. 
-""" - -import argparse -import gc -import glob -import json -from pathlib import Path - -import requests -import torch -from accelerate import init_empty_weights -from huggingface_hub import hf_hub_download, snapshot_download -from PIL import Image -from safetensors import safe_open - -from transformers import ( - AddedToken, - AutoConfig, - AutoTokenizer, - LlavaNextConfig, - LlavaNextForConditionalGeneration, - LlavaNextImageProcessor, - LlavaNextProcessor, -) - - -KEYS_TO_MODIFY_MAPPING = { - "model.vision_tower.": "", - "model.mm_projector": "multi_modal_projector", - "model": "model.model", - "vision_model.model": "vision_model", - "lm_head": "language_model.lm_head", - "model.model": "language_model.model", - "multi_modal_projector.0": "multi_modal_projector.linear_1", - "multi_modal_projector.2": "multi_modal_projector.linear_2", - "language_model.model.image_newline": "image_newline", -} - - -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - return original_state_dict - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value.to(torch.float16) - return new_state_dict - - -def load_image(): - url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" - image = Image.open(requests.get(url, stream=True).raw) - return image - - -def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): - # load original config - filepath = hf_hub_download(repo_id=model_id, filename="config.json", repo_type="model") - # read json - with open(filepath) as f: - data = json.load(f) - print(data) - - if model_id == "liuhaotian/llava-v1.6-mistral-7b": - text_model_id = "mistralai/Mistral-7B-Instruct-v0.2" - image_token_index = 32000 - elif model_id == "liuhaotian/llava-v1.6-vicuna-7b": - text_model_id = "lmsys/vicuna-7b-v1.5" - image_token_index = 32000 - elif model_id == "liuhaotian/llava-v1.6-vicuna-13b": - text_model_id = "lmsys/vicuna-13b-v1.5" - image_token_index = 32000 - elif model_id == "liuhaotian/llava-v1.6-34b": - text_model_id = "NousResearch/Nous-Hermes-2-Yi-34B" - image_token_index = 64000 - elif model_id == "lmms-lab/llama3-llava-next-8b": - text_model_id = "meta-llama/Meta-Llama-3-8B-Instruct" - image_token_index = 128256 - elif model_id == "lmms-lab/llava-next-72b": - text_model_id = "Qwen/Qwen1.5-72B-Chat" - image_token_index = 151646 - elif model_id == "lmms-lab/llava-next-110b": - text_model_id = "Qwen/Qwen1.5-110B-Chat" - image_token_index = 151646 - - vision_model_id = data["mm_vision_tower"] - - torch.set_default_dtype(torch.float16) - text_config = AutoConfig.from_pretrained(text_model_id) - - use_fast = False if model_id == "liuhaotian/llava-v1.6-34b" else True - tokenizer = AutoTokenizer.from_pretrained(text_model_id, use_fast=use_fast) - tokenizer.add_tokens(AddedToken("", special=True, normalized=False), special_tokens=True) - - if model_id in ("liuhaotian/llava-v1.6-mistral-7b", "lmms-lab/llama3-llava-next-8b"): - # 
Mistral-7B doesn't have a padding token set yet - tokenizer.add_special_tokens({"pad_token": ""}) - - image_processor = LlavaNextImageProcessor.from_pretrained(vision_model_id) - processor = LlavaNextProcessor(tokenizer=tokenizer, image_processor=image_processor) - - config = LlavaNextConfig( - text_config=text_config.to_dict(), - image_grid_pinpoints=image_processor.image_grid_pinpoints, - use_image_newline_parameter=True, - image_token_index=image_token_index, - ) - - with init_empty_weights(): - model = LlavaNextForConditionalGeneration(config) - - # load original state dict - state_dict = load_original_state_dict(model_id) - state_dict = convert_state_dict_to_hf(state_dict) - model.load_state_dict(state_dict, assign=True) - model.eval() - - pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data - mu = torch.mean(pre_expansion_embeddings, dim=0).float() - n = pre_expansion_embeddings.size()[0] - sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n - dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma) - - # We add an image token so we resize the model - # Pad to 64 for performance reasons - # Qwen-based models have extra unused space in the vocab size already, so no need to resize - if model_id not in ["lmms-lab/llava-next-72b", "lmms-lab/llava-next-110b"]: - pad_shape = 64 - vocab_size = config.text_config.vocab_size - if model_id == "liuhaotian/llava-v1.6-34b": - # this one has 3 additional tokens, namely <|startoftext|>, <|endoftext|> and - num_tokens = vocab_size + 3 - else: - # this one has 2 additional tokens, namely and - num_tokens = vocab_size + 2 - model.resize_token_embeddings(num_tokens, pad_to_multiple_of=pad_shape) - model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( - tuple( - ( - dist.sample() - for _ in range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0]) - ) - ), - dim=0, - ) - model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( - tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0]))), - dim=0, - ) - - print(f"Saving model and processor for {model_id} to {pytorch_dump_folder_path}") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - # Make space so we can load the model properly now. - del state_dict - gc.collect() - - # Load everything back for inference tests in float32 because prev script was written as that - # Though it's mostly loaded in fp16 as original weights are in fp16 - model = LlavaNextForConditionalGeneration.from_pretrained(pytorch_dump_folder_path, device_map="auto") - processor = LlavaNextProcessor.from_pretrained(pytorch_dump_folder_path) - device = model.device - - # prepare inputs - image = load_image() - if model_id == "liuhaotian/llava-v1.6-mistral-7b": - prompt = "[INST] \nWhat is shown in this image? [/INST]" - elif model_id in ["liuhaotian/llava-v1.6-vicuna-7b", "liuhaotian/llava-v1.6-vicuna-13b"]: - prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? 
ASSISTANT:" - elif model_id == "liuhaotian/llava-v1.6-34b": - prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" - elif model_id == "lmms-lab/llama3-llava-next-8b": - prompt = "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|><|start_header_id|><|start_header_id|>user<|end_header_id|>\n\n\nWhat is shown in this image?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - elif model_id in ["lmms-lab/llava-next-72b", "lmms-lab/llava-next-110b"]: - prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n" - - inputs = processor(images=image, text=prompt, return_tensors="pt") - - # verify inputs - filepath = hf_hub_download(repo_id="nielsr/test-image", filename="llava_1_6_pixel_values.pt", repo_type="dataset") - original_pixel_values = torch.load(filepath, map_location="cpu") - assert torch.allclose(original_pixel_values, inputs.pixel_values.half()) - - if model_id == "liuhaotian/llava-v1.6-mistral-7b": - filepath = hf_hub_download(repo_id="nielsr/test-image", filename="llava_1_6_input_ids.pt", repo_type="dataset") - original_input_ids = torch.load(filepath, map_location="cpu") - # replace -200 by image_token_index (since we use token ID = 32000 for the image token) - original_input_ids[original_input_ids == -200] = image_token_index - assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist() - - elif model_id == "liuhaotian/llava-v1.6-34b": - filepath = hf_hub_download( - repo_id="nielsr/test-image", filename="llava_1_6_34b_input_ids.pt", repo_type="dataset" - ) - original_input_ids = torch.load(filepath, map_location="cpu") - # replace -200 by image_token_index - original_input_ids[original_input_ids == -200] = image_token_index - - assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist() - - image_sizes = torch.tensor([[899, 1024]]) - assert image_sizes[0].tolist() == inputs.image_sizes[0].tolist() - - # verify single forward pass - print("Single forward pass") - with torch.inference_mode(): - inputs = inputs.to(device) - outputs = model(**inputs) - print("Shape of logits:", outputs.logits.shape) - print("First values of logits:", outputs.logits[0, :3, :3]) - - if model_id == "liuhaotian/llava-v1.6-mistral-7b": - expected_slice = torch.tensor( - [[-4.8555, -4.6992, -0.1996], [-10.5703, -10.7344, -2.7246], [-7.0391, -7.3672, -0.2634]], - dtype=torch.float32, - device=device, - ) - elif model_id == "liuhaotian/llava-v1.6-vicuna-7b": - expected_slice = torch.tensor( - [[1.4883, 0.9976, -0.6992], [-9.7031, -5.7031, -1.5557], [-5.1328, -5.5586, 8.8281]], - dtype=torch.float32, - device=device, - ) - elif model_id == "liuhaotian/llava-v1.6-vicuna-13b": - expected_slice = torch.tensor( - [[-0.9614, 7.3125, 0.2106], [-7.2695, -8.5469, 3.6211], [-6.3750, -8.1875, 5.4688]], - dtype=torch.float32, - device=device, - ) - elif model_id == "liuhaotian/llava-v1.6-34b": - expected_slice = torch.tensor( - [[-9.0859, -9.1406, 5.9453], [-5.9570, -5.9766, 2.2754], [-5.7305, -5.7539, 4.0000]], - dtype=torch.float32, - device=device, - ) - elif model_id == "lmms-lab/llama3-llava-next-8b": - expected_slice = torch.tensor( - [[-3.9648, 1.1396, 3.3145], [-5.3594, -1.5654, -1.9619], [-12.3750, -10.6797, 
-9.3125]], - dtype=torch.float32, - device=device, - ) - elif model_id == "lmms-lab/llava-next-72b": - # Not yet checked against reference - expected_slice = torch.tensor( - [[3.7148, 3.9277, 3.4395], [-0.4341, 1.1387, 6.5117], [3.2324, 3.4688, 4.1133]], - dtype=torch.float32, - device=device, - ) - elif model_id == "lmms-lab/llava-next-110b": - # Not yet checked against reference - expected_slice = torch.tensor( - [[-2.5449, -1.6738, -2.0371], [1.0811, 3.4961, 5.0312], [1.7803, 2.5137, 2.4277]], - dtype=torch.float32, - device=device, - ) - else: - raise ValueError(f"Model {model_id} not supported") - - assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4) - print("Logits are ok!") - - # verify generation - output_ids = model.generate( - **inputs, - max_new_tokens=100, - use_cache=True, - ) - - generated_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip() - - print("Generated text:", repr(generated_text)) - - if model_id == "liuhaotian/llava-v1.6-mistral-7b": - expected_text = '[INST] \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot that displays data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular radar chart, there are several axes labeled with different metrics or benchmarks, such as "MMM-Vet," "MMM-Bench," "LLaVA-Bench," "SLED-Bench," "' - elif model_id == "liuhaotian/llava-v1.6-vicuna-7b": - expected_text = """A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human\'s questions. USER: \nWhat is shown in this image? ASSISTANT: The image appears to be a graphical representation of a benchmarking study comparing the performance of various models or systems. It\'s a scatter plot with a circular layout, where each point represents a different model or system, and the axes represent different metrics or dimensions of comparison.\n\nThe metrics are likely related to machine learning or artificial intelligence performance, as indicated by the terms like "BLIP-2," "Instruct BLIP," "POE," "QWA," "V""" - elif model_id == "liuhaotian/llava-v1.6-vicuna-13b": - expected_text = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? ASSISTANT: The image appears to be a radar chart, also known as a spider chart or star chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular radar chart, there are several variables represented:\n\n- MM-Vet\n- LLa-Va-Bench\n- SEED-Bench\n- MM" - elif model_id == "liuhaotian/llava-v1.6-34b": - expected_text = "<|im_start|> system\nAnswer the questions. <|im_start|> user\n\nWhat is shown in this image? 
<|im_start|> assistant\nThe image appears to be a radar chart, also known as a spider chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular chart, there are several datasets represented by different colors and labeled with various acronyms such as MM-Vet, LLaVA-Bench, SEED-Bench, MM-Bench-CN, MM-" - elif model_id == "lmms-lab/llama3-llava-next-8b": - expected_text = 'system\n\nYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.user\n\n\nWhat is shown in this image?assistant\n\n\nThe image shows a radar chart, also known as a spider chart or a web chart, which is a type of graph used to display multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. Each axis represents a different variable, and the values are plotted along each axis and connected to form a polygon.\n\nIn this particular radar chart, there are several axes labeled with different variables, such as "MM-Vet," "LL' - elif model_id == "lmms-lab/llava-next-72b": - expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image displays a radar chart, also known as a spider chart or a star chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. Each axis represents a different variable, and the value of each variable is represented by the distance from the center of the chart to the point where the axis intersects with the line representing that variable's value.\n\nIn this particular chart, there are several axes" - elif model_id == "lmms-lab/llava-next-110b": - expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image shows a radar chart comparing the performance of different models on various visual question answering (VQA) benchmarks. Each colored line represents a different model, and the distance from the center of the chart indicates the score or performance level of the model on a particular benchmark. The benchmarks are labeled around the edges of the chart, and include VQA v2, GQA, VizWiz, TextVQA, MMBench-CN, MME, and others. 
The chart allows for a" - else: - raise ValueError(f"Model {model_id} not supported") - - assert generated_text == expected_text - print("Generated text is ok!") - - # verify batched generation - print("Batched generation...") - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - cats_image = Image.open(requests.get(url, stream=True).raw) - - inputs = processor( - images=[image, cats_image], - text=[prompt, prompt], - padding=True, - return_tensors="pt", - ).to(device) - - for k, v in inputs.items(): - print(k, v.shape) - - print("Image sizes:", inputs.image_sizes) - - # make sure image_sizes are the same - # as otherwise batched generation doesn't work - inputs.image_sizes[1] = inputs.image_sizes[0] - - print("Batched generation...") - output_ids = model.generate( - **inputs, - max_new_tokens=20, - use_cache=True, - ) - - outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - print(outputs) - - if push_to_hub: - checkpoint_name = model_id.split("/")[-1] - print(f"Pushing to repo llava-hf/{checkpoint_name}-hf") - model.push_to_hub(f"llava-hf/{checkpoint_name}-hf") - processor.push_to_hub(f"llava-hf/{checkpoint_name}-hf") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_id", - help="Hub location of the model to convert", - default="liuhaotian/llava-v1.6-mistral-7b", - choices=[ - "liuhaotian/llava-v1.6-mistral-7b", - "liuhaotian/llava-v1.6-vicuna-7b", - "liuhaotian/llava-v1.6-vicuna-13b", - "liuhaotian/llava-v1.6-34b", - "lmms-lab/llama3-llava-next-8b", - "lmms-lab/llava-next-72b", - "lmms-lab/llava-next-110b", - ], - required=False, - ) - parser.add_argument( - "--pytorch_dump_folder_path", type=str, required=True, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - args = parser.parse_args() - - convert_llava_to_hf(args.model_id, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/llava_next_video/convert_llava_next_video_weights_to_hf.py b/src/transformers/models/llava_next_video/convert_llava_next_video_weights_to_hf.py deleted file mode 100644 index aae44eee97a0..000000000000 --- a/src/transformers/models/llava_next_video/convert_llava_next_video_weights_to_hf.py +++ /dev/null @@ -1,276 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert LLaVa-NeXT-Video checkpoints from the original repository. 
- -URL: https://github.com/LLaVA-VL/LLaVA-NeXT/tree/inference -""" - -import argparse -import glob -import json -from pathlib import Path - -import torch -from accelerate import init_empty_weights -from huggingface_hub import hf_hub_download, snapshot_download -from safetensors import safe_open - -from transformers import ( - AddedToken, - AutoConfig, - AutoTokenizer, - LlavaNextImageProcessor, - LlavaNextVideoConfig, - LlavaNextVideoForConditionalGeneration, - LlavaNextVideoImageProcessor, - LlavaNextVideoProcessor, -) - - -KEYS_TO_MODIFY_MAPPING = { - "model.vision_tower.": "", - ".vision_resampler": "", # all lmms-lab models do avg pooling, so no vision_resampler - "model.mm_projector": "multi_modal_projector", - "model": "model.model", - "vision_model.model": "vision_model", - "lm_head": "language_model.lm_head", - "model.model": "language_model.model", - "multi_modal_projector.0": "multi_modal_projector.linear_1", - "multi_modal_projector.2": "multi_modal_projector.linear_2", - "language_model.model.image_newline": "image_newline", -} - -# {{SYSTEM_PROMPT}} USER: \n{{PROMPT}} ASSISTANT:" assistant end with " " -chat_vicuna = ( - "{% for message in messages %}" - "{% if message['role'] == 'system' %}" - "{{ message['content'][0]['text'] }}" - "{% else %}" - "{{ message['role'].upper() + ': '}}" - "{% endif %}" - "{# Render all images first #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}" - "{{ '\n' }}" - "{% endfor %}" - "{# Render all text next #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}" - "{{ content['text'] + ' '}}" - "{% endfor %}" - "{% endfor %}" - "{% if add_generation_prompt %}" - "{{ 'ASSISTANT:' }}" - "{% endif %}" -) - -# "[INST] \nWhat is shown in this image? 
[/INST]" assistant end with " " -chat_mistral = ( - "{% for message in messages %}" - "{% if message['role'] == 'user' %}" - "{{ '[INST] ' }}" - "{# Render all images first #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}" - "{{ '\n' }}" - "{% endfor %}" - "{# Render all text next #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}" - "{{ content['text'] }}" - "{% endfor %}" - "{{' [/INST]' }}" - "{% elif message['role'] == 'assistant' %}" - r"{{ ' ' + message['content'][0]['text'] + '<\s> '}}" - "{% else %}" - "{{ raise_exception('Only user and assistant roles are supported!') }}" - "{% endif %}" - "{% endfor %}" -) - -# "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" -chat_yi = ( - "{% for message in messages %}" - "{{'<|im_start|>' + message['role'] + '\n'}}" - "{# Render all images first #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}" - "{{ '\n' }}" - "{% endfor %}" - "{# Render all text next #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}" - "{{ content['text'] }}" - "{% endfor %}" - "{{'<|im_end|>' + '\n'}}" - "{% endfor %}" - "{% if add_generation_prompt %}" - "{{ '<|im_start|>assistant\n' }}" - "{% endif %}" -) - -model2template = { - "lmms-lab/LLaVA-NeXT-Video-7B-32K": chat_mistral, - "lmms-lab/LLaVA-NeXT-Video-7B": chat_vicuna, - "lmms-lab/LLaVA-NeXT-Video-7B-DPO": chat_vicuna, - "lmms-lab/LLaVA-NeXT-Video-34B": chat_yi, - "lmms-lab/LLaVA-NeXT-Video-34B-DPO": chat_yi, -} - - -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - return original_state_dict - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value.to(torch.bfloat16) - return new_state_dict - - -def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): - # load original config - filepath = hf_hub_download(repo_id=model_id, filename="config.json", repo_type="model") - with open(filepath) as f: - data = json.load(f) - print(data) - - if model_id == "lmms-lab/LLaVA-NeXT-Video-7B-32K": - text_model_id = "mistralai/Mistral-7B-Instruct-v0.2" - video_token_index = 32000 - image_token_index = 32001 - overwrite_text_config = {} - elif model_id in ["lmms-lab/LLaVA-NeXT-Video-7B", "lmms-lab/LLaVA-NeXT-Video-7B-DPO"]: - text_model_id = "lmsys/vicuna-7b-v1.5" - video_token_index = 32000 - image_token_index = 32001 - overwrite_text_config = {"factor": 2.0, "type": "linear"} - elif model_id in ["lmms-lab/LLaVA-NeXT-Video-34B", "lmms-lab/LLaVA-NeXT-Video-34B-DPO"]: - text_model_id = "NousResearch/Nous-Hermes-2-Yi-34B" - video_token_index = 64000 - image_token_index = 64001 - overwrite_text_config = {} - else: - raise ValueError("Incorrect checkpoint referenced. 
Text model-id not identified!") - - vision_model_id = data["mm_vision_tower"] - - torch.set_default_dtype(torch.bfloat16) - text_config = AutoConfig.from_pretrained(text_model_id) - text_config = text_config.to_dict() - text_config.update(overwrite_text_config) - - tokenizer = AutoTokenizer.from_pretrained(text_model_id, use_fast=True, padding_side="left") - tokenizer.add_tokens(AddedToken("