From e39c9f7a78fa2960a7045e8fc5a2d96b5d7eebf1 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Fri, 10 Jan 2025 10:12:04 +0100
Subject: [PATCH] v4.48-release

---
 examples/flax/question-answering/run_qa.py | 2 +-
 .../run_flax_speech_recognition_seq2seq.py | 2 +-
 .../flax/text-classification/run_flax_glue.py | 2 +-
 .../flax/token-classification/run_flax_ner.py | 2 +-
 .../run_audio_classification.py | 2 +-
 .../contrastive-image-text/run_clip.py | 2 +-
 .../run_image_classification.py | 2 +-
 .../run_image_classification_no_trainer.py | 2 +-
 examples/pytorch/image-pretraining/run_mae.py | 2 +-
 examples/pytorch/image-pretraining/run_mim.py | 2 +-
 .../image-pretraining/run_mim_no_trainer.py | 2 +-
 .../run_instance_segmentation.py | 2 +-
 .../run_instance_segmentation_no_trainer.py | 2 +-
 examples/pytorch/language-modeling/run_clm.py | 2 +-
 .../language-modeling/run_clm_no_trainer.py | 2 +-
 examples/pytorch/language-modeling/run_fim.py | 2 +-
 .../language-modeling/run_fim_no_trainer.py | 2 +-
 examples/pytorch/language-modeling/run_mlm.py | 2 +-
 .../language-modeling/run_mlm_no_trainer.py | 2 +-
 examples/pytorch/language-modeling/run_plm.py | 2 +-
 examples/pytorch/multiple-choice/run_swag.py | 2 +-
 .../multiple-choice/run_swag_no_trainer.py | 2 +-
 .../object-detection/run_object_detection.py | 2 +-
 .../run_object_detection_no_trainer.py | 2 +-
 examples/pytorch/question-answering/run_qa.py | 2 +-
 .../question-answering/run_qa_beam_search.py | 2 +-
 .../run_qa_beam_search_no_trainer.py | 2 +-
 .../question-answering/run_qa_no_trainer.py | 2 +-
 .../question-answering/run_seq2seq_qa.py | 2 +-
 .../run_semantic_segmentation.py | 2 +-
 .../run_semantic_segmentation_no_trainer.py | 2 +-
 .../run_speech_recognition_ctc.py | 2 +-
 .../run_speech_recognition_ctc_adapter.py | 2 +-
 .../run_speech_recognition_seq2seq.py | 2 +-
 .../summarization/run_summarization.py | 2 +-
 .../run_summarization_no_trainer.py | 2 +-
 .../text-classification/run_classification.py | 2 +-
 .../pytorch/text-classification/run_glue.py | 2 +-
 .../run_glue_no_trainer.py | 2 +-
 .../pytorch/text-classification/run_xnli.py | 2 +-
 .../pytorch/token-classification/run_ner.py | 2 +-
 .../run_ner_no_trainer.py | 2 +-
 .../pytorch/translation/run_translation.py | 2 +-
 .../translation/run_translation_no_trainer.py | 2 +-
 .../contrastive-image-text/run_clip.py | 2 +-
 .../run_image_classification.py | 2 +-
 .../tensorflow/multiple-choice/run_swag.py | 2 +-
 .../tensorflow/question-answering/run_qa.py | 2 +-
 .../summarization/run_summarization.py | 2 +-
 .../text-classification/run_glue.py | 2 +-
 .../tensorflow/translation/run_translation.py | 2 +-
 setup.py | 2 +-
 src/transformers/__init__.py | 2 +-
 ...lbert_original_tf_checkpoint_to_pytorch.py | 62 -
 .../models/align/convert_align_tf_to_hf.py | 389 -----
 .../models/aria/convert_aria_weights_to_hf.py | 162 --
 ...trogram_transformer_original_to_pytorch.py | 279 ----
 .../bamba/convert_mamba_ssm_checkpoint.py | 273 ----
 .../models/bark/convert_suno_to_hf.py | 263 ----
 ..._original_pytorch_checkpoint_to_pytorch.py | 156 --
 .../beit/convert_beit_unilm_to_pytorch.py | 373 -----
 ...bert_original_tf2_checkpoint_to_pytorch.py | 246 ---
 ..._bert_original_tf_checkpoint_to_pytorch.py | 62 -
 ..._bert_pytorch_checkpoint_to_original_tf.py | 112 --
 ...ping_original_tf2_checkpoint_to_pytorch.py | 188 ---
 ...gbird_original_tf_checkpoint_to_pytorch.py | 69 -
 .../convert_bigbird_pegasus_tf_to_pytorch.py | 170 ---
 ..._original_pytorch_checkpoint_to_pytorch.py | 292 ----
 .../models/bit/convert_bit_to_pytorch.py | 177 ---
..._original_pytorch_checkpoint_to_pytorch.py | 114 -- .../convert_blip_original_pytorch_to_hf.py | 191 --- .../convert_blip_2_original_to_pytorch.py | 390 ----- ...rt_bloom_original_checkpoint_to_pytorch.py | 254 ---- .../models/bros/convert_bros_to_pytorch.py | 145 -- ..._byt5_original_tf_checkpoint_to_pytorch.py | 59 - ...anine_original_tf_checkpoint_to_pytorch.py | 65 - .../convert_chameleon_weights_to_hf.py | 476 ------ ...ert_chinese_clip_original_pytorch_to_hf.py | 134 -- .../convert_clap_original_pytorch_to_hf.py | 133 -- .../convert_clip_original_pytorch_to_hf.py | 156 -- .../convert_clipseg_original_pytorch_to_hf.py | 264 ---- .../models/clvp/convert_clvp_to_hf.py | 234 --- .../colpali/convert_colpali_weights_to_hf.py | 214 --- ..._original_pytorch_checkpoint_to_pytorch.py | 324 ---- ...ginal_tf1_checkpoint_to_pytorch_and_tf2.py | 57 - .../convnext/convert_convnext_to_pytorch.py | 242 --- .../convert_convnextv2_to_pytorch.py | 286 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 362 ----- .../models/dac/convert_dac_checkpoint.py | 261 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 285 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 207 --- ..._original_pytorch_checkpoint_to_pytorch.py | 374 ----- .../convert_deformable_detr_to_pytorch.py | 236 --- .../deit/convert_deit_timm_to_pytorch.py | 218 --- ...original_gluonnlp_checkpoint_to_pytorch.py | 318 ---- .../deta/convert_deta_resnet_to_pytorch.py | 319 ---- .../deta/convert_deta_swin_to_pytorch.py | 326 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 252 ---- ...convert_gptsan_tf_checkpoint_to_pytorch.py | 181 --- .../deprecated/jukebox/convert_jukebox.py | 279 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 292 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 70 - ...fo_xl_original_tf_checkpoint_to_pytorch.py | 121 -- .../deprecated/van/convert_van_to_pytorch.py | 290 ---- .../convert_vit_hybrid_timm_to_pytorch.py | 282 ---- .../convert_depth_anything_to_hf.py | 368 ----- ..._original_pytorch_checkpoint_to_pytorch.py | 277 ---- .../models/detr/convert_detr_to_pytorch.py | 385 ----- ..._original_pytorch_checkpoint_to_pytorch.py | 46 - .../models/dinov2/convert_dinov2_to_hf.py | 285 ---- .../convert_dinov2_with_registers_to_hf.py | 291 ---- .../dit/convert_dit_unilm_to_pytorch.py | 230 --- .../models/donut/convert_donut_to_pytorch.py | 234 --- ...vert_dpr_original_checkpoint_to_pytorch.py | 143 -- .../models/dpt/convert_dinov2_depth_to_hf.py | 383 ----- .../models/dpt/convert_dpt_beit_to_hf.py | 305 ---- .../dpt/convert_dpt_hybrid_to_pytorch.py | 315 ---- .../models/dpt/convert_dpt_swinv2_to_hf.py | 321 ---- .../models/dpt/convert_dpt_to_pytorch.py | 285 ---- .../convert_efficientnet_to_pytorch.py | 339 ----- ...ectra_original_tf_checkpoint_to_pytorch.py | 79 - .../convert_encodec_checkpoint_to_pytorch.py | 365 ----- src/transformers/models/esm/convert_esm.py | 399 ----- .../falcon/convert_custom_code_checkpoint.py | 74 - ..._original_pytorch_checkpoint_to_pytorch.py | 210 --- .../fastspeech2_conformer/convert_hifigan.py | 134 -- .../convert_model_with_hifigan.py | 102 -- .../flava/convert_dalle_to_flava_codebook.py | 102 -- .../convert_flava_original_pytorch_to_hf.py | 99 -- ...net_original_flax_checkpoint_to_pytorch.py | 156 -- .../focalnet/convert_focalnet_to_hf_format.py | 237 --- ..._original_pytorch_checkpoint_to_pytorch.py | 280 ---- ...unnel_original_tf_checkpoint_to_pytorch.py | 67 - .../fuyu/convert_fuyu_model_weights_to_hf.py | 134 -- .../gemma/convert_gemma_weights_to_hf.py | 206 --- 
.../gemma2/convert_gemma2_weights_to_hf.py | 239 --- .../models/git/convert_git_to_pytorch.py | 448 ------ .../models/glm/convert_glm_weights_to_hf.py | 195 --- .../models/glpn/convert_glpn_to_pytorch.py | 218 --- ..._gpt2_original_tf_checkpoint_to_pytorch.py | 68 - .../convert_gpt_neo_mesh_tf_to_pytorch.py | 71 - .../gpt_sw3/convert_megatron_to_pytorch.py | 197 --- .../convert_grounding_dino_to_hf.py | 491 ------ .../groupvit/convert_groupvit_nvlab_to_hf.py | 217 --- .../models/hiera/convert_hiera_to_hf.py | 369 ----- ...rt_original_s3prl_checkpoint_to_pytorch.py | 222 --- ..._original_pytorch_checkpoint_to_pytorch.py | 261 ---- ...rt_original_s3prl_checkpoint_to_pytorch.py | 68 - .../convert_idefics2_weights_to_hf.py | 185 --- .../convert_idefics3_weights_to_hf.py | 214 --- .../models/ijepa/convert_ijepa_to_hf.py | 267 ---- ...onvert_imagegpt_original_tf2_to_pytorch.py | 71 - ...onvert_instructblip_original_to_pytorch.py | 303 ---- ...t_instructblipvideo_original_to_pytorch.py | 305 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 77 - .../levit/convert_levit_timm_to_pytorch.py | 180 --- .../llama/convert_llama_weights_to_hf.py | 601 -------- .../llava/convert_llava_weights_to_hf.py | 204 --- .../convert_llava_next_weights_to_hf.py | 397 ----- .../convert_llava_next_video_weights_to_hf.py | 276 ---- .../convert_llava_onevision_weights_to_hf.py | 388 ----- ...r_original_pytorch_lightning_to_pytorch.py | 85 -- .../convert_longt5x_checkpoint_to_flax.py | 215 --- ..._original_pytorch_checkpoint_to_pytorch.py | 170 --- ...xmert_original_tf_checkpoint_to_pytorch.py | 59 - ...t_m2m100_original_checkpoint_to_pytorch.py | 85 -- ...convert_mamba_ssm_checkpoint_to_pytorch.py | 153 -- ...onvert_mamba2_ssm_checkpoint_to_pytorch.py | 193 --- .../convert_marian_tatoeba_to_pytorch.py | 1327 ----------------- .../marian/convert_marian_to_pytorch.py | 717 --------- ..._original_pytorch_checkpoint_to_pytorch.py | 1019 ------------- ..._original_pytorch_checkpoint_to_pytorch.py | 731 --------- .../convert_maskformer_resnet_to_pytorch.py | 390 ----- .../convert_maskformer_swin_to_pytorch.py | 333 ----- ...rt_mbart_original_checkpoint_to_pytorch.py | 83 -- .../convert_megatron_bert_checkpoint.py | 334 ----- .../convert_megatron_gpt2_checkpoint.py | 358 ----- .../convert_mimi_checkpoint_to_pytorch.py | 198 --- .../mistral/convert_mistral_weights_to_hf.py | 276 ---- .../mixtral/convert_mixtral_weights_to_hf.py | 244 --- .../mllama/convert_mllama_weights_to_hf.py | 639 -------- ..._original_pytorch_checkpoint_to_pytorch.py | 229 --- ...ebert_original_tf_checkpoint_to_pytorch.py | 58 - ...nvert_original_tf_checkpoint_to_pytorch.py | 141 -- ...nvert_original_tf_checkpoint_to_pytorch.py | 177 --- .../mobilevit/convert_mlcvnets_to_pytorch.py | 311 ---- .../convert_mlcvnets_to_pytorch.py | 330 ---- .../moshi/convert_moshi_transformers.py | 311 ---- .../mra/convert_mra_pytorch_to_pytorch.py | 110 -- .../musicgen/convert_musicgen_transformers.py | 236 --- .../convert_musicgen_melody_transformers.py | 267 ---- ..._myt5_original_tf_checkpoint_to_pytorch.py | 60 - .../nemotron/convert_nemotron_nemo_to_hf.py | 346 ----- ..._sharded_original_checkpoint_to_pytorch.py | 160 -- .../models/nougat/convert_nougat_to_hf.py | 282 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 111 -- .../models/olmo/convert_olmo_weights_to_hf.py | 248 --- .../olmo2/convert_olmo2_weights_to_hf.py | 304 ---- .../olmoe/convert_olmoe_weights_to_hf.py | 281 ---- .../omdet_turbo/convert_omdet_turbo_to_hf.py | 349 ----- 
.../oneformer/convert_to_hf_oneformer.py | 1191 --------------- ...penai_original_tf_checkpoint_to_pytorch.py | 74 - ..._original_pytorch_checkpoint_to_pytorch.py | 113 -- .../models/owlv2/convert_owlv2_to_hf.py | 422 ------ .../convert_owlvit_original_flax_to_hf.py | 406 ----- .../convert_paligemma2_weights_to_hf.py | 415 ------ .../convert_paligemma_weights_to_hf.py | 347 ----- .../pegasus/convert_pegasus_tf_to_pytorch.py | 131 -- .../convert_perceiver_haiku_to_pytorch.py | 468 ------ .../convert_persimmon_weights_to_hf.py | 129 -- .../models/phi/convert_phi_weights_to_hf.py | 207 --- ...nvert_pix2struct_original_pytorch_to_hf.py | 155 -- .../pixtral/convert_pixtral_weights_to_hf.py | 319 ---- ...ert_plbart_original_checkpoint_to_torch.py | 94 -- .../convert_poolformer_original_to_pytorch.py | 214 --- .../convert_pop2piano_weights_to_hf.py | 190 --- ..._original_pytorch_checkpoint_to_pytorch.py | 159 -- .../models/pvt/convert_pvt_to_pytorch.py | 226 --- .../pvt_v2/convert_pvt_v2_to_pytorch.py | 295 ---- .../convert_recurrent_gemma_to_hf.py | 222 --- ...ert_reformer_trax_checkpoint_to_pytorch.py | 226 --- .../convert_regnet_seer_10b_to_pytorch.py | 304 ---- .../regnet/convert_regnet_to_pytorch.py | 458 ------ ...onvert_rembert_tf_checkpoint_to_pytorch.py | 62 - .../resnet/convert_resnet_to_pytorch.py | 199 --- ..._original_pytorch_checkpoint_to_pytorch.py | 177 --- ..._original_pytorch_checkpoint_to_pytorch.py | 77 - ...ormer_original_tf_checkpoint_to_pytorch.py | 62 - ..._detr_original_pytorch_checkpoint_to_hf.py | 782 ---------- .../rwkv/convert_rwkv_checkpoint_to_hf.py | 209 --- .../models/sam/convert_sam_to_hf.py | 251 ---- .../seamless_m4t/convert_fairseq2_to_hf.py | 396 ----- .../seamless_m4t_v2/convert_fairseq2_to_hf.py | 404 ----- .../convert_segformer_original_to_pytorch.py | 387 ----- .../models/seggpt/convert_seggpt_to_hf.py | 221 --- ..._original_pytorch_checkpoint_to_pytorch.py | 305 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 317 ---- .../models/siglip/convert_siglip_to_hf.py | 412 ----- ...rt_wav2vec2_seq2seq_original_to_pytorch.py | 357 ----- ...xt_wav2vec2_seq2seq_original_to_pytorch.py | 316 ---- .../convert_s2t_fairseq_to_tfms.py | 121 -- .../models/speecht5/convert_hifigan.py | 108 -- ..._original_pytorch_checkpoint_to_pytorch.py | 401 ----- .../convert_superpoint_to_pytorch.py | 175 --- .../convert_swiftformer_original_to_hf.py | 175 --- .../swin/convert_swin_simmim_to_pytorch.py | 182 --- .../swin/convert_swin_timm_to_pytorch.py | 173 --- .../convert_swin2sr_original_to_pytorch.py | 278 ---- .../swinv2/convert_swinv2_timm_to_pytorch.py | 219 --- .../switch_transformers/convert_big_switch.py | 193 --- ...ers_original_flax_checkpoint_to_pytorch.py | 203 --- ...rt_t5_original_tf_checkpoint_to_pytorch.py | 59 - .../t5/convert_t5x_checkpoint_to_flax.py | 235 --- .../t5/convert_t5x_checkpoint_to_pytorch.py | 238 --- .../convert_table_transformer_to_hf.py | 317 ---- ...convert_table_transformer_to_hf_no_timm.py | 434 ------ ...tapas_original_tf_checkpoint_to_pytorch.py | 137 -- .../models/textnet/convert_textnet_to_hf.py | 208 --- .../convert_timesformer_to_pytorch.py | 253 ---- .../trocr/convert_trocr_unilm_to_pytorch.py | 237 --- .../models/udop/convert_udop_to_hf.py | 224 --- .../convert_umt5_checkpoint_to_pytorch.py | 274 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 273 ---- ...ch_original_s3prl_checkpoint_to_pytorch.py | 109 -- ..._original_pytorch_checkpoint_to_pytorch.py | 224 --- .../models/univnet/convert_univnet.py | 162 -- 
.../convert_convnext_upernet_to_pytorch.py | 214 --- .../convert_swin_upernet_to_pytorch.py | 297 ---- .../convert_video_llava_weights_to_hf.py | 159 -- .../videomae/convert_videomae_to_pytorch.py | 324 ---- .../vilt/convert_vilt_original_to_pytorch.py | 299 ---- .../convert_vipllava_weights_to_hf.py | 132 -- ..._original_pytorch_checkpoint_to_pytorch.py | 149 -- .../models/vit/convert_dino_to_pytorch.py | 218 --- .../models/vit/convert_vit_timm_to_pytorch.py | 254 ---- .../vit_mae/convert_vit_mae_to_pytorch.py | 178 --- .../models/vit_msn/convert_msn_to_pytorch.py | 245 --- .../models/vitmatte/convert_vitmatte_to_hf.py | 170 --- .../models/vitpose/convert_vitpose_to_hf.py | 355 ----- .../vits/convert_original_checkpoint.py | 390 ----- .../vivit/convert_vivit_flax_to_pytorch.py | 231 --- ..._original_pytorch_checkpoint_to_pytorch.py | 385 ----- ...c2_original_s3prl_checkpoint_to_pytorch.py | 109 -- .../convert_wav2vec2_seamless_checkpoint.py | 217 --- ..._original_pytorch_checkpoint_to_pytorch.py | 309 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 206 --- ...lm_original_s3prl_checkpoint_to_pytorch.py | 109 -- .../models/whisper/convert_openai_to_hf.py | 370 ----- .../convert_x_clip_original_pytorch_to_hf.py | 386 ----- .../convert_xglm_original_ckpt_to_trfms.py | 68 - ..._original_pytorch_checkpoint_to_pytorch.py | 77 - ..._original_pytorch_checkpoint_to_pytorch.py | 183 --- ...xlnet_original_tf_checkpoint_to_pytorch.py | 113 -- ..._original_pytorch_checkpoint_to_pytorch.py | 212 --- .../models/yolos/convert_yolos_to_pytorch.py | 267 ---- .../yoso/convert_yoso_pytorch_to_pytorch.py | 108 -- .../models/zoedepth/convert_zoedepth_to_hf.py | 426 ------ 297 files changed, 53 insertions(+), 61511 deletions(-) delete mode 100644 src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/align/convert_align_tf_to_hf.py delete mode 100644 src/transformers/models/aria/convert_aria_weights_to_hf.py delete mode 100644 src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py delete mode 100644 src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py delete mode 100644 src/transformers/models/bark/convert_suno_to_hf.py delete mode 100644 src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/beit/convert_beit_unilm_to_pytorch.py delete mode 100644 src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py delete mode 100644 src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py delete mode 100755 src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/bit/convert_bit_to_pytorch.py delete mode 100644 src/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py delete 
mode 100644 src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/bros/convert_bros_to_pytorch.py delete mode 100755 src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py delete mode 100644 src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/clvp/convert_clvp_to_hf.py delete mode 100644 src/transformers/models/colpali/convert_colpali_weights_to_hf.py delete mode 100644 src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py delete mode 100644 src/transformers/models/convnext/convert_convnext_to_pytorch.py delete mode 100644 src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py delete mode 100644 src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/dac/convert_dac_checkpoint.py delete mode 100644 src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py delete mode 100644 src/transformers/models/deit/convert_deit_timm_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/jukebox/convert_jukebox.py delete mode 100644 src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/van/convert_van_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py delete mode 100644 src/transformers/models/depth_anything/convert_depth_anything_to_hf.py delete mode 100644 src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 
src/transformers/models/detr/convert_detr_to_pytorch.py delete mode 100644 src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/dinov2/convert_dinov2_to_hf.py delete mode 100644 src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py delete mode 100644 src/transformers/models/dit/convert_dit_unilm_to_pytorch.py delete mode 100644 src/transformers/models/donut/convert_donut_to_pytorch.py delete mode 100644 src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/dpt/convert_dinov2_depth_to_hf.py delete mode 100644 src/transformers/models/dpt/convert_dpt_beit_to_hf.py delete mode 100644 src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py delete mode 100644 src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py delete mode 100644 src/transformers/models/dpt/convert_dpt_to_pytorch.py delete mode 100644 src/transformers/models/efficientnet/convert_efficientnet_to_pytorch.py delete mode 100644 src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/encodec/convert_encodec_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/esm/convert_esm.py delete mode 100644 src/transformers/models/falcon/convert_custom_code_checkpoint.py delete mode 100644 src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/fastspeech2_conformer/convert_hifigan.py delete mode 100644 src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py delete mode 100644 src/transformers/models/flava/convert_dalle_to_flava_codebook.py delete mode 100644 src/transformers/models/flava/convert_flava_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/focalnet/convert_focalnet_to_hf_format.py delete mode 100755 src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/fuyu/convert_fuyu_model_weights_to_hf.py delete mode 100644 src/transformers/models/gemma/convert_gemma_weights_to_hf.py delete mode 100644 src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py delete mode 100644 src/transformers/models/git/convert_git_to_pytorch.py delete mode 100644 src/transformers/models/glm/convert_glm_weights_to_hf.py delete mode 100644 src/transformers/models/glpn/convert_glpn_to_pytorch.py delete mode 100755 src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py delete mode 100644 src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py delete mode 100644 src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py delete mode 100644 src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py delete mode 100644 src/transformers/models/hiera/convert_hiera_to_hf.py delete mode 100644 src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 
src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py delete mode 100644 src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py delete mode 100644 src/transformers/models/ijepa/convert_ijepa_to_hf.py delete mode 100644 src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py delete mode 100644 src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py delete mode 100644 src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py delete mode 100644 src/transformers/models/kosmos2/convert_kosmos2_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/levit/convert_levit_timm_to_pytorch.py delete mode 100644 src/transformers/models/llama/convert_llama_weights_to_hf.py delete mode 100644 src/transformers/models/llava/convert_llava_weights_to_hf.py delete mode 100644 src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py delete mode 100644 src/transformers/models/llava_next_video/convert_llava_next_video_weights_to_hf.py delete mode 100644 src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py delete mode 100644 src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py delete mode 100644 src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py delete mode 100644 src/transformers/models/luke/convert_luke_original_pytorch_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/lxmert/convert_lxmert_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/m2m_100/convert_m2m100_original_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mamba/convert_mamba_ssm_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mamba2/convert_mamba2_ssm_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py delete mode 100644 src/transformers/models/marian/convert_marian_to_pytorch.py delete mode 100644 src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py delete mode 100644 src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py delete mode 100644 src/transformers/models/mbart/convert_mbart_original_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py delete mode 100644 src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py delete mode 100644 src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mistral/convert_mistral_weights_to_hf.py delete mode 100644 src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py delete mode 100644 src/transformers/models/mllama/convert_mllama_weights_to_hf.py delete mode 100644 src/transformers/models/mluke/convert_mluke_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mobilebert/convert_mobilebert_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mobilenet_v1/convert_original_tf_checkpoint_to_pytorch.py delete mode 100644 
src/transformers/models/mobilenet_v2/convert_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py delete mode 100644 src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py delete mode 100644 src/transformers/models/moshi/convert_moshi_transformers.py delete mode 100644 src/transformers/models/mra/convert_mra_pytorch_to_pytorch.py delete mode 100644 src/transformers/models/musicgen/convert_musicgen_transformers.py delete mode 100644 src/transformers/models/musicgen_melody/convert_musicgen_melody_transformers.py delete mode 100644 src/transformers/models/myt5/convert_myt5_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/nemotron/convert_nemotron_nemo_to_hf.py delete mode 100644 src/transformers/models/nllb_moe/convert_nllb_moe_sharded_original_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/nougat/convert_nougat_to_hf.py delete mode 100644 src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/olmo/convert_olmo_weights_to_hf.py delete mode 100644 src/transformers/models/olmo2/convert_olmo2_weights_to_hf.py delete mode 100644 src/transformers/models/olmoe/convert_olmoe_weights_to_hf.py delete mode 100644 src/transformers/models/omdet_turbo/convert_omdet_turbo_to_hf.py delete mode 100644 src/transformers/models/oneformer/convert_to_hf_oneformer.py delete mode 100755 src/transformers/models/openai/convert_openai_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/opt/convert_opt_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/owlv2/convert_owlv2_to_hf.py delete mode 100644 src/transformers/models/owlvit/convert_owlvit_original_flax_to_hf.py delete mode 100644 src/transformers/models/paligemma/convert_paligemma2_weights_to_hf.py delete mode 100644 src/transformers/models/paligemma/convert_paligemma_weights_to_hf.py delete mode 100644 src/transformers/models/pegasus/convert_pegasus_tf_to_pytorch.py delete mode 100644 src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py delete mode 100644 src/transformers/models/persimmon/convert_persimmon_weights_to_hf.py delete mode 100644 src/transformers/models/phi/convert_phi_weights_to_hf.py delete mode 100644 src/transformers/models/pix2struct/convert_pix2struct_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py delete mode 100644 src/transformers/models/plbart/convert_plbart_original_checkpoint_to_torch.py delete mode 100644 src/transformers/models/poolformer/convert_poolformer_original_to_pytorch.py delete mode 100644 src/transformers/models/pop2piano/convert_pop2piano_weights_to_hf.py delete mode 100644 src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/pvt/convert_pvt_to_pytorch.py delete mode 100644 src/transformers/models/pvt_v2/convert_pvt_v2_to_pytorch.py delete mode 100644 src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py delete mode 100755 src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py delete mode 100644 src/transformers/models/regnet/convert_regnet_to_pytorch.py delete mode 100755 src/transformers/models/rembert/convert_rembert_tf_checkpoint_to_pytorch.py delete 
mode 100644 src/transformers/models/resnet/convert_resnet_to_pytorch.py delete mode 100644 src/transformers/models/roberta/convert_roberta_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/roberta_prelayernorm/convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/roformer/convert_roformer_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/rt_detr/convert_rt_detr_original_pytorch_checkpoint_to_hf.py delete mode 100644 src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py delete mode 100644 src/transformers/models/sam/convert_sam_to_hf.py delete mode 100644 src/transformers/models/seamless_m4t/convert_fairseq2_to_hf.py delete mode 100644 src/transformers/models/seamless_m4t_v2/convert_fairseq2_to_hf.py delete mode 100644 src/transformers/models/segformer/convert_segformer_original_to_pytorch.py delete mode 100644 src/transformers/models/seggpt/convert_seggpt_to_hf.py delete mode 100644 src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/siglip/convert_siglip_to_hf.py delete mode 100644 src/transformers/models/speech_encoder_decoder/convert_mbart_wav2vec2_seq2seq_original_to_pytorch.py delete mode 100644 src/transformers/models/speech_encoder_decoder/convert_speech_to_text_wav2vec2_seq2seq_original_to_pytorch.py delete mode 100644 src/transformers/models/speech_to_text/convert_s2t_fairseq_to_tfms.py delete mode 100644 src/transformers/models/speecht5/convert_hifigan.py delete mode 100644 src/transformers/models/speecht5/convert_speecht5_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/superpoint/convert_superpoint_to_pytorch.py delete mode 100644 src/transformers/models/swiftformer/convert_swiftformer_original_to_hf.py delete mode 100644 src/transformers/models/swin/convert_swin_simmim_to_pytorch.py delete mode 100644 src/transformers/models/swin/convert_swin_timm_to_pytorch.py delete mode 100644 src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py delete mode 100644 src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py delete mode 100644 src/transformers/models/switch_transformers/convert_big_switch.py delete mode 100644 src/transformers/models/switch_transformers/convert_switch_transformers_original_flax_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/t5/convert_t5_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py delete mode 100755 src/transformers/models/t5/convert_t5x_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/table_transformer/convert_table_transformer_to_hf.py delete mode 100644 src/transformers/models/table_transformer/convert_table_transformer_to_hf_no_timm.py delete mode 100644 src/transformers/models/tapas/convert_tapas_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/textnet/convert_textnet_to_hf.py delete mode 100644 src/transformers/models/timesformer/convert_timesformer_to_pytorch.py delete mode 100644 src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py delete mode 100644 src/transformers/models/udop/convert_udop_to_hf.py delete mode 100644 src/transformers/models/umt5/convert_umt5_checkpoint_to_pytorch.py delete mode 100644 
src/transformers/models/unispeech/convert_unispeech_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/unispeech_sat/convert_unispeech_original_s3prl_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/unispeech_sat/convert_unispeech_sat_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/univnet/convert_univnet.py delete mode 100644 src/transformers/models/upernet/convert_convnext_upernet_to_pytorch.py delete mode 100644 src/transformers/models/upernet/convert_swin_upernet_to_pytorch.py delete mode 100644 src/transformers/models/video_llava/convert_video_llava_weights_to_hf.py delete mode 100644 src/transformers/models/videomae/convert_videomae_to_pytorch.py delete mode 100644 src/transformers/models/vilt/convert_vilt_original_to_pytorch.py delete mode 100644 src/transformers/models/vipllava/convert_vipllava_weights_to_hf.py delete mode 100644 src/transformers/models/visual_bert/convert_visual_bert_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/vit/convert_dino_to_pytorch.py delete mode 100644 src/transformers/models/vit/convert_vit_timm_to_pytorch.py delete mode 100644 src/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py delete mode 100644 src/transformers/models/vit_msn/convert_msn_to_pytorch.py delete mode 100644 src/transformers/models/vitmatte/convert_vitmatte_to_hf.py delete mode 100644 src/transformers/models/vitpose/convert_vitpose_to_hf.py delete mode 100644 src/transformers/models/vits/convert_original_checkpoint.py delete mode 100644 src/transformers/models/vivit/convert_vivit_flax_to_pytorch.py delete mode 100644 src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/wav2vec2/convert_wav2vec2_original_s3prl_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py delete mode 100644 src/transformers/models/wav2vec2_conformer/convert_wav2vec2_conformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/wavlm/convert_wavlm_original_s3prl_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/whisper/convert_openai_to_hf.py delete mode 100644 src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/xglm/convert_xglm_original_ckpt_to_trfms.py delete mode 100755 src/transformers/models/xlm/convert_xlm_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/xlm_roberta_xl/convert_xlm_roberta_xl_original_pytorch_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/xlnet/convert_xlnet_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/xmod/convert_xmod_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/yolos/convert_yolos_to_pytorch.py delete mode 100644 src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py delete mode 100644 src/transformers/models/zoedepth/convert_zoedepth_to_hf.py diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index ee155e377e41..87496f95a1d2 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -61,7 +61,7 @@ logger = logging.getLogger(__name__) # Will error if the 
minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py index 095af99efffc..590bf5a0518c 100644 --- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py +++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risk. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt") diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index ddbde78f703c..a7d5c0a0c912 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index 5f1988c36de1..7652fc2355a9 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index ef308316569b..650b088b302e 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -45,7 +45,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index d42fb52d5c13..9f9f5decf6a9 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 111d8adce8b4..ddf35750d447 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 6cbcac0a7e68..6295cb46e551 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index f23e55191709..fd876fdfcfc2 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 9d052076b7b1..654a9be30bcd 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index 100a1365c2e9..9c31fe31ac0c 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py index 806330fb72d1..138f61bc4631 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py @@ -46,7 +46,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py index d888b7853dd4..9f2f2347b889 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py @@ -52,7 +52,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 10bfee8f25f7..442e246246c0 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 078b0add065c..5fc24dd81bee 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index cac845f3a055..d1ea873ad9a1 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -58,7 +58,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index 0a0e10511fa2..6b5673088ad5 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 8cb30099491a..db7565a21bf8 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -54,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 0bff38707d56..8bbe6fe9edad 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 20763558a5f6..0cb9abf487ef 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index f188e4e476a2..d2bee272db76 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 2d4e8bdbb92c..6c40f9ecb1d3 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index 07fcb36acb15..4ae9b39f6ea6 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt") diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py index 33ad0499301e..9f3f7eefd81e 100644 --- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py +++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logging.basicConfig(level=logging.INFO) logger = get_logger(__name__) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 6de464f43670..0c7a124a10f9 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index c3e12ac9edef..27b235c7bb41 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 8e791564b007..2b98a6e11d97 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 6ccce481b548..77b3fda276df 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index c1874b3fe18e..7096bf12ba9b 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index 6f77f8256417..e89272226ef0 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 16e64eb92343..8d92330f7c68 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 9eb3498c8c17..fdbbe306bd90 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index 682d3b16d216..f41a63c2b368 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index e6a643e42139..eff37d156a26 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index a2d09f200047..e557a82377da 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index ab5ab7adb19c..cd1d3e768bcf 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index dae845b119b1..a314acfc71ff 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 2a99bc42e119..7af2359bdcf2 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 8da7e86d8755..ef50dfd5916d 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index c76f83ce4def..de0157546ac6 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 056db7167280..bd7f315f8dec 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 56e3a1e646db..df793ad689a5 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index dadfcb80941e..18247f875e56 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index df4c1e9557a9..e9fd6da5f3c7 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index 01c31de8730b..fc8b5f20c2b4 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version( "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt" diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index 296c70549bda..4568f06bf44e 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index b35d761d8a6a..9a4faf78d84d 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index a78a5d89e19f..e83d8156e7cf 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -62,7 +62,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 92a10990d160..0c6f36fd5c8c 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -53,7 +53,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index a8f2de825cc2..a5d942d2d74b 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.48.0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 1afb72cf1098..bd62dbb615b7 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -56,7 +56,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.48.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/setup.py b/setup.py index 1a4d94c24e8c..c6d12f87b78c 100644 --- a/setup.py +++ b/setup.py @@ -437,7 +437,7 @@ def run(self): setup( name="transformers", - version="4.48.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.48.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 2b4980306c53..d0d8babcb242 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). -__version__ = "4.48.0.dev0" +__version__ = "4.48.0" from typing import TYPE_CHECKING diff --git a/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index df2a22610187..000000000000 --- a/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ALBERT checkpoint.""" - -import argparse - -import torch - -from ...utils import logging -from . import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): - # Initialise PyTorch model - config = AlbertConfig.from_json_file(albert_config_file) - print(f"Building PyTorch model from configuration: {config}") - model = AlbertForPreTraining(config) - - # Load weights from tf checkpoint - load_tf_weights_in_albert(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--albert_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained ALBERT model. \n" - "This specifies the model architecture." 
- ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/align/convert_align_tf_to_hf.py b/src/transformers/models/align/convert_align_tf_to_hf.py deleted file mode 100644 index 610db8482f91..000000000000 --- a/src/transformers/models/align/convert_align_tf_to_hf.py +++ /dev/null @@ -1,389 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ALIGN checkpoints from the original repository.""" - -import argparse -import os - -import align -import numpy as np -import requests -import tensorflow as tf -import torch -from PIL import Image -from tokenizer import Tokenizer - -from transformers import ( - AlignConfig, - AlignModel, - AlignProcessor, - BertConfig, - BertTokenizer, - EfficientNetConfig, - EfficientNetImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def preprocess(image): - image = tf.image.resize(image, (346, 346)) - image = tf.image.crop_to_bounding_box(image, (346 - 289) // 2, (346 - 289) // 2, 289, 289) - return image - - -def get_align_config(): - vision_config = EfficientNetConfig.from_pretrained("google/efficientnet-b7") - vision_config.image_size = 289 - vision_config.hidden_dim = 640 - vision_config.id2label = {"0": "LABEL_0", "1": "LABEL_1"} - vision_config.label2id = {"LABEL_0": 0, "LABEL_1": 1} - vision_config.depthwise_padding = [] - - text_config = BertConfig() - config = AlignConfig.from_text_vision_configs( - text_config=text_config, vision_config=vision_config, projection_dim=640 - ) - return config - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def get_processor(): - image_processor = EfficientNetImageProcessor( - do_center_crop=True, - rescale_factor=1 / 127.5, - rescale_offset=True, - do_normalize=False, - include_top=False, - resample=Image.BILINEAR, - ) - tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") - tokenizer.model_max_length = 64 - processor = AlignProcessor(image_processor=image_processor, tokenizer=tokenizer) - return processor - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def rename_keys(original_param_names): - # EfficientNet image encoder - block_names = [v.split("_")[0].split("block")[1] for v in original_param_names if v.startswith("block")] - block_names = list(set(block_names)) - block_names = sorted(block_names) - num_blocks = len(block_names) - block_name_mapping = {b: str(i) for b, i in zip(block_names, range(num_blocks))} - - rename_keys = [] - rename_keys.append(("stem_conv/kernel:0", 
"embeddings.convolution.weight")) - rename_keys.append(("stem_bn/gamma:0", "embeddings.batchnorm.weight")) - rename_keys.append(("stem_bn/beta:0", "embeddings.batchnorm.bias")) - rename_keys.append(("stem_bn/moving_mean:0", "embeddings.batchnorm.running_mean")) - rename_keys.append(("stem_bn/moving_variance:0", "embeddings.batchnorm.running_var")) - - for b in block_names: - hf_b = block_name_mapping[b] - rename_keys.append((f"block{b}_expand_conv/kernel:0", f"encoder.blocks.{hf_b}.expansion.expand_conv.weight")) - rename_keys.append((f"block{b}_expand_bn/gamma:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.weight")) - rename_keys.append((f"block{b}_expand_bn/beta:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.bias")) - rename_keys.append( - (f"block{b}_expand_bn/moving_mean:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_mean") - ) - rename_keys.append( - (f"block{b}_expand_bn/moving_variance:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_var") - ) - rename_keys.append( - (f"block{b}_dwconv/depthwise_kernel:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_conv.weight") - ) - rename_keys.append((f"block{b}_bn/gamma:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.weight")) - rename_keys.append((f"block{b}_bn/beta:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.bias")) - rename_keys.append( - (f"block{b}_bn/moving_mean:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_mean") - ) - rename_keys.append( - (f"block{b}_bn/moving_variance:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_var") - ) - - rename_keys.append((f"block{b}_se_reduce/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.weight")) - rename_keys.append((f"block{b}_se_reduce/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.bias")) - rename_keys.append((f"block{b}_se_expand/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.weight")) - rename_keys.append((f"block{b}_se_expand/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.bias")) - rename_keys.append( - (f"block{b}_project_conv/kernel:0", f"encoder.blocks.{hf_b}.projection.project_conv.weight") - ) - rename_keys.append((f"block{b}_project_bn/gamma:0", f"encoder.blocks.{hf_b}.projection.project_bn.weight")) - rename_keys.append((f"block{b}_project_bn/beta:0", f"encoder.blocks.{hf_b}.projection.project_bn.bias")) - rename_keys.append( - (f"block{b}_project_bn/moving_mean:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_mean") - ) - rename_keys.append( - (f"block{b}_project_bn/moving_variance:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_var") - ) - - key_mapping = {} - for item in rename_keys: - if item[0] in original_param_names: - key_mapping[item[0]] = "vision_model." 
+ item[1] - - # BERT text encoder - rename_keys = [] - old = "tf_bert_model/bert" - new = "text_model" - for i in range(12): - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/query/kernel:0", - f"{new}.encoder.layer.{i}.attention.self.query.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/query/bias:0", - f"{new}.encoder.layer.{i}.attention.self.query.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/key/kernel:0", - f"{new}.encoder.layer.{i}.attention.self.key.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/key/bias:0", - f"{new}.encoder.layer.{i}.attention.self.key.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/value/kernel:0", - f"{new}.encoder.layer.{i}.attention.self.value.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/value/bias:0", - f"{new}.encoder.layer.{i}.attention.self.value.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/output/dense/kernel:0", - f"{new}.encoder.layer.{i}.attention.output.dense.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/output/dense/bias:0", - f"{new}.encoder.layer.{i}.attention.output.dense.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/output/LayerNorm/gamma:0", - f"{new}.encoder.layer.{i}.attention.output.LayerNorm.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/output/LayerNorm/beta:0", - f"{new}.encoder.layer.{i}.attention.output.LayerNorm.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/intermediate/dense/kernel:0", - f"{new}.encoder.layer.{i}.intermediate.dense.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/intermediate/dense/bias:0", - f"{new}.encoder.layer.{i}.intermediate.dense.bias", - ) - ) - rename_keys.append( - (f"{old}/encoder/layer_._{i}/output/dense/kernel:0", f"{new}.encoder.layer.{i}.output.dense.weight") - ) - rename_keys.append( - (f"{old}/encoder/layer_._{i}/output/dense/bias:0", f"{new}.encoder.layer.{i}.output.dense.bias") - ) - rename_keys.append( - (f"{old}/encoder/layer_._{i}/output/LayerNorm/gamma:0", f"{new}.encoder.layer.{i}.output.LayerNorm.weight") - ) - rename_keys.append( - (f"{old}/encoder/layer_._{i}/output/LayerNorm/beta:0", f"{new}.encoder.layer.{i}.output.LayerNorm.bias") - ) - - rename_keys.append((f"{old}/embeddings/word_embeddings/weight:0", f"{new}.embeddings.word_embeddings.weight")) - rename_keys.append( - (f"{old}/embeddings/position_embeddings/embeddings:0", f"{new}.embeddings.position_embeddings.weight") - ) - rename_keys.append( - (f"{old}/embeddings/token_type_embeddings/embeddings:0", f"{new}.embeddings.token_type_embeddings.weight") - ) - rename_keys.append((f"{old}/embeddings/LayerNorm/gamma:0", f"{new}.embeddings.LayerNorm.weight")) - rename_keys.append((f"{old}/embeddings/LayerNorm/beta:0", f"{new}.embeddings.LayerNorm.bias")) - - rename_keys.append((f"{old}/pooler/dense/kernel:0", f"{new}.pooler.dense.weight")) - rename_keys.append((f"{old}/pooler/dense/bias:0", f"{new}.pooler.dense.bias")) - rename_keys.append(("dense/kernel:0", "text_projection.weight")) - rename_keys.append(("dense/bias:0", "text_projection.bias")) - rename_keys.append(("dense/bias:0", "text_projection.bias")) - rename_keys.append(("temperature:0", "temperature")) - - for item in rename_keys: - if item[0] in original_param_names: - 
key_mapping[item[0]] = item[1] - return key_mapping - - -def replace_params(hf_params, tf_params, key_mapping): - list(hf_params.keys()) - - for key, value in tf_params.items(): - if key not in key_mapping: - continue - - hf_key = key_mapping[key] - if "_conv" in key and "kernel" in key: - new_hf_value = torch.from_numpy(value).permute(3, 2, 0, 1) - elif "embeddings" in key: - new_hf_value = torch.from_numpy(value) - elif "depthwise_kernel" in key: - new_hf_value = torch.from_numpy(value).permute(2, 3, 0, 1) - elif "kernel" in key: - new_hf_value = torch.from_numpy(np.transpose(value)) - elif "temperature" in key: - new_hf_value = value - elif "bn/gamma" or "bn/beta" in key: - new_hf_value = torch.from_numpy(np.transpose(value)).squeeze() - else: - new_hf_value = torch.from_numpy(value) - - # Replace HF parameters with original TF model parameters - hf_params[hf_key].copy_(new_hf_value) - - -@torch.no_grad() -def convert_align_checkpoint(checkpoint_path, pytorch_dump_folder_path, save_model, push_to_hub): - """ - Copy/paste/tweak model's weights to our ALIGN structure. - """ - # Load original model - seq_length = 64 - tok = Tokenizer(seq_length) - original_model = align.Align("efficientnet-b7", "bert-base", 640, seq_length, tok.get_vocab_size()) - original_model.compile() - original_model.load_weights(checkpoint_path) - - tf_params = original_model.trainable_variables - tf_non_train_params = original_model.non_trainable_variables - tf_params = {param.name: param.numpy() for param in tf_params} - for param in tf_non_train_params: - tf_params[param.name] = param.numpy() - tf_param_names = list(tf_params.keys()) - - # Load HuggingFace model - config = get_align_config() - hf_model = AlignModel(config).eval() - hf_params = hf_model.state_dict() - - # Create src-to-dst parameter name mapping dictionary - print("Converting parameters...") - key_mapping = rename_keys(tf_param_names) - replace_params(hf_params, tf_params, key_mapping) - - # Initialize processor - processor = get_processor() - inputs = processor( - images=prepare_img(), text="A picture of a cat", padding="max_length", max_length=64, return_tensors="pt" - ) - - # HF model inference - hf_model.eval() - with torch.no_grad(): - outputs = hf_model(**inputs) - - hf_image_features = outputs.image_embeds.detach().numpy() - hf_text_features = outputs.text_embeds.detach().numpy() - - # Original model inference - original_model.trainable = False - tf_image_processor = EfficientNetImageProcessor( - do_center_crop=True, - do_rescale=False, - do_normalize=False, - include_top=False, - resample=Image.BILINEAR, - ) - image = tf_image_processor(images=prepare_img(), return_tensors="tf", data_format="channels_last")["pixel_values"] - text = tok(tf.constant(["A picture of a cat"])) - - image_features = original_model.image_encoder(image, training=False) - text_features = original_model.text_encoder(text, training=False) - - image_features = tf.nn.l2_normalize(image_features, axis=-1) - text_features = tf.nn.l2_normalize(text_features, axis=-1) - - # Check whether original and HF model outputs match -> np.allclose - if not np.allclose(image_features, hf_image_features, atol=1e-3): - raise ValueError("The predicted image features are not the same.") - if not np.allclose(text_features, hf_text_features, atol=1e-3): - raise ValueError("The predicted text features are not the same.") - print("Model outputs match!") - - if save_model: - # Create folder to save model - if not os.path.isdir(pytorch_dump_folder_path): - os.mkdir(pytorch_dump_folder_path) - # 
Save converted model and image processor - hf_model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - # Push model and image processor to hub - print("Pushing converted ALIGN to the hub...") - processor.push_to_hub("align-base") - hf_model.push_to_hub("align-base") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_path", - default="./weights/model-weights", - type=str, - help="Path to the pretrained TF ALIGN checkpoint.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="hf_model", - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--save_model", action="store_true", help="Save model to local") - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub") - - args = parser.parse_args() - convert_align_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub) diff --git a/src/transformers/models/aria/convert_aria_weights_to_hf.py b/src/transformers/models/aria/convert_aria_weights_to_hf.py deleted file mode 100644 index dcc9e4d13976..000000000000 --- a/src/transformers/models/aria/convert_aria_weights_to_hf.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import glob - -import torch -from huggingface_hub import snapshot_download -from safetensors import safe_open - -from transformers import ( - AddedToken, - AriaForConditionalGeneration, - AriaProcessor, - AutoConfig, - AutoTokenizer, -) - - -EPILOG_TXT = """Example: - python transformers/src/transformers/models/aria/convert_aria_weights_to_hf.py --text_model_id rhymes-ai/Aria --vision_model_id rhymes-ai/Aria --output_hub_path m-ric/Aria_hf_2 --old_state_dict_id rhymes-ai/Aria - -Example for creating the old state dict file with Python: - - import torch - from aria.model.language_model.aria_llama import AriaTextForCausalLM - - # load model - kwargs = {"device_map": "auto", "torch_dtype": torch.float16} - model = AriaTextForCausalLM.from_pretrained("rhymes-ai/Aria", low_cpu_mem_usage=True, **kwargs) - - # load vision tower - model.get_vision_tower().load_model() - - # Save state dict - torch.save(model.state_dict(), "tmp/hf_models/aria/model_state_dict.bin") -""" - -KEYS_TO_MODIFY_MAPPING = { - "vision_tower.vision_model": "vision_tower", - "ln_ffn": "layer_norm", - "ffn": "feed_forward", - "ln_kv": "layer_norm_kv", -} - - -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - return original_state_dict - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value - new_state_dict["vision_tower.post_layernorm.weight"] = torch.zeros((1152,)) - new_state_dict["vision_tower.post_layernorm.bias"] = torch.zeros((1152,)) - - return new_state_dict - - -def convert_aria_llama_to_hf(text_model_id, vision_model_id, output_hub_path, old_state_dict_id): - torch.set_default_dtype(torch.float16) - - tokenizer = AutoTokenizer.from_pretrained( - text_model_id, - extra_special_tokens={ - "image_token": "<|img|>", - "pad_token": "", - }, - ) - tokenizer.add_tokens(AddedToken("<|img|>", special=True, normalized=False), special_tokens=True) - tokenizer.add_special_tokens({"pad_token": ""}) - tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}{% elif message['content'] is iterable %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<|img|>{% endif %}{% endfor %}{% endif %}<|im_end|>\n{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" - - processor = AriaProcessor.from_pretrained( - text_model_id, - tokenizer=tokenizer, - ) - - config = AutoConfig.from_pretrained(text_model_id) - config.vision_config.hidden_size = 1152 - config.vision_config.attention_heads = 16 - config.pad_token_id = 2 - config.image_token_index = 9 - config.intermediate_size = config.moe_intermediate_size - config.auto_map = { - "AutoConfig": "modeling_aria.AriaConfig", - "AutoModelForCausalLM": "modeling_aria.AriaForConditionalGeneration", - } - - with 
torch.device("meta"): - model = AriaForConditionalGeneration(config) - - state_dict = load_original_state_dict(old_state_dict_id) - - state_dict = convert_state_dict_to_hf(state_dict) - model.load_state_dict(state_dict, strict=False, assign=True) - - # print("Saving models") - # model.save_pretrained("local_aria", safe_serialization=False) - # processor.save_pretrained("local_aria") - print("Pushing to hub") - model.push_to_hub(output_hub_path, create_pr=True) - processor.push_to_hub(output_hub_path, create_pr=True) - - -def main(): - parser = argparse.ArgumentParser( - epilog=EPILOG_TXT, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--text_model_id", - default="rhymes-ai/Aria", - help="Hub location of the text model", - ) - parser.add_argument( - "--vision_model_id", - default="rhymes-ai/Aria", - help="Hub location of the vision model", - ) - parser.add_argument( - "--output_hub_path", - default="rhymes-ai/Aria", - help="Location on the hub of the converted model", - ) - parser.add_argument( - "--old_state_dict_id", - default="rhymes-ai/Aria", - help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`", - ) - args = parser.parse_args() - convert_aria_llama_to_hf(args.text_model_id, args.vision_model_id, args.output_hub_path, args.old_state_dict_id) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py deleted file mode 100644 index d211ef7ab058..000000000000 --- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py +++ /dev/null @@ -1,279 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Audio Spectrogram Transformer checkpoints from the original repository. 
URL: https://github.com/YuanGongND/ast""" - -import argparse -import json -from pathlib import Path - -import torch -import torchaudio -from datasets import load_dataset -from huggingface_hub import hf_hub_download - -from transformers import ASTConfig, ASTFeatureExtractor, ASTForAudioClassification -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_audio_spectrogram_transformer_config(model_name): - config = ASTConfig() - - if "10-10" in model_name: - pass - elif "speech-commands" in model_name: - config.max_length = 128 - elif "12-12" in model_name: - config.time_stride = 12 - config.frequency_stride = 12 - elif "14-14" in model_name: - config.time_stride = 14 - config.frequency_stride = 14 - elif "16-16" in model_name: - config.time_stride = 16 - config.frequency_stride = 16 - else: - raise ValueError("Model not supported") - - repo_id = "huggingface/label-files" - if "speech-commands" in model_name: - config.num_labels = 35 - filename = "speech-commands-v2-id2label.json" - else: - config.num_labels = 527 - filename = "audioset-id2label.json" - - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config - - -def rename_key(name): - if "module.v" in name: - name = name.replace("module.v", "audio_spectrogram_transformer") - if "cls_token" in name: - name = name.replace("cls_token", "embeddings.cls_token") - if "dist_token" in name: - name = name.replace("dist_token", "embeddings.distillation_token") - if "pos_embed" in name: - name = name.replace("pos_embed", "embeddings.position_embeddings") - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") - # transformer blocks - if "blocks" in name: - name = name.replace("blocks", "encoder.layer") - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if "attn" in name: - name = name.replace("attn", "attention.self") - if "norm1" in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name: - name = name.replace("norm2", "layernorm_after") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") - # final layernorm - if "audio_spectrogram_transformer.norm" in name: - name = name.replace("audio_spectrogram_transformer.norm", "audio_spectrogram_transformer.layernorm") - # classifier head - if "module.mlp_head.0" in name: - name = name.replace("module.mlp_head.0", "classifier.layernorm") - if "module.mlp_head.1" in name: - name = name.replace("module.mlp_head.1", "classifier.dense") - - return name - - -def convert_state_dict(orig_state_dict, config): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - key_split = key.split(".") - layer_num = int(key_split[3]) - dim = config.hidden_size - if "weight" in key: - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.weight" - ] = val[:dim, :] - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.weight" - ] = val[dim : dim * 2, :] - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.weight" - ] = val[-dim:, :] - else: - orig_state_dict[ - 
f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.bias" - ] = val[:dim] - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.bias" - ] = val[dim : dim * 2] - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.bias" - ] = val[-dim:] - else: - orig_state_dict[rename_key(key)] = val - - return orig_state_dict - - -def remove_keys(state_dict): - ignore_keys = [ - "module.v.head.weight", - "module.v.head.bias", - "module.v.head_dist.weight", - "module.v.head_dist.bias", - ] - for k in ignore_keys: - state_dict.pop(k, None) - - -@torch.no_grad() -def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our Audio Spectrogram Transformer structure. - """ - config = get_audio_spectrogram_transformer_config(model_name) - - model_name_to_url = { - "ast-finetuned-audioset-10-10-0.4593": ( - "https://www.dropbox.com/s/ca0b1v2nlxzyeb4/audioset_10_10_0.4593.pth?dl=1" - ), - "ast-finetuned-audioset-10-10-0.450": ( - "https://www.dropbox.com/s/1tv0hovue1bxupk/audioset_10_10_0.4495.pth?dl=1" - ), - "ast-finetuned-audioset-10-10-0.448": ( - "https://www.dropbox.com/s/6u5sikl4b9wo4u5/audioset_10_10_0.4483.pth?dl=1" - ), - "ast-finetuned-audioset-10-10-0.448-v2": ( - "https://www.dropbox.com/s/kt6i0v9fvfm1mbq/audioset_10_10_0.4475.pth?dl=1" - ), - "ast-finetuned-audioset-12-12-0.447": ( - "https://www.dropbox.com/s/snfhx3tizr4nuc8/audioset_12_12_0.4467.pth?dl=1" - ), - "ast-finetuned-audioset-14-14-0.443": ( - "https://www.dropbox.com/s/z18s6pemtnxm4k7/audioset_14_14_0.4431.pth?dl=1" - ), - "ast-finetuned-audioset-16-16-0.442": ( - "https://www.dropbox.com/s/mdsa4t1xmcimia6/audioset_16_16_0.4422.pth?dl=1" - ), - "ast-finetuned-speech-commands-v2": ( - "https://www.dropbox.com/s/q0tbqpwv44pquwy/speechcommands_10_10_0.9812.pth?dl=1" - ), - } - - # load original state_dict - checkpoint_url = model_name_to_url[model_name] - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - # remove some keys - remove_keys(state_dict) - # rename some keys - new_state_dict = convert_state_dict(state_dict, config) - - # load đŸ€— model - model = ASTForAudioClassification(config) - model.eval() - - model.load_state_dict(new_state_dict) - - # verify outputs on dummy input - # source: https://github.com/YuanGongND/ast/blob/79e873b8a54d0a3b330dd522584ff2b9926cd581/src/run.py#L62 - mean = -4.2677393 if "speech-commands" not in model_name else -6.845978 - std = 4.5689974 if "speech-commands" not in model_name else 5.5654526 - max_length = 1024 if "speech-commands" not in model_name else 128 - feature_extractor = ASTFeatureExtractor(mean=mean, std=std, max_length=max_length) - - if "speech-commands" in model_name: - # TODO: Convert dataset to Parquet - dataset = load_dataset("google/speech_commands", "v0.02", split="validation", trust_remote_code=True) - waveform = dataset[0]["audio"]["array"] - else: - filepath = hf_hub_download( - repo_id="nielsr/audio-spectogram-transformer-checkpoint", - filename="sample_audio.flac", - repo_type="dataset", - ) - - waveform, _ = torchaudio.load(filepath) - waveform = waveform.squeeze().numpy() - - inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt") - - # forward pass - outputs = model(**inputs) - logits = outputs.logits - - if model_name == "ast-finetuned-audioset-10-10-0.4593": - expected_slice = 
torch.tensor([-0.8760, -7.0042, -8.6602]) - elif model_name == "ast-finetuned-audioset-10-10-0.450": - expected_slice = torch.tensor([-1.1986, -7.0903, -8.2718]) - elif model_name == "ast-finetuned-audioset-10-10-0.448": - expected_slice = torch.tensor([-2.6128, -8.0080, -9.4344]) - elif model_name == "ast-finetuned-audioset-10-10-0.448-v2": - expected_slice = torch.tensor([-1.5080, -7.4534, -8.8917]) - elif model_name == "ast-finetuned-audioset-12-12-0.447": - expected_slice = torch.tensor([-0.5050, -6.5833, -8.0843]) - elif model_name == "ast-finetuned-audioset-14-14-0.443": - expected_slice = torch.tensor([-0.3826, -7.0336, -8.2413]) - elif model_name == "ast-finetuned-audioset-16-16-0.442": - expected_slice = torch.tensor([-1.2113, -6.9101, -8.3470]) - elif model_name == "ast-finetuned-speech-commands-v2": - expected_slice = torch.tensor([6.1589, -8.0566, -8.7984]) - else: - raise ValueError("Unknown model name") - if not torch.allclose(logits[0, :3], expected_slice, atol=1e-4): - raise ValueError("Logits don't match") - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving feature extractor to {pytorch_dump_folder_path}") - feature_extractor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and feature extractor to the hub...") - model.push_to_hub(f"MIT/{model_name}") - feature_extractor.push_to_hub(f"MIT/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="ast-finetuned-audioset-10-10-0.4593", - type=str, - help="Name of the Audio Spectrogram Transformer model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - - args = parser.parse_args() - convert_audio_spectrogram_transformer_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py b/src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py deleted file mode 100644 index a7b8cfc78290..000000000000 --- a/src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py +++ /dev/null @@ -1,273 +0,0 @@ -# coding=utf-8 -# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""This script can be used to convert checkpoints provided in the `mamba_ssm` library into the format provided in HuggingFace `transformers`. 
It depends on the `mamba2_ssm` package to be installed.""" - -import argparse -import json -import os -import re -from os import path -from typing import Dict, Union - -import torch -from huggingface_hub import split_torch_state_dict_into_shards -from safetensors.torch import save_file - -from transformers import AutoTokenizer -from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME - -from .configuration_bamba import BambaConfig - - -def convert_state_dict_from_mamba_ssm(original_sd: Dict) -> Dict[str, torch.Tensor]: - state_dict = {} - - for orig_k, param in original_sd.items(): - k = orig_k.replace("backbone", "model") - - # for embeddings - k = k.replace("embedding", "embed_tokens") - - # for mixer - k = k.replace("mixer", "mamba") - - # for final layernorm - k = k.replace("norm_f", "final_layernorm") - - # for block layernorm - k = re.sub(r"(\d+)\.norm\.", r"\1.input_layernorm.", k) - k = re.sub(r"(\d+)\.norm2\.", r"\1.pre_ff_layernorm.", k) - - # for mlp - k = k.replace("mlp.fc2", "feed_forward.down_proj") - - if "mlp.fc1" in k: - param, param2 = torch.chunk(param, 2, dim=0) - k2 = k.replace("mlp.fc1", "feed_forward.gate_proj") - state_dict[k2] = param2 - k = k.replace("mlp.fc1", "feed_forward.up_proj") - - if ("in_proj" in k and orig_k.replace("in_proj", "conv1d") in original_sd) or ( - "out_proj" in k and orig_k.replace("out_proj", "conv1d") in original_sd - ): - # then this must be a mamba - pass - else: - # for attn - # - because mixer was replaced to mamba above - k = k.replace("mamba.out_proj", "self_attn.o_proj") - if "mamba.in_proj" in k: - m, n = param.shape - d = (m - n) // 2 - param, param2, param3 = torch.split(param, [n, d, d], dim=0) - k2 = k.replace("mamba.in_proj", "self_attn.k_proj") - state_dict[k2] = param2 - k2 = k.replace("mamba.in_proj", "self_attn.v_proj") - state_dict[k2] = param3 - k = k.replace("mamba.in_proj", "self_attn.q_proj") - - state_dict[k] = param - - return state_dict - - -# Adapted from transformers.models.mamba.convert_mamba_ssm_checkpoint_to_pytorch.py -def convert_ssm_config_to_hf_config( - config_ssm: Dict, - **kwargs, -) -> BambaConfig: - """Convert a config from mamba_ssm to a BambaConfig from here.""" - hf_config: BambaConfig = BambaConfig(**kwargs) - - hf_config.architectures = ["BambaForCausalLM"] - - # Set important values from config and recalculate other resulting entries - hf_config.hidden_size = config_ssm["d_model"] - hf_config.intermediate_size = config_ssm["d_intermediate"] - hf_config.mamba_n_heads = (hf_config.hidden_size * hf_config.mamba_expand) // hf_config.mamba_d_head - hf_config.num_hidden_layers = config_ssm["n_layer"] - hf_config.tie_word_embeddings = config_ssm["tie_embeddings"] - - # currently this script assumes config_ssm belongs to v2 - if config_ssm["ssm_cfg"].get("layer") != "Mamba2": - raise ValueError("Conversion script only supports Mamba2") - - # Set attention values - attn_cfg = config_ssm.get("attn_cfg") - if attn_cfg: - assert attn_cfg["causal"], "Only support non-causal attention." - assert not attn_cfg["qkv_proj_bias"], "Only support no qkv bias." - assert not attn_cfg["out_proj_bias"], "Only support no out bias." 
- hf_config.attn_rotary_emb = attn_cfg["rotary_emb_dim"] - hf_config.num_attention_heads = attn_cfg["num_heads"] - hf_config.num_key_value_heads = attn_cfg["num_heads_kv"] - - attention_layer_indices = config_ssm.get("attn_layer_idx") - if attention_layer_indices: - hf_config.attn_layer_indices = attention_layer_indices - - # Padded vocab size, mostly of 16 but 32 is also very common in different models - vocab_size = config_ssm["vocab_size"] - pad_vocab_size_multiple = config_ssm["pad_vocab_size_multiple"] - if (vocab_size % pad_vocab_size_multiple) != 0: - vocab_size += pad_vocab_size_multiple - (vocab_size % pad_vocab_size_multiple) - hf_config.vocab_size = vocab_size - - return hf_config - - -def save_single_safetensor( - state_dict: Dict, - save_directory: str, - metadata: Dict, -): - save_file( - state_dict, - os.path.join(save_directory, SAFE_WEIGHTS_NAME), - metadata, - ) - - -def save_sharded_safetensors( - state_dict: Dict, - save_directory: str, - metadata: Dict, - max_shard_size: Union[int, str] = "5GB", -): - filename_pattern = SAFE_WEIGHTS_NAME.replace(".bin", "{suffix}.bin").replace( - ".safetensors", "{suffix}.safetensors" - ) - state_dict_split = split_torch_state_dict_into_shards( - state_dict, filename_pattern=filename_pattern, max_shard_size=max_shard_size - ) - index = { - "metadata": state_dict_split.metadata, - "weight_map": state_dict_split.tensor_to_filename, - } - # Save the index - with open(os.path.join(save_directory, SAFE_WEIGHTS_INDEX_NAME), "w", encoding="utf-8") as f: - content = json.dumps(index, indent=2, sort_keys=True) + "\n" - f.write(content) - - filename_to_tensors = state_dict_split.filename_to_tensors.items() - for shard_file, tensors in filename_to_tensors: - shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors} - save_file(shard, os.path.join(save_directory, shard_file), metadata=metadata) - - -# Adapted from transformers.models.mamba.convert_mamba_ssm_checkpoint_to_pytorch.py -def convert_mamba_ssm_checkpoint_file_to_huggingface_model_file( - mamba_ssm_checkpoint_path: str, - precision: str, - output_dir: str, - tokenizer_path: str = None, - save_model: Union[bool, str] = True, -) -> None: - # load tokenizer if provided, this will be used to set the - # token_ids in the config file - token_ids = {} - if tokenizer_path: - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - for key in [ - "bos_token_id", - "eos_token_id", - "pad_token_id", - ]: - id = getattr(tokenizer, key, None) - if id: - token_ids[key] = id - - # there are some configs unsettable by mamba_ssn config, so - # if there are changes from the defaults, have to pass them into - # the function - unsettables = { - "mamba_d_head": 64, - "mamba_d_state": 128, - "mamba_n_groups": 1, - "rms_norm_eps": 1e-5, - } - - # Load and save config based on name - config_path = path.join(mamba_ssm_checkpoint_path, "config.json") - with open(config_path, "r", encoding="utf-8") as json_file: - config = json.load(json_file) - - # convert the config - hf_config = convert_ssm_config_to_hf_config( - config_ssm=config, - **token_ids, - **unsettables, - ) - hf_config.save_pretrained(output_dir) - - # Load state dict of the original model and transfer to hf model - state_dict = torch.load( - path.join(mamba_ssm_checkpoint_path, "pytorch_model.bin"), - map_location="cpu", - weights_only=True, - ) - # FIXME: allow other parameters to pass in - state_dict = convert_state_dict_from_mamba_ssm(state_dict) - - # Save new model to pytorch_dump_path - dtype = torch.float32 if precision == 
"fp32" else (torch.bfloat16 if precision == "bf16" else torch.float16) - - save_file_fn = None - if isinstance(save_model, bool) and save_model: - save_file_fn = save_single_safetensor - elif isinstance(save_model, str) and save_model == "sharded": - save_file_fn = save_sharded_safetensors - - if save_file_fn: - save_file_fn({k: v.to(dtype) for k, v in state_dict.items()}, output_dir, metadata={"format": "pt"}) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", - "--mamba_ssm_checkpoint_directory", - type=str, - required=True, - help="Path to a directory containing the `pytorch_model.bin` mamba_ssm checkpoint file to be converted.", - ) - parser.add_argument( - "-p", - "--precision", - type=str, - default="fp16", - const="fp16", - required=True, - choices=("fp32", "fp16", "bf16"), - help="The precision the model will be saved in. Select from fp32, fp16 or bf16.", - ) - parser.add_argument( - "-o", "--output_dir", type=str, required=True, help="Path to directory to save the converted output model to." - ) - parser.add_argument( - "-t", - "--tokenizer_model_path", - type=str, - default=None, - required=False, - help="Path to a the tokenizer file.", - ) - args = parser.parse_args() - - convert_mamba_ssm_checkpoint_file_to_huggingface_model_file( - args.mamba2_checkpoint_directory, - args.precision, - args.output_dir, - ) diff --git a/src/transformers/models/bark/convert_suno_to_hf.py b/src/transformers/models/bark/convert_suno_to_hf.py deleted file mode 100644 index 880debe60ae4..000000000000 --- a/src/transformers/models/bark/convert_suno_to_hf.py +++ /dev/null @@ -1,263 +0,0 @@ -"""Convert Bark checkpoint.""" - -import argparse -import os -from pathlib import Path - -import torch -from bark.generation import _load_model as _bark_load_model -from huggingface_hub import hf_hub_download - -from transformers import EncodecConfig, EncodecModel, set_seed -from transformers.models.bark.configuration_bark import ( - BarkCoarseConfig, - BarkConfig, - BarkFineConfig, - BarkSemanticConfig, -) -from transformers.models.bark.generation_configuration_bark import ( - BarkCoarseGenerationConfig, - BarkFineGenerationConfig, - BarkGenerationConfig, - BarkSemanticGenerationConfig, -) -from transformers.models.bark.modeling_bark import BarkCoarseModel, BarkFineModel, BarkModel, BarkSemanticModel -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -set_seed(770) - - -new_layer_name_dict = { - "c_attn": "att_proj", - "c_proj": "out_proj", - "c_fc": "in_proj", - "transformer.": "", - "h.": "layers.", - "ln_1": "layernorm_1", - "ln_2": "layernorm_2", - "ln_f": "layernorm_final", - "wpe": "position_embeds_layer", - "wte": "input_embeds_layer", -} - - -REMOTE_MODEL_PATHS = { - "text_small": { - "repo_id": "suno/bark", - "file_name": "text.pt", - }, - "coarse_small": { - "repo_id": "suno/bark", - "file_name": "coarse.pt", - }, - "fine_small": { - "repo_id": "suno/bark", - "file_name": "fine.pt", - }, - "text": { - "repo_id": "suno/bark", - "file_name": "text_2.pt", - }, - "coarse": { - "repo_id": "suno/bark", - "file_name": "coarse_2.pt", - }, - "fine": { - "repo_id": "suno/bark", - "file_name": "fine_2.pt", - }, -} - -CUR_PATH = os.path.dirname(os.path.abspath(__file__)) -default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache") -CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "suno", "bark_v0") - - -def _get_ckpt_path(model_type, use_small=False): - key = model_type - if 
use_small: - key += "_small" - return os.path.join(CACHE_DIR, REMOTE_MODEL_PATHS[key]["file_name"]) - - -def _download(from_hf_path, file_name): - os.makedirs(CACHE_DIR, exist_ok=True) - hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=CACHE_DIR) - - -def _load_model(ckpt_path, device, use_small=False, model_type="text"): - if model_type == "text": - ModelClass = BarkSemanticModel - ConfigClass = BarkSemanticConfig - GenerationConfigClass = BarkSemanticGenerationConfig - elif model_type == "coarse": - ModelClass = BarkCoarseModel - ConfigClass = BarkCoarseConfig - GenerationConfigClass = BarkCoarseGenerationConfig - elif model_type == "fine": - ModelClass = BarkFineModel - ConfigClass = BarkFineConfig - GenerationConfigClass = BarkFineGenerationConfig - else: - raise NotImplementedError() - model_key = f"{model_type}_small" if use_small else model_type - model_info = REMOTE_MODEL_PATHS[model_key] - if not os.path.exists(ckpt_path): - logger.info(f"{model_type} model not found, downloading into `{CACHE_DIR}`.") - _download(model_info["repo_id"], model_info["file_name"]) - checkpoint = torch.load(ckpt_path, map_location=device) - # this is a hack - model_args = checkpoint["model_args"] - if "input_vocab_size" not in model_args: - model_args["input_vocab_size"] = model_args["vocab_size"] - model_args["output_vocab_size"] = model_args["vocab_size"] - del model_args["vocab_size"] - - # convert Bark model arguments to HF Bark model arguments - model_args["num_heads"] = model_args.pop("n_head") - model_args["hidden_size"] = model_args.pop("n_embd") - model_args["num_layers"] = model_args.pop("n_layer") - - model_config = ConfigClass(**checkpoint["model_args"]) - model = ModelClass(config=model_config) - model_generation_config = GenerationConfigClass() - - model.generation_config = model_generation_config - state_dict = checkpoint["model"] - # fixup checkpoint - unwanted_prefix = "_orig_mod." 
- for k, v in list(state_dict.items()): - if k.startswith(unwanted_prefix): - # replace part of the key with corresponding layer name in HF implementation - new_k = k[len(unwanted_prefix) :] - for old_layer_name in new_layer_name_dict: - new_k = new_k.replace(old_layer_name, new_layer_name_dict[old_layer_name]) - - state_dict[new_k] = state_dict.pop(k) - - extra_keys = set(state_dict.keys()) - set(model.state_dict().keys()) - extra_keys = {k for k in extra_keys if not k.endswith(".attn.bias")} - missing_keys = set(model.state_dict().keys()) - set(state_dict.keys()) - missing_keys = {k for k in missing_keys if not k.endswith(".attn.bias")} - if len(extra_keys) != 0: - raise ValueError(f"extra keys found: {extra_keys}") - if len(missing_keys) != 0: - raise ValueError(f"missing keys: {missing_keys}") - model.load_state_dict(state_dict, strict=False) - n_params = model.num_parameters(exclude_embeddings=True) - val_loss = checkpoint["best_val_loss"].item() - logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss") - model.eval() - model.to(device) - del checkpoint, state_dict - - return model - - -def load_model(pytorch_dump_folder_path, use_small=False, model_type="text"): - if model_type not in ("text", "coarse", "fine"): - raise NotImplementedError() - - device = "cpu" # do conversion on cpu - - ckpt_path = _get_ckpt_path(model_type, use_small=use_small) - model = _load_model(ckpt_path, device, model_type=model_type, use_small=use_small) - - # load bark initial model - bark_model = _bark_load_model(ckpt_path, "cpu", model_type=model_type, use_small=use_small) - - if model_type == "text": - bark_model = bark_model["model"] - - if model.num_parameters(exclude_embeddings=True) != bark_model.get_num_params(): - raise ValueError("initial and new models don't have the same number of parameters") - - # check if same output as the bark model - batch_size = 5 - sequence_length = 10 - - if model_type in ["text", "coarse"]: - vec = torch.randint(256, (batch_size, sequence_length), dtype=torch.int) - output_old_model = bark_model(vec)[0] - - output_new_model_total = model(vec) - - # take last logits - output_new_model = output_new_model_total.logits[:, [-1], :] - - else: - prediction_codeboook_channel = 3 - n_codes_total = 8 - vec = torch.randint(256, (batch_size, sequence_length, n_codes_total), dtype=torch.int) - - output_new_model_total = model(prediction_codeboook_channel, vec) - output_old_model = bark_model(prediction_codeboook_channel, vec) - - output_new_model = output_new_model_total.logits - - # output difference should come from the difference of self-attention implementation design - if output_new_model.shape != output_old_model.shape: - raise ValueError("initial and new outputs don't have the same shape") - if (output_new_model - output_old_model).abs().max().item() > 1e-3: - raise ValueError("initial and new outputs are not equal") - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - - -def load_whole_bark_model( - semantic_path, - coarse_path, - fine_path, - append_text, - hub_path, - folder_path, -): - pytorch_dump_folder_path = os.path.join(folder_path, append_text) - - semanticConfig = BarkSemanticConfig.from_pretrained(os.path.join(semantic_path, "config.json")) - coarseAcousticConfig = BarkCoarseConfig.from_pretrained(os.path.join(coarse_path, "config.json")) - fineAcousticConfig = BarkFineConfig.from_pretrained(os.path.join(fine_path, "config.json")) - codecConfig = 
EncodecConfig.from_pretrained("facebook/encodec_24khz") - - semantic = BarkSemanticModel.from_pretrained(semantic_path) - coarseAcoustic = BarkCoarseModel.from_pretrained(coarse_path) - fineAcoustic = BarkFineModel.from_pretrained(fine_path) - codec = EncodecModel.from_pretrained("facebook/encodec_24khz") - - bark_config = BarkConfig.from_sub_model_configs( - semanticConfig, coarseAcousticConfig, fineAcousticConfig, codecConfig - ) - - bark_generation_config = BarkGenerationConfig.from_sub_model_configs( - semantic.generation_config, coarseAcoustic.generation_config, fineAcoustic.generation_config - ) - - bark = BarkModel(bark_config) - - bark.semantic = semantic - bark.coarse_acoustics = coarseAcoustic - bark.fine_acoustics = fineAcoustic - bark.codec_model = codec - - bark.generation_config = bark_generation_config - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - bark.save_pretrained(pytorch_dump_folder_path, repo_id=hub_path, push_to_hub=True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - - parser.add_argument("model_type", type=str, help="text, coarse or fine.") - parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--is_small", action="store_true", help="convert the small version instead of the large.") - - args = parser.parse_args() - - load_model(args.pytorch_dump_folder_path, model_type=args.model_type, use_small=args.is_small) diff --git a/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index e694d96ca0df..000000000000 --- a/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BART checkpoint.""" - -import argparse -import os -from pathlib import Path - -import fairseq -import torch -from packaging import version -from torch import nn - -from transformers import ( - BartConfig, - BartForConditionalGeneration, - BartForSequenceClassification, - BartModel, - BartTokenizer, -) -from transformers.utils import logging - - -FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn", "bart_xsum/model.pt"] -extra_arch = {"bart.large": BartModel, "bart.large.mnli": BartForSequenceClassification} -if version.parse(fairseq.__version__) < version.parse("0.9.0"): - raise Exception("requires fairseq >= 0.9.0") - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_TEXT = " Hello world! 
cĂ©cĂ© herlolip" - -mnli_rename_keys = [ - ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"), - ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"), - ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"), - ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"), -] - - -def remove_ignore_keys_(state_dict): - ignore_keys = [ - "encoder.version", - "decoder.version", - "model.encoder.version", - "model.decoder.version", - "_float_tensor", - ] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def load_xsum_checkpoint(checkpoint_path): - """Checkpoint path should end in model.pt""" - sd = torch.load(checkpoint_path, map_location="cpu") - hub_interface = torch.hub.load("pytorch/fairseq", "bart.large.cnn").eval() - hub_interface.model.load_state_dict(sd["model"]) - return hub_interface - - -def make_linear_from_emb(emb): - vocab_size, emb_size = emb.weight.shape - lin_layer = nn.Linear(vocab_size, emb_size, bias=False) - lin_layer.weight.data = emb.weight.data - return lin_layer - - -@torch.no_grad() -def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None): - """ - Copy/paste/tweak model's weights to our BERT structure. - """ - if not os.path.exists(checkpoint_path): - bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval() - else: - bart = load_xsum_checkpoint(checkpoint_path) - - bart.model.upgrade_state_dict(bart.model.state_dict()) - if hf_checkpoint_name is None: - hf_checkpoint_name = checkpoint_path.replace(".", "-") - config = BartConfig.from_pretrained(hf_checkpoint_name) - tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0) - tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0) - if not torch.eq(tokens, tokens2).all(): - raise ValueError( - f"converted tokenizer and pretrained tokenizer returned different output: {tokens} != {tokens2}" - ) - - if checkpoint_path == "bart.large.mnli": - state_dict = bart.state_dict() - remove_ignore_keys_(state_dict) - state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"] - for src, dest in mnli_rename_keys: - rename_key(state_dict, src, dest) - model = BartForSequenceClassification(config).eval() - model.load_state_dict(state_dict) - fairseq_output = bart.predict("mnli", tokens, return_logits=True) - new_model_outputs = model(tokens)[0] # logits - else: # no classification heads to worry about - state_dict = bart.model.state_dict() - remove_ignore_keys_(state_dict) - state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] - fairseq_output = bart.extract_features(tokens) - if hf_checkpoint_name == "facebook/bart-large": - model = BartModel(config).eval() - model.load_state_dict(state_dict) - new_model_outputs = model(tokens).model[0] - else: - model = BartForConditionalGeneration(config).eval() # an existing summarization ckpt - model.model.load_state_dict(state_dict) - if hasattr(model, "lm_head"): - model.lm_head = make_linear_from_emb(model.model.shared) - new_model_outputs = model.model(tokens)[0] - - # Check results - if fairseq_output.shape != new_model_outputs.shape: - raise ValueError( - f"`fairseq_output` shape and `new_model_output` shape are different: {fairseq_output.shape=}, {new_model_outputs.shape}" - ) - if (fairseq_output != new_model_outputs).any().item(): - raise 
ValueError("Some values in `fairseq_output` are different from `new_model_outputs`") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem." - ) - parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--hf_config", default=None, type=str, help="Which huggingface architecture to use: bart-large-xsum" - ) - args = parser.parse_args() - convert_bart_checkpoint(args.fairseq_path, args.pytorch_dump_folder_path, hf_checkpoint_name=args.hf_config) diff --git a/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py b/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py deleted file mode 100644 index 46c72a97f495..000000000000 --- a/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py +++ /dev/null @@ -1,373 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BEiT checkpoints from the unilm repository.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from datasets import load_dataset -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import ( - BeitConfig, - BeitForImageClassification, - BeitForMaskedImageModeling, - BeitForSemanticSegmentation, - BeitImageProcessor, -) -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, has_lm_head=False, is_semantic=False): - prefix = "backbone." 
if is_semantic else "" - - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"{prefix}blocks.{i}.norm1.weight", f"beit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"beit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.weight", f"beit.encoder.layer.{i}.attention.output.dense.weight") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.bias", f"beit.encoder.layer.{i}.attention.output.dense.bias") - ) - rename_keys.append((f"{prefix}blocks.{i}.norm2.weight", f"beit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"beit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.weight", f"beit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.bias", f"beit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"beit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"beit.encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - (f"{prefix}cls_token", "beit.embeddings.cls_token"), - (f"{prefix}patch_embed.proj.weight", "beit.embeddings.patch_embeddings.projection.weight"), - (f"{prefix}patch_embed.proj.bias", "beit.embeddings.patch_embeddings.projection.bias"), - ] - ) - - if has_lm_head: - # mask token + shared relative position bias + layernorm - rename_keys.extend( - [ - ("mask_token", "beit.embeddings.mask_token"), - ( - "rel_pos_bias.relative_position_bias_table", - "beit.encoder.relative_position_bias.relative_position_bias_table", - ), - ( - "rel_pos_bias.relative_position_index", - "beit.encoder.relative_position_bias.relative_position_index", - ), - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ] - ) - elif is_semantic: - # semantic segmentation classification heads - rename_keys.extend( - [ - ("decode_head.conv_seg.weight", "decode_head.classifier.weight"), - ("decode_head.conv_seg.bias", "decode_head.classifier.bias"), - ("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"), - ("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"), - ] - ) - else: - # layernorm + classification head - rename_keys.extend( - [ - ("fc_norm.weight", "beit.pooler.layernorm.weight"), - ("fc_norm.bias", "beit.pooler.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False): - for i in range(config.num_hidden_layers): - prefix = "backbone." 
if is_semantic else "" - # queries, keys and values - in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight") - q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias") - - state_dict[f"beit.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.query.bias"] = q_bias - state_dict[f"beit.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.value.bias"] = v_bias - - # gamma_1 and gamma_2 - # we call them lambda because otherwise they are renamed when using .from_pretrained - gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1") - gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2") - - state_dict[f"beit.encoder.layer.{i}.lambda_1"] = gamma_1 - state_dict[f"beit.encoder.layer.{i}.lambda_2"] = gamma_2 - - # relative_position bias table + index - if not has_lm_head: - # each layer has its own relative position bias - table = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_bias_table") - index = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_index") - - state_dict[ - f"beit.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table" - ] = table - state_dict[ - f"beit.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index" - ] = index - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our BEiT structure. 
- """ - - # define default BEiT configuration - config = BeitConfig() - has_lm_head = False - is_semantic = False - repo_id = "huggingface/label-files" - # set config parameters based on URL - if checkpoint_url[-9:-4] == "pt22k": - # masked image modeling - config.use_shared_relative_position_bias = True - config.use_mask_token = True - has_lm_head = True - elif checkpoint_url[-9:-4] == "ft22k": - # intermediate fine-tuning on ImageNet-22k - config.use_relative_position_bias = True - config.num_labels = 21841 - filename = "imagenet-22k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - # this dataset contains 21843 labels but the model only has 21841 - # we delete the classes as mentioned in https://github.com/google-research/big_transfer/issues/18 - del id2label[9205] - del id2label[15027] - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - elif checkpoint_url[-8:-4] == "to1k": - # fine-tuning on ImageNet-1k - config.use_relative_position_bias = True - config.num_labels = 1000 - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - if "384" in checkpoint_url: - config.image_size = 384 - if "512" in checkpoint_url: - config.image_size = 512 - elif "ade20k" in checkpoint_url: - # fine-tuning - config.use_relative_position_bias = True - config.num_labels = 150 - filename = "ade20k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.image_size = 640 - is_semantic = True - else: - raise ValueError("Checkpoint not supported, URL should either end with 'pt22k', 'ft22k', 'to1k' or 'ade20k'") - - # size of the architecture - if "base" in checkpoint_url: - pass - elif "large" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - if "ade20k" in checkpoint_url: - config.image_size = 640 - config.out_indices = [7, 11, 15, 23] - else: - raise ValueError("Should either find 'base' or 'large' in checkpoint URL") - - # load state_dict of original model, remove and rename some keys - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True) - state_dict = state_dict["model"] if "ade20k" not in checkpoint_url else state_dict["state_dict"] - - rename_keys = create_rename_keys(config, has_lm_head=has_lm_head, is_semantic=is_semantic) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, has_lm_head=has_lm_head, is_semantic=is_semantic) - if is_semantic: - # add prefix to decoder keys - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if key.startswith("backbone.fpn"): - key = key.replace("backbone.fpn", "fpn") - state_dict[key] = val - - # load HuggingFace model - if checkpoint_url[-9:-4] == "pt22k": - model = BeitForMaskedImageModeling(config) - elif "ade20k" in checkpoint_url: - model = BeitForSemanticSegmentation(config) - else: - model = BeitForImageClassification(config) - model.eval() - model.load_state_dict(state_dict) - - # Check outputs 
on an image - if is_semantic: - image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False) - ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True) - image = Image.open(ds[0]["file"]) - else: - image_processor = BeitImageProcessor( - size=config.image_size, resample=PILImageResampling.BILINEAR, do_center_crop=False - ) - image = prepare_img() - - encoding = image_processor(images=image, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - outputs = model(pixel_values) - logits = outputs.logits - - # verify logits - expected_shape = torch.Size([1, 1000]) - if checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k"): - expected_shape = torch.Size([1, 196, 8192]) - elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k"): - expected_shape = torch.Size([1, 196, 8192]) - elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft22k"): - expected_shape = torch.Size([1, 21841]) - expected_logits = torch.tensor([2.2288, 2.4671, 0.7395]) - expected_class_idx = 2397 - elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft22k"): - expected_shape = torch.Size([1, 21841]) - expected_logits = torch.tensor([1.6881, -0.2787, 0.5901]) - expected_class_idx = 2396 - elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft1k"): - expected_logits = torch.tensor([0.1241, 0.0798, -0.6569]) - expected_class_idx = 285 - elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft22kto1k"): - expected_logits = torch.tensor([-1.2385, -1.0987, -1.0108]) - expected_class_idx = 281 - elif checkpoint_url[:-4].endswith("beit_base_patch16_384_pt22k_ft22kto1k"): - expected_logits = torch.tensor([-1.5303, -0.9484, -0.3147]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft1k"): - expected_logits = torch.tensor([0.4610, -0.0928, 0.2086]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft22kto1k"): - expected_logits = torch.tensor([-0.4804, 0.6257, -0.1837]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_large_patch16_384_pt22k_ft22kto1k"): - expected_logits = torch.tensor([[-0.5122, 0.5117, -0.2113]]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_large_patch16_512_pt22k_ft22kto1k"): - expected_logits = torch.tensor([-0.3062, 0.7261, 0.4852]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_base_patch16_640_pt22k_ft22ktoade20k"): - expected_shape = (1, 150, 160, 160) - expected_logits = torch.tensor( - [ - [[-4.9225, -2.3954, -3.0522], [-2.8822, -1.0046, -1.7561], [-2.9549, -1.3228, -2.1347]], - [[-5.8168, -3.4129, -4.0778], [-3.8651, -2.2214, -3.0277], [-3.8356, -2.4643, -3.3535]], - [[-0.0078, 3.9952, 4.0754], [2.9856, 4.6944, 5.0035], [3.2413, 4.7813, 4.9969]], - ] - ) - elif checkpoint_url[:-4].endswith("beit_large_patch16_640_pt22k_ft22ktoade20k"): - expected_shape = (1, 150, 160, 160) - expected_logits = torch.tensor( - [ - [[-4.3305, -2.3049, -3.0161], [-2.9591, -1.5305, -2.2251], [-3.4198, -1.8004, -2.9062]], - [[-5.8922, -3.7435, -4.3978], [-4.2063, -2.7872, -3.4755], [-4.2791, -3.1874, -4.1681]], - [[0.9895, 4.3467, 4.7663], [4.2476, 5.6830, 6.1518], [4.5550, 6.2495, 6.5154]], - ] - ) - else: - raise ValueError("Can't verify logits as model is not supported") - - if logits.shape != expected_shape: - raise ValueError(f"Shape of logits not as expected. 
{logits.shape=}, {expected_shape=}") - if not has_lm_head: - if is_semantic: - if not torch.allclose(logits[0, :3, :3, :3], expected_logits, atol=1e-3): - raise ValueError("First elements of logits not as expected") - else: - print("Predicted class idx:", logits.argmax(-1).item()) - - if not torch.allclose(logits[0, :3], expected_logits, atol=1e-3): - raise ValueError("First elements of logits not as expected") - if logits.argmax(-1).item() != expected_class_idx: - raise ValueError("Predicted class index not as expected") - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_url", - default="https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth", - type=str, - help="URL to the original PyTorch checkpoint (.pth file).", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - args = parser.parse_args() - convert_beit_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py deleted file mode 100644 index 9dfd8da474e3..000000000000 --- a/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script can be used to convert a head-less TF2.x Bert model to PyTorch, as published on the official (now -deprecated) GitHub: https://github.com/tensorflow/models/tree/v2.3.0/official/nlp/bert - -TF2.x uses different variable names from the original BERT (TF 1.4) implementation. The script re-maps the TF2.x Bert -weight names to the original names, so the model can be imported with Huggingface/transformer. - -You may adapt this script to include classification/MLM/NSP/etc. heads. - -Note: This script is only working with an older version of the TensorFlow models repository (<= v2.3.0). - Models trained with never versions are not compatible with this script. 
-""" - -import argparse -import os -import re - -import tensorflow as tf -import torch - -from transformers import BertConfig, BertModel -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def load_tf2_weights_in_bert(model, tf_checkpoint_path, config): - tf_path = os.path.abspath(tf_checkpoint_path) - logger.info(f"Converting TensorFlow checkpoint from {tf_path}") - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - arrays = [] - layer_depth = [] - for full_name, shape in init_vars: - # logger.info(f"Loading TF weight {name} with shape {shape}") - name = full_name.split("/") - if full_name == "_CHECKPOINTABLE_OBJECT_GRAPH" or name[0] in ["global_step", "save_counter"]: - logger.info(f"Skipping non-model layer {full_name}") - continue - if "optimizer" in full_name: - logger.info(f"Skipping optimization layer {full_name}") - continue - if name[0] == "model": - # ignore initial 'model' - name = name[1:] - # figure out how many levels deep the name is - depth = 0 - for _name in name: - if _name.startswith("layer_with_weights"): - depth += 1 - else: - break - layer_depth.append(depth) - # read data - array = tf.train.load_variable(tf_path, full_name) - names.append("/".join(name)) - arrays.append(array) - logger.info(f"Read a total of {len(arrays):,} layers") - - # Sanity check - if len(set(layer_depth)) != 1: - raise ValueError(f"Found layer names with different depths (layer depth {list(set(layer_depth))})") - layer_depth = list(set(layer_depth))[0] - if layer_depth != 1: - raise ValueError( - "The model contains more than just the embedding/encoder layers. This script does not handle MLM/NSP" - " heads." - ) - - # convert layers - logger.info("Converting weights...") - for full_name, array in zip(names, arrays): - name = full_name.split("/") - pointer = model - trace = [] - for i, m_name in enumerate(name): - if m_name == ".ATTRIBUTES": - # variable names end with .ATTRIBUTES/VARIABLE_VALUE - break - if m_name.startswith("layer_with_weights"): - layer_num = int(m_name.split("-")[-1]) - if layer_num <= 2: - # embedding layers - # layer_num 0: word_embeddings - # layer_num 1: position_embeddings - # layer_num 2: token_type_embeddings - continue - elif layer_num == 3: - # embedding LayerNorm - trace.extend(["embeddings", "LayerNorm"]) - pointer = getattr(pointer, "embeddings") - pointer = getattr(pointer, "LayerNorm") - elif layer_num > 3 and layer_num < config.num_hidden_layers + 4: - # encoder layers - trace.extend(["encoder", "layer", str(layer_num - 4)]) - pointer = getattr(pointer, "encoder") - pointer = getattr(pointer, "layer") - pointer = pointer[layer_num - 4] - elif layer_num == config.num_hidden_layers + 4: - # pooler layer - trace.extend(["pooler", "dense"]) - pointer = getattr(pointer, "pooler") - pointer = getattr(pointer, "dense") - elif m_name == "embeddings": - trace.append("embeddings") - pointer = getattr(pointer, "embeddings") - if layer_num == 0: - trace.append("word_embeddings") - pointer = getattr(pointer, "word_embeddings") - elif layer_num == 1: - trace.append("position_embeddings") - pointer = getattr(pointer, "position_embeddings") - elif layer_num == 2: - trace.append("token_type_embeddings") - pointer = getattr(pointer, "token_type_embeddings") - else: - raise ValueError(f"Unknown embedding layer with name {full_name}") - trace.append("weight") - pointer = getattr(pointer, "weight") - elif m_name == "_attention_layer": - # self-attention layer - 
trace.extend(["attention", "self"]) - pointer = getattr(pointer, "attention") - pointer = getattr(pointer, "self") - elif m_name == "_attention_layer_norm": - # output attention norm - trace.extend(["attention", "output", "LayerNorm"]) - pointer = getattr(pointer, "attention") - pointer = getattr(pointer, "output") - pointer = getattr(pointer, "LayerNorm") - elif m_name == "_attention_output_dense": - # output attention dense - trace.extend(["attention", "output", "dense"]) - pointer = getattr(pointer, "attention") - pointer = getattr(pointer, "output") - pointer = getattr(pointer, "dense") - elif m_name == "_output_dense": - # output dense - trace.extend(["output", "dense"]) - pointer = getattr(pointer, "output") - pointer = getattr(pointer, "dense") - elif m_name == "_output_layer_norm": - # output dense - trace.extend(["output", "LayerNorm"]) - pointer = getattr(pointer, "output") - pointer = getattr(pointer, "LayerNorm") - elif m_name == "_key_dense": - # attention key - trace.append("key") - pointer = getattr(pointer, "key") - elif m_name == "_query_dense": - # attention query - trace.append("query") - pointer = getattr(pointer, "query") - elif m_name == "_value_dense": - # attention value - trace.append("value") - pointer = getattr(pointer, "value") - elif m_name == "_intermediate_dense": - # attention intermediate dense - trace.extend(["intermediate", "dense"]) - pointer = getattr(pointer, "intermediate") - pointer = getattr(pointer, "dense") - elif m_name == "_output_layer_norm": - # output layer norm - trace.append("output") - pointer = getattr(pointer, "output") - # weights & biases - elif m_name in ["bias", "beta"]: - trace.append("bias") - pointer = getattr(pointer, "bias") - elif m_name in ["kernel", "gamma"]: - trace.append("weight") - pointer = getattr(pointer, "weight") - else: - logger.warning(f"Ignored {m_name}") - # for certain layers reshape is necessary - trace = ".".join(trace) - if re.match(r"(\S+)\.attention\.self\.(key|value|query)\.(bias|weight)", trace) or re.match( - r"(\S+)\.attention\.output\.dense\.weight", trace - ): - array = array.reshape(pointer.data.shape) - if "kernel" in full_name: - array = array.transpose() - if pointer.shape == array.shape: - pointer.data = torch.from_numpy(array) - else: - raise ValueError( - f"Shape mismatch in layer {full_name}: Model expects shape {pointer.shape} but layer contains shape:" - f" {array.shape}" - ) - logger.info(f"Successfully set variable {full_name} to PyTorch layer {trace}") - return model - - -def convert_tf2_checkpoint_to_pytorch(tf_checkpoint_path, config_path, pytorch_dump_path): - # Instantiate model - logger.info(f"Loading model based on config from {config_path}...") - config = BertConfig.from_json_file(config_path) - model = BertModel(config) - - # Load weights from checkpoint - logger.info(f"Loading weights from checkpoint {tf_checkpoint_path}...") - load_tf2_weights_in_bert(model, tf_checkpoint_path, config) - - # Save pytorch-model - logger.info(f"Saving PyTorch model to {pytorch_dump_path}...") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tf_checkpoint_path", type=str, required=True, help="Path to the TensorFlow 2.x checkpoint path." - ) - parser.add_argument( - "--bert_config_file", - type=str, - required=True, - help="The config json file corresponding to the BERT model. 
This specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", - type=str, - required=True, - help="Path to the output PyTorch model (must include filename).", - ) - args = parser.parse_args() - convert_tf2_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index be904ddd7e6c..000000000000 --- a/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BERT checkpoint.""" - -import argparse - -import torch - -from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): - # Initialise PyTorch model - config = BertConfig.from_json_file(bert_config_file) - print(f"Building PyTorch model from configuration: {config}") - model = BertForPreTraining(config) - - # Load weights from tf checkpoint - load_tf_weights_in_bert(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--bert_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained BERT model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py b/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py deleted file mode 100644 index f7cb149053a3..000000000000 --- a/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py +++ /dev/null @@ -1,112 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" - -import argparse -import os - -import numpy as np -import tensorflow as tf -import torch - -from transformers import BertModel - - -def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): - """ - Args: - model: BertModel Pytorch model instance to be converted - ckpt_dir: Tensorflow model directory - model_name: model name - - Currently supported HF models: - - - Y BertModel - - N BertForMaskedLM - - N BertForPreTraining - - N BertForMultipleChoice - - N BertForNextSentencePrediction - - N BertForSequenceClassification - - N BertForQuestionAnswering - """ - - tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") - - var_map = ( - ("layer.", "layer_"), - ("word_embeddings.weight", "word_embeddings"), - ("position_embeddings.weight", "position_embeddings"), - ("token_type_embeddings.weight", "token_type_embeddings"), - (".", "/"), - ("LayerNorm/weight", "LayerNorm/gamma"), - ("LayerNorm/bias", "LayerNorm/beta"), - ("weight", "kernel"), - ) - - if not os.path.isdir(ckpt_dir): - os.makedirs(ckpt_dir) - - state_dict = model.state_dict() - - def to_tf_var_name(name: str): - for patt, repl in iter(var_map): - name = name.replace(patt, repl) - return f"bert/{name}" - - def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): - tf_dtype = tf.dtypes.as_dtype(tensor.dtype) - tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) - session.run(tf.variables_initializer([tf_var])) - session.run(tf_var) - return tf_var - - tf.reset_default_graph() - with tf.Session() as session: - for var_name in state_dict: - tf_name = to_tf_var_name(var_name) - torch_tensor = state_dict[var_name].numpy() - if any(x in var_name for x in tensors_to_transpose): - torch_tensor = torch_tensor.T - tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) - tf_var.assign(tf.cast(torch_tensor, tf_var.dtype)) - tf_weight = session.run(tf_var) - print(f"Successfully created {tf_name}: {np.allclose(tf_weight, torch_tensor)}") - - saver = tf.train.Saver(tf.trainable_variables()) - saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) - - -def main(raw_args=None): - parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str, required=True, help="model name e.g. 
google-bert/bert-base-uncased") - parser.add_argument( - "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model" - ) - parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/.bin") - parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model") - args = parser.parse_args(raw_args) - - model = BertModel.from_pretrained( - pretrained_model_name_or_path=args.model_name, - state_dict=torch.load(args.pytorch_model_path), - cache_dir=args.cache_dir, - ) - - convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py deleted file mode 100644 index cba1e1a2c3f7..000000000000 --- a/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script converts a lm-head checkpoint from the "Token Dropping" implementation into a PyTorch-compatible BERT -model. 
The official implementation of "Token Dropping" can be found in the TensorFlow Models repository: - -https://github.com/tensorflow/models/tree/master/official/projects/token_dropping -""" - -import argparse - -import tensorflow as tf -import torch - -from transformers import BertConfig, BertForMaskedLM -from transformers.models.bert.modeling_bert import ( - BertIntermediate, - BertLayer, - BertOutput, - BertPooler, - BertSelfAttention, - BertSelfOutput, -) -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_checkpoint_to_pytorch(tf_checkpoint_path: str, config_path: str, pytorch_dump_path: str): - def get_masked_lm_array(name: str): - full_name = f"masked_lm/{name}/.ATTRIBUTES/VARIABLE_VALUE" - array = tf.train.load_variable(tf_checkpoint_path, full_name) - - if "kernel" in name: - array = array.transpose() - - return torch.from_numpy(array) - - def get_encoder_array(name: str): - full_name = f"encoder/{name}/.ATTRIBUTES/VARIABLE_VALUE" - array = tf.train.load_variable(tf_checkpoint_path, full_name) - - if "kernel" in name: - array = array.transpose() - - return torch.from_numpy(array) - - def get_encoder_layer_array(layer_index: int, name: str): - full_name = f"encoder/_transformer_layers/{layer_index}/{name}/.ATTRIBUTES/VARIABLE_VALUE" - array = tf.train.load_variable(tf_checkpoint_path, full_name) - - if "kernel" in name: - array = array.transpose() - - return torch.from_numpy(array) - - def get_encoder_attention_layer_array(layer_index: int, name: str, orginal_shape): - full_name = f"encoder/_transformer_layers/{layer_index}/_attention_layer/{name}/.ATTRIBUTES/VARIABLE_VALUE" - array = tf.train.load_variable(tf_checkpoint_path, full_name) - array = array.reshape(orginal_shape) - - if "kernel" in name: - array = array.transpose() - - return torch.from_numpy(array) - - print(f"Loading model based on config from {config_path}...") - config = BertConfig.from_json_file(config_path) - model = BertForMaskedLM(config) - - # Layers - for layer_index in range(0, config.num_hidden_layers): - layer: BertLayer = model.bert.encoder.layer[layer_index] - - # Self-attention - self_attn: BertSelfAttention = layer.attention.self - - self_attn.query.weight.data = get_encoder_attention_layer_array( - layer_index, "_query_dense/kernel", self_attn.query.weight.data.shape - ) - self_attn.query.bias.data = get_encoder_attention_layer_array( - layer_index, "_query_dense/bias", self_attn.query.bias.data.shape - ) - self_attn.key.weight.data = get_encoder_attention_layer_array( - layer_index, "_key_dense/kernel", self_attn.key.weight.data.shape - ) - self_attn.key.bias.data = get_encoder_attention_layer_array( - layer_index, "_key_dense/bias", self_attn.key.bias.data.shape - ) - self_attn.value.weight.data = get_encoder_attention_layer_array( - layer_index, "_value_dense/kernel", self_attn.value.weight.data.shape - ) - self_attn.value.bias.data = get_encoder_attention_layer_array( - layer_index, "_value_dense/bias", self_attn.value.bias.data.shape - ) - - # Self-attention Output - self_output: BertSelfOutput = layer.attention.output - - self_output.dense.weight.data = get_encoder_attention_layer_array( - layer_index, "_output_dense/kernel", self_output.dense.weight.data.shape - ) - self_output.dense.bias.data = get_encoder_attention_layer_array( - layer_index, "_output_dense/bias", self_output.dense.bias.data.shape - ) - - self_output.LayerNorm.weight.data = get_encoder_layer_array(layer_index, "_attention_layer_norm/gamma") - self_output.LayerNorm.bias.data = 
get_encoder_layer_array(layer_index, "_attention_layer_norm/beta") - - # Intermediate - intermediate: BertIntermediate = layer.intermediate - - intermediate.dense.weight.data = get_encoder_layer_array(layer_index, "_intermediate_dense/kernel") - intermediate.dense.bias.data = get_encoder_layer_array(layer_index, "_intermediate_dense/bias") - - # Output - bert_output: BertOutput = layer.output - - bert_output.dense.weight.data = get_encoder_layer_array(layer_index, "_output_dense/kernel") - bert_output.dense.bias.data = get_encoder_layer_array(layer_index, "_output_dense/bias") - - bert_output.LayerNorm.weight.data = get_encoder_layer_array(layer_index, "_output_layer_norm/gamma") - bert_output.LayerNorm.bias.data = get_encoder_layer_array(layer_index, "_output_layer_norm/beta") - - # Embeddings - model.bert.embeddings.position_embeddings.weight.data = get_encoder_array("_position_embedding_layer/embeddings") - model.bert.embeddings.token_type_embeddings.weight.data = get_encoder_array("_type_embedding_layer/embeddings") - model.bert.embeddings.LayerNorm.weight.data = get_encoder_array("_embedding_norm_layer/gamma") - model.bert.embeddings.LayerNorm.bias.data = get_encoder_array("_embedding_norm_layer/beta") - - # LM Head - lm_head = model.cls.predictions.transform - - lm_head.dense.weight.data = get_masked_lm_array("dense/kernel") - lm_head.dense.bias.data = get_masked_lm_array("dense/bias") - - lm_head.LayerNorm.weight.data = get_masked_lm_array("layer_norm/gamma") - lm_head.LayerNorm.bias.data = get_masked_lm_array("layer_norm/beta") - - model.bert.embeddings.word_embeddings.weight.data = get_masked_lm_array("embedding_table") - - # Pooling - model.bert.pooler = BertPooler(config=config) - model.bert.pooler.dense.weight.data: BertPooler = get_encoder_array("_pooler_layer/kernel") - model.bert.pooler.dense.bias.data: BertPooler = get_encoder_array("_pooler_layer/bias") - - # Export final model - model.save_pretrained(pytorch_dump_path) - - # Integration test - should load without any errors ;) - new_model = BertForMaskedLM.from_pretrained(pytorch_dump_path) - print(new_model.eval()) - - print("Model conversion was done sucessfully!") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tf_checkpoint_path", type=str, required=True, help="Path to the TensorFlow Token Dropping checkpoint path." - ) - parser.add_argument( - "--bert_config_file", - type=str, - required=True, - help="The config json file corresponding to the BERT model. This specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", - type=str, - required=True, - help="Path to the output PyTorch model.", - ) - args = parser.parse_args() - convert_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index 0b8e6590f937..000000000000 --- a/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,69 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BigBird checkpoint.""" - -import argparse - -from transformers import BigBirdConfig, BigBirdForPreTraining, BigBirdForQuestionAnswering, load_tf_weights_in_big_bird -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, big_bird_config_file, pytorch_dump_path, is_trivia_qa): - # Initialise PyTorch model - config = BigBirdConfig.from_json_file(big_bird_config_file) - print(f"Building PyTorch model from configuration: {config}") - - if is_trivia_qa: - model = BigBirdForQuestionAnswering(config) - else: - model = BigBirdForPreTraining(config) - - # Load weights from tf checkpoint - load_tf_weights_in_big_bird(model, tf_checkpoint_path, is_trivia_qa=is_trivia_qa) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - model.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--big_bird_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained BERT model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--is_trivia_qa", action="store_true", help="Whether to convert a model with a trivia_qa head." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch( - args.tf_checkpoint_path, args.big_bird_config_file, args.pytorch_dump_path, args.is_trivia_qa - ) diff --git a/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py b/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py deleted file mode 100644 index e17369e48041..000000000000 --- a/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py +++ /dev/null @@ -1,170 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
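# Illustrative sketch (added annotation, not part of the original script): the conversion
# below renames TF variables by applying ordered substring replacements (INIT_COMMON,
# DECODER_PATTERNS, REMAINING_PATTERNS, END_COMMON). A minimal, self-contained example of
# that idea; the input key is made up for illustration, not taken from a real checkpoint.
patterns = [("/", "."), ("layer_", "layers."), ("kernel", "weight"), ("pegasus", "model")]

key = "pegasus/decoder/layer_0/self_attention/query/kernel"
for tf_name, hf_name in patterns:
    key = key.replace(tf_name, hf_name)
print(key)  # model.decoder.layers.0.self_attention.query.weight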
- -import argparse -from typing import Dict - -import tensorflow as tf -import torch -from tqdm import tqdm - -from transformers import BigBirdPegasusConfig, BigBirdPegasusForConditionalGeneration - - -INIT_COMMON = [ - # tf -> hf - ("/", "."), - ("layer_", "layers."), - ("kernel", "weight"), - ("beta", "bias"), - ("gamma", "weight"), - ("pegasus", "model"), -] -END_COMMON = [ - (".output.dense", ".fc2"), - ("intermediate.LayerNorm", "final_layer_norm"), - ("intermediate.dense", "fc1"), -] - -DECODER_PATTERNS = ( - INIT_COMMON - + [ - ("attention.self.LayerNorm", "self_attn_layer_norm"), - ("attention.output.dense", "self_attn.out_proj"), - ("attention.self", "self_attn"), - ("attention.encdec.LayerNorm", "encoder_attn_layer_norm"), - ("attention.encdec_output.dense", "encoder_attn.out_proj"), - ("attention.encdec", "encoder_attn"), - ("key", "k_proj"), - ("value", "v_proj"), - ("query", "q_proj"), - ("decoder.LayerNorm", "decoder.layernorm_embedding"), - ] - + END_COMMON -) - -REMAINING_PATTERNS = ( - INIT_COMMON - + [ - ("embeddings.word_embeddings", "shared.weight"), - ("embeddings.position_embeddings", "embed_positions.weight"), - ("attention.self.LayerNorm", "self_attn_layer_norm"), - ("attention.output.dense", "self_attn.output"), - ("attention.self", "self_attn.self"), - ("encoder.LayerNorm", "encoder.layernorm_embedding"), - ] - + END_COMMON -) - -KEYS_TO_IGNORE = [ - "encdec/key/bias", - "encdec/query/bias", - "encdec/value/bias", - "self/key/bias", - "self/query/bias", - "self/value/bias", - "encdec_output/dense/bias", - "attention/output/dense/bias", -] - - -def rename_state_dict_key(k, patterns): - for tf_name, hf_name in patterns: - k = k.replace(tf_name, hf_name) - return k - - -def convert_bigbird_pegasus(tf_weights: dict, config_update: dict) -> BigBirdPegasusForConditionalGeneration: - cfg = BigBirdPegasusConfig(**config_update) - torch_model = BigBirdPegasusForConditionalGeneration(cfg) - state_dict = torch_model.state_dict() - mapping = {} - - # separating decoder weights - decoder_weights = {k: tf_weights[k] for k in tf_weights if k.startswith("pegasus/decoder")} - remaining_weights = {k: tf_weights[k] for k in tf_weights if not k.startswith("pegasus/decoder")} - - for k, v in tqdm(decoder_weights.items(), "tf -> hf conversion"): - conditions = [k.endswith(ending) for ending in KEYS_TO_IGNORE] - if any(conditions): - continue - patterns = DECODER_PATTERNS - new_k = rename_state_dict_key(k, patterns) - if new_k not in state_dict: - raise ValueError(f"could not find new key {new_k} in state dict. (converted from {k})") - if any(True if i in k else False for i in ["dense", "query", "key", "value"]): - v = v.T - mapping[new_k] = torch.from_numpy(v) - assert v.shape == state_dict[new_k].shape, f"{new_k}, {k}, {v.shape}, {state_dict[new_k].shape}" - - for k, v in tqdm(remaining_weights.items(), "tf -> hf conversion"): - conditions = [k.endswith(ending) for ending in KEYS_TO_IGNORE] - if any(conditions): - continue - patterns = REMAINING_PATTERNS - new_k = rename_state_dict_key(k, patterns) - if new_k not in state_dict and k != "pegasus/embeddings/position_embeddings": - raise ValueError(f"could not find new key {new_k} in state dict. 
(converted from {k})") - if any(True if i in k else False for i in ["dense", "query", "key", "value"]): - v = v.T - mapping[new_k] = torch.from_numpy(v) - if k != "pegasus/embeddings/position_embeddings": - assert v.shape == state_dict[new_k].shape, f"{new_k}, {k}, {v.shape}, {state_dict[new_k].shape}" - - mapping["model.encoder.embed_positions.weight"] = mapping["model.embed_positions.weight"] - mapping["model.decoder.embed_positions.weight"] = mapping.pop("model.embed_positions.weight") - missing, extra = torch_model.load_state_dict(mapping, strict=False) - unexpected_missing = [ - k - for k in missing - if k - not in [ - "final_logits_bias", - "model.encoder.embed_tokens.weight", - "model.decoder.embed_tokens.weight", - "lm_head.weight", - ] - ] - assert unexpected_missing == [], f"no matches found for the following torch keys {unexpected_missing}" - assert extra == [], f"no matches found for the following tf keys {extra}" - return torch_model - - -def get_tf_weights_as_numpy(path) -> Dict: - init_vars = tf.train.list_variables(path) - tf_weights = {} - ignore_name = ["global_step"] - for name, shape in tqdm(init_vars, desc="converting tf checkpoint to dict"): - skip_key = any(pat in name for pat in ignore_name) - if skip_key: - continue - array = tf.train.load_variable(path, name) - tf_weights[name] = array - return tf_weights - - -def convert_bigbird_pegasus_ckpt_to_pytorch(ckpt_path: str, save_dir: str, config_update: dict): - tf_weights = get_tf_weights_as_numpy(ckpt_path) - torch_model = convert_bigbird_pegasus(tf_weights, config_update) - torch_model.save_pretrained(save_dir) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--tf_ckpt_path", type=str, help="passed to tf.train.list_variables") - parser.add_argument("--save_dir", default=None, type=str, help="Path to the output PyTorch model.") - args = parser.parse_args() - config_update = {} - convert_bigbird_pegasus_ckpt_to_pytorch(args.tf_ckpt_path, args.save_dir, config_update=config_update) diff --git a/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100755 index c930a850462c..000000000000 --- a/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,292 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
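# Illustrative sketch (added annotation, not part of the original script): the conversion
# below rewrites the fairseq BPE vocabulary into an HF-style vocab.json. fairseq marks
# word-internal pieces with a trailing "@@"; rewrite_dict_keys strips that marker and
# appends an end-of-word marker (assumed here to be "</w>") to the remaining pieces.
import re

fairseq_vocab = {"le@@": 5, "tt@@": 6, "er": 7}
hf_vocab = {
    re.sub(r"@@$", "", piece) if piece.endswith("@@") else piece + "</w>": idx
    for piece, idx in fairseq_vocab.items()
}
print(hf_vocab)  # {'le': 5, 'tt': 6, 'er</w>': 7}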
-
-
-import argparse
-import json
-import os
-import re
-import shutil
-
-import torch
-
-from transformers import BioGptConfig, BioGptForCausalLM
-from transformers.models.biogpt.tokenization_biogpt import VOCAB_FILES_NAMES
-from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
-from transformers.utils import WEIGHTS_NAME, logging
-
-
-logging.set_verbosity_warning()
-
-json_indent = 2
-
-
-# modified from https://github.com/facebookresearch/fairseq/blob/dd74992d0d143155998e9ed4076826bcea80fb06/fairseq/data/dictionary.py#L18
-class Dictionary:
-    """A mapping from symbols to consecutive integers"""
-
-    def __init__(
-        self,
-        *,  # begin keyword-only arguments
-        bos="<s>",
-        pad="<pad>",
-        eos="</s>",
-        unk="<unk>",
-        extra_special_symbols=None,
-    ):
-        self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos
-        self.symbols = []
-        self.count = []
-        self.indices = {}
-        self.bos_index = self.add_symbol(bos)
-        self.pad_index = self.add_symbol(pad)
-        self.eos_index = self.add_symbol(eos)
-        self.unk_index = self.add_symbol(unk)
-        if extra_special_symbols:
-            for s in extra_special_symbols:
-                self.add_symbol(s)
-        self.nspecial = len(self.symbols)
-
-    def __eq__(self, other):
-        return self.indices == other.indices
-
-    def __getitem__(self, idx):
-        if idx < len(self.symbols):
-            return self.symbols[idx]
-        return self.unk_word
-
-    def __len__(self):
-        """Returns the number of symbols in the dictionary"""
-        return len(self.symbols)
-
-    def __contains__(self, sym):
-        return sym in self.indices
-
-    @classmethod
-    def load(cls, f):
-        """Loads the dictionary from a text file with the format:
-
-        ```
-        <symbol0> <count0>
-        <symbol1> <count1>
-        ...
-        ```
-        """
-        d = cls()
-        d.add_from_file(f)
-        return d
-
-    def add_symbol(self, word, n=1, overwrite=False):
-        """Adds a word to the dictionary"""
-        if word in self.indices and not overwrite:
-            idx = self.indices[word]
-            self.count[idx] = self.count[idx] + n
-            return idx
-        else:
-            idx = len(self.symbols)
-            self.indices[word] = idx
-            self.symbols.append(word)
-            self.count.append(n)
-            return idx
-
-    def _load_meta(self, lines):
-        return 0
-
-    def add_from_file(self, f):
-        """
-        Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
-        """
-        if isinstance(f, str):
-            try:
-                with open(f, "r", encoding="utf-8") as fd:
-                    self.add_from_file(fd)
-            except FileNotFoundError as fnfe:
-                raise fnfe
-            except UnicodeError:
-                raise Exception("Incorrect encoding detected in {}, please rebuild the dataset".format(f))
-            return
-
-        lines = f.readlines()
-        indices_start_line = self._load_meta(lines)
-
-        for line in lines[indices_start_line:]:
-            try:
-                line, field = line.rstrip().rsplit(" ", 1)
-                if field == "#fairseq:overwrite":
-                    overwrite = True
-                    line, field = line.rsplit(" ", 1)
-                else:
-                    overwrite = False
-                count = int(field)
-                word = line
-                if word in self and not overwrite:
-                    raise RuntimeError(
-                        "Duplicate word found when loading Dictionary: '{}'. "
-                        "Duplicate words can overwrite earlier ones by adding the "
-                        "#fairseq:overwrite flag at the end of the corresponding row "
-                        "in the dictionary file. If using the Camembert model, please "
-                        "download an updated copy of the model file.".format(word)
-                    )
-                self.add_symbol(word, n=count, overwrite=overwrite)
-            except ValueError:
-                raise ValueError("Incorrect dictionary format, expected '<token> <cnt> [flags]'")
-
-
-def rewrite_dict_keys(d):
-    # (1) remove word breaking symbol, (2) add word ending symbol where the word is not broken up,
-    # e.g.: d = {'le@@': 5, 'tt@@': 6, 'er': 7} => {'le': 5, 'tt': 6, 'er</w>': 7}
-    d2 = dict((re.sub(r"@@$", "", k), v) if k.endswith("@@") else (re.sub(r"$", "</w>", k), v) for k, v in d.items())
-    keep_keys = "<s> <pad> </s> <unk>".split()
-    # restore the special tokens
-    for k in keep_keys:
-        del d2[f"{k}</w>"]
-        d2[k] = d[k]  # restore
-    return d2
-
-
-def convert_biogpt_checkpoint_to_pytorch(biogpt_checkpoint_path, pytorch_dump_folder_path):
-    # prep
-    if not os.path.exists(biogpt_checkpoint_path):
-        raise ValueError(f"path {biogpt_checkpoint_path} does not exist!")
-    os.makedirs(pytorch_dump_folder_path, exist_ok=True)
-    print(f"Writing results to {pytorch_dump_folder_path}")
-
-    # handle various types of models
-
-    checkpoint_file = os.path.join(biogpt_checkpoint_path, "checkpoint.pt")
-    if not os.path.isfile(checkpoint_file):
-        raise ValueError(f"path to the file {checkpoint_file} does not exist!")
-    chkpt = torch.load(checkpoint_file, map_location="cpu")
-
-    args = chkpt["cfg"]["model"]
-
-    # dicts
-    dict_file = os.path.join(biogpt_checkpoint_path, "dict.txt")
-    if not os.path.isfile(dict_file):
-        raise ValueError(f"path to the file {dict_file} does not exist!")
-    src_dict = Dictionary.load(dict_file)
-    src_vocab = rewrite_dict_keys(src_dict.indices)
-    src_vocab_size = len(src_vocab)
-    src_vocab_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["vocab_file"])
-    print(f"Generating {src_vocab_file} of {src_vocab_size} records")
-    with open(src_vocab_file, "w", encoding="utf-8") as f:
-        f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))
-
-    # merges_file (bpecodes)
-    bpecodes_file = os.path.join(biogpt_checkpoint_path, "bpecodes")
-    if not os.path.isfile(bpecodes_file):
-        raise ValueError(f"path to the file {bpecodes_file} does not exist!")
-
-    merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"])
-    shutil.copyfile(bpecodes_file, merges_file)
-
-    # model config
-    biogpt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json")
-
-    model_conf = {
-        "activation_dropout": args["activation_dropout"],
-        "architectures": ["BioGptForCausalLM"],
-        "attention_probs_dropout_prob": args["attention_dropout"],
-        "bos_token_id": 0,
-        "eos_token_id": 2,
-        "hidden_act": args["activation_fn"],
-        "hidden_dropout_prob": args["dropout"],
-        "hidden_size": args["decoder_embed_dim"],
-        "initializer_range": 0.02,
-        "intermediate_size": args["decoder_ffn_embed_dim"],
-        "layer_norm_eps": 1e-12,
-        "layerdrop": args["decoder_layerdrop"],
-        "max_position_embeddings": args["max_target_positions"],
-        "model_type": "biogpt",
-        "num_attention_heads": args["decoder_attention_heads"],
-        "num_hidden_layers": args["decoder_layers"],
-        "pad_token_id": 1,
-        "scale_embedding": not args["no_scale_embedding"],
-        "tie_word_embeddings": args["share_decoder_input_output_embed"],
-        "vocab_size": src_vocab_size,
-    }
-
-    # good hparam defaults to start with
-
-    print(f"Generating {biogpt_model_config_file}")
-    with open(biogpt_model_config_file, "w", encoding="utf-8") as f:
-        f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))
-
-    # tokenizer config
-    biogpt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE)
-
-    tokenizer_conf = {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "model_max_length": 1024,
-        "pad_token": "<pad>",
-        "special_tokens_map_file": None,
-        "tokenizer_class": "BioGptTokenizer",
-        "unk_token": "<unk>",
-    }
-
-    print(f"Generating {biogpt_tokenizer_config_file}")
-    with open(biogpt_tokenizer_config_file, "w", encoding="utf-8") as f:
-        f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent))
-
-    # model
-    model_state_dict = chkpt["model"]
-
-    # remove unneeded keys
-    ignore_keys = [
-        "decoder.version",
-    ]
-    for k in ignore_keys:
-        model_state_dict.pop(k, None)
-
-    layer_names = list(model_state_dict.keys())
-    for layer_name in layer_names:
-        if layer_name.endswith("output_projection.weight"):
-            model_state_dict[layer_name.replace("decoder.", "")] = model_state_dict.pop(layer_name)
-        else:
-            model_state_dict[layer_name.replace("decoder", "biogpt")] = model_state_dict.pop(layer_name)
-
-    config = BioGptConfig.from_pretrained(pytorch_dump_folder_path)
-    model_new = BioGptForCausalLM(config)
-
-    # check that it loads ok
-    model_new.load_state_dict(model_state_dict)
-
-    # save
-    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
-    print(f"Generating {pytorch_weights_dump_path}")
-    torch.save(model_state_dict, pytorch_weights_dump_path)
-
-    print("Conversion is done!")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--biogpt_checkpoint_path",
-        default=None,
-        type=str,
-        required=True,
-        help=(
-            "Path to the official PyTorch checkpoint file which is expected to reside in the dump dir with dicts,"
-            " bpecodes, etc."
-        ),
-    )
-    parser.add_argument(
-        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
-    )
-    args = parser.parse_args()
-    convert_biogpt_checkpoint_to_pytorch(args.biogpt_checkpoint_path, args.pytorch_dump_folder_path)
diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py
deleted file mode 100644
index abc24290ab26..000000000000
--- a/src/transformers/models/bit/convert_bit_to_pytorch.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert BiT checkpoints from the timm library.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from timm import create_model -from timm.data import resolve_data_config -from timm.data.transforms_factory import create_transform - -from transformers import BitConfig, BitForImageClassification, BitImageProcessor -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_config(model_name): - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - label2id = {v: k for k, v in id2label.items()} - - conv_layer = "std_conv" if "bit" in model_name else False - - # note that when using BiT as backbone for ViT-hybrid checkpoints, - # one needs to additionally set config.layer_type = "bottleneck", config.stem_type = "same", - # config.conv_layer = "std_conv_same" - config = BitConfig( - conv_layer=conv_layer, - num_labels=1000, - id2label=id2label, - label2id=label2id, - ) - - return config - - -def rename_key(name): - if "stem.conv" in name: - name = name.replace("stem.conv", "bit.embedder.convolution") - if "blocks" in name: - name = name.replace("blocks", "layers") - if "head.fc" in name: - name = name.replace("head.fc", "classifier.1") - if name.startswith("norm"): - name = "bit." + name - if "bit" not in name and "classifier" not in name: - name = "bit.encoder." + name - - return name - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_bit_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our BiT structure. 
- """ - - # define default BiT configuration - config = get_config(model_name) - - # load original model from timm - timm_model = create_model(model_name, pretrained=True) - timm_model.eval() - - # load state_dict of original model - state_dict = timm_model.state_dict() - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val.squeeze() if "head" in key else val - - # load HuggingFace model - model = BitForImageClassification(config) - model.eval() - model.load_state_dict(state_dict) - - # create image processor - transform = create_transform(**resolve_data_config({}, model=timm_model)) - timm_transforms = transform.transforms - - pillow_resamplings = { - "bilinear": PILImageResampling.BILINEAR, - "bicubic": PILImageResampling.BICUBIC, - "nearest": PILImageResampling.NEAREST, - } - - processor = BitImageProcessor( - do_resize=True, - size={"shortest_edge": timm_transforms[0].size}, - resample=pillow_resamplings[timm_transforms[0].interpolation.value], - do_center_crop=True, - crop_size={"height": timm_transforms[1].size[0], "width": timm_transforms[1].size[1]}, - do_normalize=True, - image_mean=timm_transforms[-1].mean.tolist(), - image_std=timm_transforms[-1].std.tolist(), - ) - - image = prepare_img() - timm_pixel_values = transform(image).unsqueeze(0) - pixel_values = processor(image, return_tensors="pt").pixel_values - - # verify pixel values - assert torch.allclose(timm_pixel_values, pixel_values) - - # verify logits - with torch.no_grad(): - outputs = model(pixel_values) - logits = outputs.logits - - print("Logits:", logits[0, :3]) - print("Predicted class:", model.config.id2label[logits.argmax(-1).item()]) - timm_logits = timm_model(pixel_values) - assert timm_logits.shape == outputs.logits.shape - assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model {model_name} and processor to the hub") - model.push_to_hub(f"ybelkada/{model_name}") - processor.push_to_hub(f"ybelkada/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="resnetv2_50x1_bitm", - type=str, - help="Name of the BiT timm model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub.", - ) - - args = parser.parse_args() - convert_bit_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index c5919b94d42f..000000000000 --- a/src/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,114 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Blenderbot checkpoint.""" - -import argparse - -import torch - -from transformers import BlenderbotConfig, BlenderbotForConditionalGeneration -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -PATTERNS = [ - ["attention", "attn"], - ["encoder_attention", "encoder_attn"], - ["q_lin", "q_proj"], - ["k_lin", "k_proj"], - ["v_lin", "v_proj"], - ["out_lin", "out_proj"], - ["norm_embeddings", "layernorm_embedding"], - ["position_embeddings", "embed_positions"], - ["embeddings", "embed_tokens"], - ["ffn.lin", "fc"], -] - - -def rename_state_dict_key(k): - if k == "embeddings.weight": - return "shared.weight" - - for parlai_name, hf_name in PATTERNS: - k = k.replace(parlai_name, hf_name) - - if k.startswith("encoder"): - k = k.replace(".attn", ".self_attn") - k = k.replace("norm1", "self_attn_layer_norm") - k = k.replace("norm2", "final_layer_norm") - elif k.startswith("decoder"): - k = k.replace("norm1", "self_attn_layer_norm") - k = k.replace("norm2", "encoder_attn_layer_norm") - k = k.replace("norm3", "final_layer_norm") - return k - - -def rename_layernorm_keys(sd): - keys = [ - "model.encoder.layernorm_embedding.weight", - "model.encoder.layernorm_embedding.bias", - "model.decoder.layernorm_embedding.weight", - "model.decoder.layernorm_embedding.bias", - ] - for k in keys: - v = sd.pop(k) - new_k = k.replace("layernorm_embedding", "layer_norm") - assert new_k not in sd - sd[new_k] = v - - -IGNORE_KEYS = ["START"] - - -@torch.no_grad() -def convert_parlai_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_json_path): - """ - Copy/paste/tweak model's weights to our BERT structure. - """ - model = torch.load(checkpoint_path, map_location="cpu") - sd = model["model"] - cfg = BlenderbotConfig.from_json_file(config_json_path) - m = BlenderbotForConditionalGeneration(cfg) - valid_keys = m.model.state_dict().keys() - failures = [] - mapping = {} - for k, v in sd.items(): - if k in IGNORE_KEYS: - continue - - new_k = rename_state_dict_key(k) - if new_k not in valid_keys: - failures.append([k, new_k]) - else: - mapping[new_k] = v - if cfg.normalize_before: # Blenderbot-3B checkpoints. 
Rename layernorm_embedding -> layer_norm - rename_layernorm_keys(sd) - m.model.load_state_dict(mapping, strict=True) - m.half() - m.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("--src_path", type=str, help="like blenderbot-model.bin") - parser.add_argument("--save_dir", default="hf_blenderbot", type=str, help="Where to save converted model.") - parser.add_argument( - "--hf_config_json", default="blenderbot-3b-config.json", type=str, help="Path to config to use" - ) - args = parser.parse_args() - convert_parlai_checkpoint(args.src_path, args.save_dir, args.hf_config_json) diff --git a/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py b/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py deleted file mode 100644 index 3de18c294ae8..000000000000 --- a/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py +++ /dev/null @@ -1,191 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import re - -import requests -import torch - -# git clone https://github.com/salesforce/BLIP.git -from models.blip import blip_decoder -from models.blip_itm import blip_itm -from models.blip_vqa import blip_vqa -from PIL import Image -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode - -from transformers import ( - BertTokenizer, - BlipConfig, - BlipForConditionalGeneration, - BlipForImageTextRetrieval, - BlipForQuestionAnswering, -) - - -def load_demo_image(image_size, device): - img_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg" - raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") - - transform = transforms.Compose( - [ - transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC), - transforms.ToTensor(), - transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), - ] - ) - image = transform(raw_image).unsqueeze(0).to(device) - return image - - -def rename_key(key): - if "visual_encoder" in key: - key = re.sub("visual_encoder*", "vision_model.encoder", key) - if "blocks" in key: - key = re.sub(r"blocks", "layers", key) - if "attn" in key: - key = re.sub(r"attn", "self_attn", key) - if "norm1" in key: - key = re.sub(r"norm1", "layer_norm1", key) - if "norm2" in key: - key = re.sub(r"norm2", "layer_norm2", key) - if "encoder.norm" in key: - key = re.sub(r"encoder.norm", "post_layernorm", key) - if "encoder.patch_embed.proj" in key: - key = re.sub(r"encoder.patch_embed.proj", "embeddings.patch_embedding", key) - - if "encoder.pos_embed" in key: - key = re.sub(r"encoder.pos_embed", "embeddings.position_embedding", key) - if "encoder.cls_token" in key: - key = re.sub(r"encoder.cls_token", "embeddings.class_embedding", key) - - if "self_attn" in key: - key = re.sub(r"self_attn.proj", 
"self_attn.projection", key) - - return key - - -@torch.no_grad() -def convert_blip_checkpoint(pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. - """ - if config_path is not None: - config = BlipConfig.from_pretrained(config_path) - else: - config = BlipConfig(projection_dim=512, text_config={}, vision_config={}) - - hf_model = BlipForConditionalGeneration(config).eval() - - model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" - - pt_model = blip_decoder(pretrained=model_url, image_size=384, vit="base") - pt_model = pt_model.eval() - - modified_state_dict = pt_model.state_dict() - for key in modified_state_dict.copy(): - value = modified_state_dict.pop(key) - renamed_key = rename_key(key) - modified_state_dict[renamed_key] = value - - hf_model.load_state_dict(modified_state_dict) - - image_size = 384 - image = load_demo_image(image_size=image_size, device="cpu") - tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") - input_ids = tokenizer(["a picture of"]).input_ids - - out = hf_model.generate(image, input_ids) - - assert out[0].tolist() == [30522, 1037, 3861, 1997, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102] - - out = hf_model.generate(image) - - assert out[0].tolist() == [30522, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102] - - if pytorch_dump_folder_path is not None: - hf_model.save_pretrained(pytorch_dump_folder_path) - - # model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_vqa.pth' - model_url = ( - "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" - ) - - vqa_model = blip_vqa(pretrained=model_url, image_size=image_size, vit="base") - vqa_model.eval() - - modified_state_dict = vqa_model.state_dict() - for key in modified_state_dict.copy(): - value = modified_state_dict.pop(key) - renamed_key = rename_key(key) - modified_state_dict[renamed_key] = value - - hf_vqa_model = BlipForQuestionAnswering(config) - - hf_vqa_model.load_state_dict(modified_state_dict) - - question = ["How many dogs are in this image?"] - question_input_ids = tokenizer(question, return_tensors="pt").input_ids - - answer = hf_vqa_model.generate(question_input_ids, image) - print(tokenizer.decode(answer[0])) - - assert tokenizer.decode(answer[0]) == "[UNK] 1 [SEP]" - if pytorch_dump_folder_path is not None: - hf_vqa_model.save_pretrained(pytorch_dump_folder_path + "_vqa") - - model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" - - itm_model = blip_itm(pretrained=model_url, image_size=image_size, vit="base") - itm_model.eval() - - modified_state_dict = itm_model.state_dict() - for key in modified_state_dict.copy(): - value = modified_state_dict.pop(key) - renamed_key = rename_key(key) - modified_state_dict[renamed_key] = value - - hf_itm_model = BlipForImageTextRetrieval(config) - - question = ["A picture of a woman with a dog sitting in a beach"] - question_input_ids = tokenizer( - question, - return_tensors="pt", - padding="max_length", - truncation=True, - max_length=35, - ).input_ids - - hf_itm_model.load_state_dict(modified_state_dict) - hf_itm_model.eval() - - out_itm = hf_itm_model(question_input_ids, image, use_itm_head=True) - out = hf_itm_model(question_input_ids, image, use_itm_head=False) - - assert out[0].item() == 0.2110687494277954 - assert 
torch.nn.functional.softmax(out_itm[0], dim=1)[:, 1].item() == 0.45698845386505127 - - if pytorch_dump_folder_path is not None: - hf_itm_model.save_pretrained(pytorch_dump_folder_path + "_itm") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_blip_checkpoint(args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py b/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py deleted file mode 100644 index d6640045b80c..000000000000 --- a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py +++ /dev/null @@ -1,390 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert BLIP-2 checkpoints from the original repository. - -URL: https://github.com/salesforce/LAVIS/tree/main/projects/blip2 -""" - -import argparse - -import requests -import torch - -# pip3 install salesforce-lavis -# I'm actually installing a slightly modified version: pip3 install -U git+https://github.com/nielsrogge/LAVIS.git@blip2_float32 -# to make sure we can compare both original and HF implementation in float32 -from lavis.models import load_model_and_preprocess -from PIL import Image - -from transformers import ( - AutoTokenizer, - BertTokenizer, - Blip2Config, - Blip2ForConditionalGeneration, - Blip2ForImageTextRetrieval, - Blip2Processor, - Blip2QFormerConfig, - Blip2VisionConfig, - BlipImageProcessor, - OPTConfig, - T5Config, - set_seed, -) -from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD - - -def load_demo_image(): - url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - - return image - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, model_name): - rename_keys = [] - # fmt: off - - # vision encoder - rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) - rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) - rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) - rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) - rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) - rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) - - for i in range(config.vision_config.num_hidden_layers): - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", 
f"vision_model.encoder.layers.{i}.layer_norm1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) - - # QFormer - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.layernorm.weight")) - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.layernorm.bias")) - if "itm" in model_name: - rename_keys.append(("Qformer.bert.embeddings.word_embeddings.weight", "embeddings.word_embeddings.weight")) - rename_keys.append(("Qformer.bert.embeddings.position_embeddings.weight", "embeddings.position_embeddings.weight")) - rename_keys.append(("vision_proj.weight", "vision_projection.weight")) - rename_keys.append(("vision_proj.bias", "vision_projection.bias")) - rename_keys.append(("text_proj.weight", "text_projection.weight")) - rename_keys.append(("text_proj.bias", "text_projection.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def read_in_q_v_bias(state_dict, config): - for i in range(config.vision_config.num_hidden_layers): - # read in original q and v biases - q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") - - # next, set bias in the state dict - qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) - state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias - - -def get_blip2_config(model_name, eos_token_id): - image_size = 364 if "coco" in model_name else 224 - vision_config = Blip2VisionConfig(image_size=image_size).to_dict() - - # make sure the models have proper bos_token_id and eos_token_id set (important for generation) - # seems like flan-T5 models don't have bos_token_id properly set? 
- if "opt-2.7b" in model_name: - text_config = OPTConfig.from_pretrained("facebook/opt-2.7b", eos_token_id=eos_token_id).to_dict() - elif "opt-6.7b" in model_name: - text_config = OPTConfig.from_pretrained("facebook/opt-6.7b", eos_token_id=eos_token_id).to_dict() - elif "t5-xl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "t5-xxl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "itm" in model_name: - text_config = {} - else: - raise ValueError("Model name not supported") - - if "itm" in model_name: - config = Blip2Config( - vision_config=vision_config, - qformer_config=Blip2QFormerConfig(vocab_size=30523, use_qformer_text_input=True).to_dict(), - ) - else: - config = Blip2Config(vision_config=vision_config, text_config=text_config) - - return config, image_size - - -@torch.no_grad() -def convert_blip2_checkpoint( - model_name, pytorch_dump_folder_path=None, push_to_hub=False, lavis_device="cpu", hf_model_device="cpu" -): - """ - Copy/paste/tweak model's weights to Transformers design. - """ - if "opt" in model_name: - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-2.7b") - elif "itm" in model_name: - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", truncation_side="right") - tokenizer.add_special_tokens({"bos_token": "[DEC]"}) - else: - tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl") - - if "itm" in model_name: - eos_token_id = None - else: - eos_token_id = tokenizer("\n", add_special_tokens=False).input_ids[0] - config, image_size = get_blip2_config(model_name, eos_token_id=eos_token_id) - - if "itm" in model_name: - hf_model = Blip2ForImageTextRetrieval(config).eval() - else: - hf_model = Blip2ForConditionalGeneration(config).eval() - - model_name_to_original = { - "blip2-opt-2.7b": ("blip2_opt", "pretrain_opt2.7b"), - "blip2-opt-6.7b": ("blip2_opt", "pretrain_opt6.7b"), - "blip2-opt-2.7b-coco": ("blip2_opt", "caption_coco_opt2.7b"), - "blip2-opt-6.7b-coco": ("blip2_opt", "caption_coco_opt6.7b"), - "blip2-flan-t5-xl": ("blip2_t5", "pretrain_flant5xl"), - "blip2-flan-t5-xl-coco": ("blip2_t5", "caption_coco_flant5xl"), - "blip2-flan-t5-xxl": ("blip2_t5", "pretrain_flant5xxl"), - "blip2-itm-vit-g": ("blip2_image_text_matching", "pretrain"), - "blip2-itm-vit-g-coco": ("blip2_image_text_matching", "coco"), - } - - name, type = model_name_to_original[model_name] - - # load original model - print("Loading original model...") - original_model, vis_processors, _ = load_model_and_preprocess( - name=name, model_type=type, is_eval=True, device=lavis_device - ) - original_model.eval() - print("Done!") - - # update state dict keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config, model_name) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - - # some keys can be renamed efficiently - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if key.startswith("Qformer.bert"): - key = key.replace("Qformer.bert", "qformer") - if "attention.self" in key: - key = key.replace("self", "attention") - if "opt_proj" in key: - key = key.replace("opt_proj", "language_projection") - if "t5_proj" in key: - key = key.replace("t5_proj", "language_projection") - if key.startswith("opt"): - key = key.replace("opt", "language") - if key.startswith("t5"): - key = key.replace("t5", "language") - state_dict[key] = val - - # read in qv biases - 
read_in_q_v_bias(state_dict, config) - - missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False) - assert len(missing_keys) == 0 - - if "itm" in model_name: - unexpected_keys = list(filter(lambda x: not x.startswith("Qformer.cls"), unexpected_keys)) - assert unexpected_keys == ["temp", "qformer.embeddings.position_ids"] - else: - assert unexpected_keys == ["qformer.embeddings.position_ids"] - - image = load_demo_image() - original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) - - # create processor - image_processor = BlipImageProcessor( - size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD - ) - processor = Blip2Processor(image_processor=image_processor, tokenizer=tokenizer) - pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(hf_model_device) - - # make sure processor creates exact same pixel values - assert torch.allclose(pixel_values, original_pixel_values.to(pixel_values.device)) - - original_model.to(lavis_device) - hf_model.to(hf_model_device) - - if "itm" in model_name: - caption = "a large fountain spewing water into the air" - input_ids = tokenizer([caption], return_tensors="pt").input_ids.to(hf_model_device) - attention_mask = processor(text=caption, return_tensors="pt").attention_mask.to(hf_model_device) - - with torch.no_grad(): - original_logits = original_model( - {"image": original_pixel_values, "text_input": [caption]}, match_head="itm" - ) - logits = hf_model( - pixel_values=pixel_values, - input_ids=input_ids, - attention_mask=attention_mask, - use_image_text_matching_head=True, - ) - - assert original_logits.shape == logits.logits_per_image.shape - print("First values of original logits:", original_logits[0, :3]) - print("First values of HF logits:", logits.logits_per_image[0, :3]) - - # assert values - # cast to same type - target_dtype = logits.logits_per_image.dtype - assert torch.allclose(original_logits.to(target_dtype), logits.logits_per_image, atol=1e-4) - - original_itm_scores = torch.nn.functional.softmax(original_logits, dim=1) - itm_scores = torch.nn.functional.softmax(logits.logits_per_image, dim=1) - assert torch.allclose(original_itm_scores.to(target_dtype), itm_scores, atol=1e-4) - print("Looks ok!") - - with torch.no_grad(): - original_logits = original_model( - {"image": original_pixel_values, "text_input": [caption]}, match_head="itc" - ) - logits = hf_model( - pixel_values=pixel_values, - input_ids=input_ids, - attention_mask=attention_mask, - use_image_text_matching_head=False, - ) - - assert original_logits.shape == logits.logits_per_image.shape - print("First values of original logits:", original_logits[0, :3]) - print("First values of HF logits:", logits.logits_per_image[0, :3]) - - # assert values - # cast to same type - target_dtype = logits.logits_per_image.dtype - assert torch.allclose(original_logits.to(target_dtype), logits.logits_per_image, atol=1e-4) - print("Looks ok!") - - else: - input_ids = tokenizer(["\n"], return_tensors="pt").input_ids.to(hf_model_device) - - with torch.no_grad(): - if "opt" in model_name: - original_logits = original_model({"image": original_pixel_values, "text_input": [""]}).logits - logits = hf_model(pixel_values, input_ids).logits - else: - original_logits = original_model( - {"image": original_pixel_values, "text_input": ["\n"], "text_output": ["\n"]} - ).logits - labels = input_ids.masked_fill(input_ids == tokenizer.pad_token_id, -100) - logits = hf_model(pixel_values, 
input_ids, labels=labels).logits - - assert original_logits.shape == logits.shape - print("First values of original logits:", original_logits[0, :3, :3]) - print("First values of HF logits:", logits[0, :3, :3]) - - # assert values - assert torch.allclose(original_logits.to(logits.device), logits, atol=1e-4) - print("Looks ok!") - - print("Generating a caption...") - prompt = "Question: what object is in this image? Answer:" - input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(hf_model_device) - - set_seed(42) - - original_outputs = original_model.generate( - {"image": original_pixel_values, "prompt": prompt}, use_nucleus_sampling=True, max_length=50 - ) - outputs = hf_model.generate( - pixel_values, - input_ids, - do_sample=True, - num_beams=5, - max_length=30, - min_length=1, - top_p=0.9, - repetition_penalty=1.0, - length_penalty=1.0, - temperature=1, - ) - output_text = processor.batch_decode(outputs, skip_special_tokens=True) - output_text = [text.strip() for text in output_text] - print("Original generation:", original_outputs) - print("HF generation:", output_text) - - if pytorch_dump_folder_path is not None: - processor.save_pretrained(pytorch_dump_folder_path) - hf_model.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - processor.push_to_hub(f"nielsr/{model_name}") - hf_model.push_to_hub(f"nielsr/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - choices = [ - "blip2-opt-2.7b", - "blip2-opt-6.7b", - "blip2-opt-2.7b-coco", - "blip2-opt-6.7b-coco", - "blip2-flan-t5-xl", - "blip2-flan-t5-xl-coco", - "blip2-flan-t5-xxl", - "blip2-itm-vit-g", - "blip2-itm-vit-g-coco", - ] - parser.add_argument( - "--model_name", - default="blip2-opt-2.7b", - choices=choices, - type=str, - help="Path to hf config.json of model to convert", - ) - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model and processor to the hub after converting", - ) - # note: this script is tested on 2 GPUs, as models are compared in float32, - # which requires quite some memory. Hence loading both on a - # separate device is the easiest to compare - parser.add_argument( - "--lavis_device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda." - ) - parser.add_argument( - "--hf_model_device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda." - ) - - args = parser.parse_args() - - convert_blip2_checkpoint( - args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.lavis_device, args.hf_model_device - ) diff --git a/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py b/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py deleted file mode 100644 index 40ba6240d3e4..000000000000 --- a/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py +++ /dev/null @@ -1,254 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BigScience BLOOM checkpoint.""" - -import argparse -import json -import os -import re - -import torch - -from transformers import BloomConfig, BloomModel -from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME -from transformers.utils import logging - - -logging.set_verbosity_info() - -WEIGHTS_TO_AVERAGE_ENDSWITH = [ - "word_embeddings_layernorm.weight", - "word_embeddings_layernorm.bias", - "input_layernorm.weight", - "input_layernorm.bias", - "post_attention_layernorm.weight", - "post_attention_layernorm.bias", - "self_attention.dense.bias", - "mlp.dense_4h_to_h.bias", - "ln_f.weight", - "ln_f.bias", -] - -WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN = [ - "mlp.dense_4h_to_h.weight", - "self_attention.dense.weight", -] - - -def layer_name_mapping(key, file): - """Convert Megatron-DeepSpeed TP/PP weights mapping in transformers PP only""" - # Handle first and last layers - layer_rename_map = { - "word_embeddings.weight": "word_embeddings.weight", - "word_embeddings.norm.weight": "word_embeddings_layernorm.weight", - "word_embeddings.norm.bias": "word_embeddings_layernorm.bias", - "weight": "ln_f.weight", - "bias": "ln_f.bias", - } - - if key in layer_rename_map: - return layer_rename_map[key] - - # Handle transformer blocks - layer_number = int(re.match(r".*layer_(\d*).*", file)[1]) - layer_number -= 3 - return f"h.{layer_number}." 
+ key - - -def get_dtype_size(dtype): - if dtype == torch.bool: - return 1 / 8 - bit_search = re.search(r"[^\d](\d+)$", str(dtype)) - if bit_search is None: - raise ValueError(f"`dtype` is not a valid dtype: {dtype}.") - bit_size = int(bit_search.groups()[0]) - return bit_size // 8 - - -def convert_bloom_checkpoint_to_pytorch( - bloom_checkpoint_path, bloom_config_file, pytorch_dump_folder_path, shard_model, pretraining_tp -): - # Construct model - if bloom_config_file == "": - config = BloomConfig() - else: - config = BloomConfig.from_json_file(bloom_config_file) - - if shard_model: - file_names = os.listdir(bloom_checkpoint_path) - file_names = sorted(filter(lambda s: s.startswith("layer") and "model_00" in s, file_names)) - - index_dict = {"weight_map": {}, "metadata": {}} - total_size = 0 - - missing_keys = None - - config = BloomConfig() - - for j, file in enumerate(file_names): - print("Processing file: {}".format(file)) - tensors = None - - for i in range(pretraining_tp): - # load all TP files - f_name = file.replace("model_00", f"model_0{i}") - temp = torch.load(os.path.join(bloom_checkpoint_path, f_name), map_location="cpu") - - # Rename keys in the transformers names - keys = list(temp.keys()) - for key in keys: - temp[layer_name_mapping(key, file)] = temp.pop(key) - - if tensors is None: - tensors = temp - else: - for key in tensors.keys(): - if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - # We average (sum and then divide) some weights accross TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425) - tensors[key] += temp[key] - else: - # Some weights are RowParallelLinear in Megatron-Deepspeed, others are ColumnParallel - cat_dim = 1 if any(text in key for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 - # We concatenate these weights accross TP ranks - tensors[key] = torch.cat([tensors[key], temp[key]], dim=cat_dim) - - # Divide by the number of TP the weights we want to average - for key in tensors.keys(): - if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - tensors[key] = tensors[key] / pretraining_tp - torch.save( - tensors, - os.path.join( - pytorch_dump_folder_path, - "pytorch_model_{}-of-{}.bin".format(str(j + 1).zfill(5), str(len(file_names)).zfill(5)), - ), - ) - - for key in tensors.keys(): - value = tensors[key] - total_size += value.numel() * get_dtype_size(value.dtype) - if key not in index_dict["weight_map"]: - index_dict["weight_map"][key] = "pytorch_model_{}-of-{}.bin".format( - str(j + 1).zfill(5), str(len(file_names)).zfill(5) - ) - - config = BloomConfig() - pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME - index_dict["metadata"]["total_size"] = total_size - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - with open(os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME + ".index.json"), "w", encoding="utf-8") as f: - json_config = json.dumps(index_dict, indent=2, sort_keys=True) + "\n" - f.write(json_config) - else: - model = BloomModel(config) - - file_names = os.listdir(bloom_checkpoint_path) - file_names = sorted(filter(lambda s: s.startswith("layer") and "model_00" in s, file_names)) - - missing_keys = None - for i, file in enumerate(file_names): - tensors = None - for i in range(pretraining_tp): - # load all TP files - f_name = file.replace("model_00", f"model_0{i}") - temp = torch.load(os.path.join(bloom_checkpoint_path, f_name), map_location="cpu") - - # Rename 
keys in the transformers names - keys = list(temp.keys()) - for key in keys: - temp[layer_name_mapping(key, file)] = temp.pop(key) - - if tensors is None: - tensors = temp - else: - for key in tensors.keys(): - # We average (sum and then divide) some weights accross TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425) - if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - tensors[key] += temp[key] - else: - # Some weights are RowParallelLinear in Megatron-Deepspeed, others are ColumnParallel - cat_dim = 1 if any(text in key for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 - # We concatenate these weights accross TP ranks - tensors[key] = torch.cat([tensors[key], temp[key]], dim=cat_dim) - - # Divide by the number of TP the weights we want to average - for key in tensors.keys(): - if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - tensors[key] = tensors[key] / pretraining_tp - - other_keys = model.load_state_dict(tensors, strict=False) - assert not other_keys.unexpected_keys, f"The keys {other_keys.unexpected_keys} are unexpected" - if missing_keys is None: - missing_keys = set(other_keys.missing_keys) - else: - missing_keys = missing_keys.intersection(set(other_keys.missing_keys)) - - assert not missing_keys, f"The keys {missing_keys} are missing" - - # Save pytorch-model - os.makedirs(pytorch_dump_folder_path, exist_ok=True) - pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME - print(f"Save PyTorch model to {pytorch_weights_dump_path} with dtype {config.torch_dtype}") - if config.torch_dtype is not None: - model = model.to(config.torch_dtype) - torch.save(model.state_dict(), pytorch_weights_dump_path) - print(f"Save configuration file to {pytorch_config_dump_path}") - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--bloom_checkpoint_path", - default=None, - type=str, - required=True, - help="Path to the Megatron-LM checkpoint path.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--bloom_config_file", - default="", - type=str, - help=( - "An optional config json file corresponding to the pre-trained model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--shard_model", - action="store_true", - help="An optional setting to shard the output model \nThis enables sharding the converted checkpoint", - ) - parser.add_argument( - "--pretraining_tp", - default=4, - type=int, - help="Pretraining TP rank that has been used when training the model in Megatron-LM \n", - ) - args = parser.parse_args() - convert_bloom_checkpoint_to_pytorch( - args.bloom_checkpoint_path, - args.bloom_config_file, - args.pytorch_dump_folder_path, - args.shard_model, - args.pretraining_tp, - ) diff --git a/src/transformers/models/bros/convert_bros_to_pytorch.py b/src/transformers/models/bros/convert_bros_to_pytorch.py deleted file mode 100644 index c0984f2c74b2..000000000000 --- a/src/transformers/models/bros/convert_bros_to_pytorch.py +++ /dev/null @@ -1,145 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Bros checkpoints.""" - -import argparse - -import bros # original repo -import torch - -from transformers import BrosConfig, BrosModel, BrosProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_configs(model_name): - bros_config = BrosConfig.from_pretrained(model_name) - return bros_config - - -def remove_ignore_keys_(state_dict): - ignore_keys = [ - "embeddings.bbox_sinusoid_emb.inv_freq", - ] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(name): - if name == "embeddings.bbox_projection.weight": - name = "bbox_embeddings.bbox_projection.weight" - - if name == "embeddings.bbox_sinusoid_emb.x_pos_emb.inv_freq": - name = "bbox_embeddings.bbox_sinusoid_emb.x_pos_emb.inv_freq" - - if name == "embeddings.bbox_sinusoid_emb.y_pos_emb.inv_freq": - name = "bbox_embeddings.bbox_sinusoid_emb.y_pos_emb.inv_freq" - - return name - - -def convert_state_dict(orig_state_dict, model): - # rename keys - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - orig_state_dict[rename_key(key)] = val - - # remove ignore keys - remove_ignore_keys_(orig_state_dict) - - return orig_state_dict - - -def convert_bros_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - # load original model - original_model = bros.BrosModel.from_pretrained(model_name).eval() - - # load HuggingFace Model - bros_config = get_configs(model_name) - model = BrosModel.from_pretrained(model_name, config=bros_config) - model.eval() - - state_dict = original_model.state_dict() - new_state_dict = convert_state_dict(state_dict, model) - model.load_state_dict(new_state_dict) - - # verify results - - # original BROS model require 4 points (8 float values) for each bbox, prepare bbox with [batch_size, seq_len, 8] shape - bbox = torch.tensor( - [ - [ - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.4396, 0.6720, 0.4659, 0.6720, 0.4659, 0.6850, 0.4396, 0.6850], - [0.4698, 0.6720, 0.4843, 0.6720, 0.4843, 0.6850, 0.4698, 0.6850], - [0.4698, 0.6720, 0.4843, 0.6720, 0.4843, 0.6850, 0.4698, 0.6850], - [0.2047, 0.6870, 0.2730, 0.6870, 0.2730, 0.7000, 0.2047, 0.7000], - [0.2047, 0.6870, 0.2730, 0.6870, 0.2730, 0.7000, 0.2047, 0.7000], - [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000], - ] - ] - ) - - processor = BrosProcessor.from_pretrained(model_name) - - encoding = processor("His name is Rocco.", return_tensors="pt") - encoding["bbox"] = bbox - - original_hidden_states = original_model(**encoding).last_hidden_state - # pixel_values = processor(image, return_tensors="pt").pixel_values - - last_hidden_states = model(**encoding).last_hidden_state - - assert torch.allclose(original_hidden_states, last_hidden_states, atol=1e-4) - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - 
processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model.push_to_hub("jinho8345/" + model_name.split("/")[-1], commit_message="Update model") - processor.push_to_hub("jinho8345/" + model_name.split("/")[-1], commit_message="Update model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - # Required parameters - parser.add_argument( - "--model_name", - default="jinho8345/bros-base-uncased", - required=False, - type=str, - help="Name of the original model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - required=False, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether or not to push the converted model and processor to the đŸ€— hub.", - ) - - args = parser.parse_args() - convert_bros_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index 9b1b15857cea..000000000000 --- a/src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,59 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The T5 authors and HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert T5 checkpoint.""" - -import argparse - -from transformers import T5Config, T5ForConditionalGeneration, load_tf_weights_in_t5 -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): - # Initialise PyTorch model - config = T5Config.from_json_file(config_file) - print(f"Building PyTorch model from configuration: {config}") - model = T5ForConditionalGeneration(config) - - # Load weights from tf checkpoint - load_tf_weights_in_t5(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - model.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained T5 model. \nThis specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 
- ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index 45dcdb290333..000000000000 --- a/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,65 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert CANINE checkpoint.""" - -import argparse - -from transformers import CanineConfig, CanineModel, CanineTokenizer, load_tf_weights_in_canine -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, pytorch_dump_path): - # Initialize PyTorch model - config = CanineConfig() - model = CanineModel(config) - model.eval() - - print(f"Building PyTorch model from configuration: {config}") - - # Load weights from tf checkpoint - load_tf_weights_in_canine(model, config, tf_checkpoint_path) - - # Save pytorch-model (weights and configuration) - print(f"Save PyTorch model to {pytorch_dump_path}") - model.save_pretrained(pytorch_dump_path) - - # Save tokenizer files - tokenizer = CanineTokenizer() - print(f"Save tokenizer files to {pytorch_dump_path}") - tokenizer.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", - default=None, - type=str, - required=True, - help="Path to the TensorFlow checkpoint. Should end with model.ckpt", - ) - parser.add_argument( - "--pytorch_dump_path", - default=None, - type=str, - required=True, - help="Path to a folder where the PyTorch model will be placed.", - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.pytorch_dump_path) diff --git a/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py b/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py deleted file mode 100644 index ff45c9b597e0..000000000000 --- a/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py +++ /dev/null @@ -1,476 +0,0 @@ -# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
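The TF-checkpoint converters removed above (ByT5/T5 and CANINE) all follow the same three-step recipe: build a randomly initialized model from a config, copy the TensorFlow variables into it with the matching `load_tf_weights_in_*` helper, then `save_pretrained` the result. A minimal sketch of that recipe, assuming a local T5 config file and TensorFlow checkpoint exist at the hypothetical paths below:

```python
from transformers import T5Config, T5ForConditionalGeneration, load_tf_weights_in_t5

# Hypothetical local paths; replace with a real TF checkpoint and its config.
config = T5Config.from_json_file("t5_config.json")   # defines the architecture
model = T5ForConditionalGeneration(config)           # randomly initialized PyTorch model
load_tf_weights_in_t5(model, config, "model.ckpt")   # copy the TF variables into the PyTorch modules
model.save_pretrained("t5-converted")                # writes config.json plus the weight files
```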
-import argparse -import gc -import json -import os - -import requests -import torch -import yaml -from accelerate import init_empty_weights -from PIL import Image - -from transformers import ( - ChameleonConfig, - ChameleonForConditionalGeneration, - ChameleonImageProcessor, - ChameleonProcessor, -) - - -try: - from transformers import LlamaTokenizerFast -except ImportError: - raise ValueError( - "Chameleon conversion supports only FastTokenizer and LlamaTokenizerFast can't be imported! " - "Update your `tokenizers` library and re-run the tokenizer conversion." - ) - -""" -Sample usage: - -``` -python src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py \ - --input_dir /path/to/downloaded/chameleon/weights --model_size 7B --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import ChameleonForConditionalGeneration, LlamaTokenizerFast - -model = ChameleonForConditionalGeneration.from_pretrained("/output/path") -tokenizer = LlamaTokenizerFast.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). -""" - -NUM_SHARDS = { - "7B": 1, - "30B": 4, -} - -VOCAB_SIZE = 65536 - - -def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): - return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) - - -def read_json(path): - with open(path, "r") as f: - return json.load(f) - - -def write_json(text, path): - with open(path, "w") as f: - json.dump(text, f) - - -def write_model(model_path, input_base_path, model_size, chameleon_version=1): - os.makedirs(model_path, exist_ok=True) - input_model_path = os.path.join(input_base_path, "models", model_size.lower()) - params_path = os.path.join(input_model_path, "params.json") - consolidate_params_path = os.path.join(input_model_path, "consolidate_params.json") - - params = read_json(params_path) - if os.path.isfile(consolidate_params_path): - params = {**params, **read_json(consolidate_params_path)} - num_shards = NUM_SHARDS[model_size] - model_parallel_size = params["model_parallel_size"] - params = params.get("model", params) - n_layers = params["n_layers"] - n_heads = params["n_heads"] - n_heads_per_shard = n_heads // num_shards - dim = params["dim"] - dims_per_head = dim // n_heads - base = params.get("rope_theta", 10000.0) - swin_norm = params["swin_norm"] - if base > 10000.0: - max_position_embeddings = 16384 - else: - # Depending on the Chameleon version, the default max_position_embeddings has different values. - if chameleon_version == 1: - max_position_embeddings = 4096 - else: - raise NotImplementedError( - f"Version {chameleon_version} of chameleon is not supported yet. " - "Current supported versions of chameleon are [1]." - ) - - if params.get("n_kv_heads", None) is not None: - num_key_value_heads = params["n_kv_heads"] # for GQA / MQA - num_local_key_value_heads = n_heads_per_shard // num_key_value_heads - key_value_dim = dim // num_key_value_heads - else: # compatibility with other checkpoints - num_key_value_heads = n_heads - num_local_key_value_heads = n_heads_per_shard - key_value_dim = dim - - print(f"Fetching all parameters from the checkpoint at {input_model_path}.") - # Load weights - if num_shards == 1: - # Not sharded - # (The sharded implementation would also work, but this is simpler.) 
- loaded = None - for possible_name in ["consolidated.pth", "consolidated.00.pth"]: - possible_path = os.path.join(input_model_path, possible_name) - if os.path.exists(possible_path): - loaded = torch.load(possible_path, map_location="cpu") - break - assert loaded is not None - else: - # Sharded - loaded = [ - torch.load(os.path.join(input_model_path, f"consolidated.{i:02d}.pth"), map_location="cpu") - for i in range(num_shards) - ] - - # permute for sliced rotary - def permute(w, n_heads, dim1=dim, dim2=dim): - return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) - - # Load weights to the state dict - state_dict = {} - for layer_i in range(n_layers): - if num_shards == 1: - # Unsharded - state_dict.update( - { - f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wq.weight"], n_heads=n_heads - ), - f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wk.weight"], - n_heads=num_key_value_heads, - dim1=key_value_dim, - ), - f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], - f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], - f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], - f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], - f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], - f"model.layers.{layer_i}.input_layernorm.weight": loaded[ - f"layers.{layer_i}.attention_norm.weight" - ], - f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[ - f"layers.{layer_i}.ffn_norm.weight" - ], - } - ) - # qk_layernorm (see https://github.com/huggingface/transformers/pull/31534#issuecomment-2207354677) - state_dict[f"model.layers.{layer_i}.self_attn.q_norm.weight"] = ( - loaded[f"layers.{layer_i}.attention.q_normalization.weight"] - .view(dims_per_head // 2, 2) - .t() - .reshape(1, -1) - .repeat_interleave(n_heads, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.q_norm.bias"] = ( - loaded[f"layers.{layer_i}.attention.q_normalization.bias"] - .view(dims_per_head // 2, 2) - .t() - .reshape(1, -1) - .repeat_interleave(n_heads, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_norm.weight"] = ( - loaded[f"layers.{layer_i}.attention.k_normalization.weight"] - .view(dims_per_head // 2, 2) - .t() - .reshape(1, -1) - .repeat_interleave(num_key_value_heads, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_norm.bias"] = ( - loaded[f"layers.{layer_i}.attention.k_normalization.bias"] - .view(dims_per_head // 2, 2) - .t() - .reshape(1, -1) - .repeat_interleave(num_key_value_heads, 0) - ) - - else: - # Sharded - state_dict.update( - { - f"model.layers.{layer_i}.input_layernorm.weight": torch.stack( - [l[f"layers.{layer_i}.attention_norm.weight"] for l in loaded] - ).mean(dim=0), - f"model.layers.{layer_i}.post_attention_layernorm.weight": torch.stack( - [l[f"layers.{layer_i}.ffn_norm.weight"] for l in loaded] - ).mean(dim=0), - } - ) - state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) - for i in range(num_shards) - ], - dim=0, - ).reshape(dim, dim), - n_heads=n_heads, - ) - - state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( - torch.cat( - [ - 
loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( - num_local_key_value_heads, dims_per_head, dim - ) - for i in range(num_shards) - ], - dim=0, - ).reshape(key_value_dim, dim), - n_heads=num_key_value_heads, - dim1=key_value_dim, - ) - - # qk_layernorm (see https://github.com/huggingface/transformers/pull/31534#issuecomment-2207354677) - state_dict[f"model.layers.{layer_i}.self_attn.q_norm.weight"] = ( - torch.cat([l[f"layers.{layer_i}.attention.q_normalization.weight"].unsqueeze(0) for l in loaded]) - .view(num_shards, dims_per_head // 2, 2) - .transpose(1, 2) - .reshape(num_shards, -1) - .repeat_interleave(n_heads // num_shards, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.q_norm.bias"] = ( - torch.cat([l[f"layers.{layer_i}.attention.q_normalization.bias"].unsqueeze(0) for l in loaded]) - .view(num_shards, dims_per_head // 2, 2) - .transpose(1, 2) - .reshape(num_shards, -1) - .repeat_interleave(n_heads // num_shards, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_norm.weight"] = ( - torch.cat([l[f"layers.{layer_i}.attention.k_normalization.weight"].unsqueeze(0) for l in loaded]) - .view(num_shards, dims_per_head // 2, 2) - .transpose(1, 2) - .reshape(num_shards, -1) - .repeat_interleave(num_key_value_heads // num_shards, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_norm.bias"] = ( - torch.cat([l[f"layers.{layer_i}.attention.k_normalization.bias"].unsqueeze(0) for l in loaded]) - .view(num_shards, dims_per_head // 2, 2) - .transpose(1, 2) - .reshape(num_shards, -1) - .repeat_interleave(num_key_value_heads // num_shards, 0) - ) - - state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( - num_local_key_value_heads, dims_per_head, dim - ) - for i in range(num_shards) - ], - dim=0, - ).reshape(key_value_dim, dim) - - state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 - ) - state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 - ) - - if num_shards == 1: - # Unsharded - state_dict.update( - { - "model.embed_tokens.weight": loaded["tok_embeddings.weight"], - "model.norm.weight": loaded["norm.weight"], - "lm_head.weight": loaded["output.weight"], - } - ) - else: - state_dict.update( - { - "model.embed_tokens.weight": torch.cat( - [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1 - ), - "model.norm.weight": torch.stack([loaded[i]["norm.weight"] for i in range(num_shards)]).mean(dim=0), - "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), - } - ) - - # Load VQGAN weights - vqgan_path = os.path.join(input_base_path, "tokenizer/vqgan.ckpt") - vqgan_state_dict = torch.load(vqgan_path, map_location="cpu")["state_dict"] - for k, v in vqgan_state_dict.items(): - if "decoder" in k: - continue # we dont do image generation yet - state_dict[f"model.vqmodel.{k}"] = v - - # Write configs - ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params 
else 1 - multiple_of = params["multiple_of"] if "multiple_of" in params else 256 - - with open(os.path.join(input_base_path, "tokenizer/text_tokenizer.json")) as tokenizer_file: - tokenizer_config = json.load(tokenizer_file) - vocabulary_map = tokenizer_config["model"]["vocab"] - vocabulary_map[""] = vocabulary_map[ - "" - ] # use a reserved token instead of adding a new one - del vocabulary_map[""] - - for token in tokenizer_config["added_tokens"]: - if token["content"] == "": - token["content"] = "" - - with open(os.path.join(input_base_path, "tokenizer/text_tokenizer_modified.json"), "w") as f: - json.dump(tokenizer_config, f) # save the new file to init tokenizer later - - vq_keys_to_replace = [ - ("ch", "base_channels"), - ("out_ch", "out_channels"), - ("n_embed", "num_embeddings"), - ("ch_mult", "channel_multiplier"), - ("double_z", "double_latent"), - ("z_channels", "latent_channels"), - ] - with open(os.path.join(input_base_path, "tokenizer/vqgan.yaml")) as vqgan_cfg_file: - vq_config = yaml.safe_load(vqgan_cfg_file)["model"]["params"] - vq_config.update(**vq_config["ddconfig"]) - for old, new in vq_keys_to_replace: - vq_config[new] = vq_config[old] - del vq_config["ddconfig"] - del vq_config["ckpt_path"] - del vq_config["lossconfig"] - - config = ChameleonConfig( - hidden_size=dim, - intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), - num_attention_heads=params["n_heads"], - num_hidden_layers=params["n_layers"], - rms_norm_eps=params["norm_eps"], - num_key_value_heads=num_key_value_heads, - vocab_size=VOCAB_SIZE, - rope_theta=base, - max_position_embeddings=max_position_embeddings, - model_parallel_size=model_parallel_size, - swin_norm=swin_norm, - vq_config=vq_config, - vocabulary_map=vocabulary_map, - ) - with init_empty_weights(): - model = ChameleonForConditionalGeneration(config) - - model.load_state_dict(state_dict, assign=True, strict=False) - model.save_pretrained(model_path, safe_serialization=True) - - # Load and save the processor - tokenizer = LlamaTokenizerFast( - tokenizer_file=os.path.join(input_base_path, "tokenizer/text_tokenizer_modified.json"), legacy=False - ) - tokenizer.sep_token_id = 8710 # assign to sep so that we can append it after input text - tokenizer.pad_token_id = 1 # assing to special pad_token - image_processor = ChameleonImageProcessor() - processor = ChameleonProcessor(image_processor=image_processor, tokenizer=tokenizer) - processor.save_pretrained(model_path) - - # Make space so we can load the model properly now. - del state_dict - del loaded - del vqgan_state_dict - gc.collect() - - # Short inference on a few examples to check if generation makes sense - # taken from https://github.com/facebookresearch/chameleon/blob/7a72f40aa5f462965c8374f25257f55b65b25ff4/data/prompts_for_human_evaluations.jsonl - print("Loading the checkpoint in a Chameleon model...") - print("*" * 100) - model = ChameleonForConditionalGeneration.from_pretrained( - model_path, attn_implementation="eager", torch_dtype=torch.bfloat16, device_map="auto" - ) - processor = ChameleonProcessor.from_pretrained(model_path) - - prompt = "I'm very intrigued by this work of art:Please tell me about the artist." 
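One pattern worth noting in the Chameleon converter above: the HF model is instantiated under `init_empty_weights()` so no memory is spent on throwaway random weights, and the converted tensors are then attached with `load_state_dict(..., assign=True)`. A self-contained sketch of that pattern on a toy module (a stand-in, not the Chameleon model; assumes `accelerate` is installed and `torch>=2.1`, which introduced `assign=`):

```python
import torch
from torch import nn
from accelerate import init_empty_weights

with init_empty_weights():
    toy = nn.Linear(4, 4)        # parameters are created on the "meta" device, no storage allocated
print(toy.weight.device)         # meta

converted = {"weight": torch.randn(4, 4), "bias": torch.zeros(4)}
toy.load_state_dict(converted, assign=True)  # assign=True swaps the meta tensors for the real ones
print(toy.weight.device)         # cpu, now backed by the converted tensors
```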
- image = Image.open( - requests.get( - "https://uploads4.wikiart.org/images/paul-klee/death-for-the-idea-1915.jpg!Large.jpg", stream=True - ).raw - ) - inputs = processor(prompt, images=image, return_tensors="pt").to(model.device, torch.bfloat16) - length = inputs.input_ids.shape[1] - - out = model.generate(**inputs, max_new_tokens=40, do_sample=False) - generated_text = processor.batch_decode(out[:, length:], skip_special_tokens=True)[0] - - print(f"Generation for single-image: {generated_text}") - print("*" * 100) - - # Multi-image example - prompt = "I used to know a lot about constellations when I was younger, but as I grew older, I forgot most of what I knew. These are the only two constellations that I really remember now.I would like for you to tell me about 3 more constellations and give me a little bit of history about the constellation." - image = Image.open( - requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw - ) - image_2 = Image.open( - requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw - ) - - inputs = processor(prompt, images=[image, image_2], return_tensors="pt").to(model.device, dtype=torch.bfloat16) - length = inputs.input_ids.shape[1] - out = model.generate(**inputs, max_new_tokens=50, do_sample=False) - generated_text = processor.batch_decode(out[:, length:], skip_special_tokens=True)[0] - - print(f"Generation for multi-image: {generated_text}") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - help="Location of Chameleon weights", - ) - parser.add_argument( - "--model_size", - choices=["7B", "30B"], - help="" - " models correspond to the finetuned versions, and are specific to the Chameleon official release. For more details on Chameleon, checkout the original repo: https://github.com/facebookresearch/chameleon", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model", - ) - parser.add_argument( - "--test_inference", - action="store_true", - help="Whether to load the model for generation to test it's converted correctly.", - ) - # Different Chameleon versions used different default values for max_position_embeddings, hence the need to be able to specify which version is being used. - parser.add_argument( - "--chameleon_version", - choices=[1], - default=1, - type=int, - help="Version of the Chameleon model to convert", - ) - args = parser.parse_args() - write_model( - model_path=args.output_dir, - input_base_path=args.input_dir, - model_size=args.model_size, - chameleon_version=args.chameleon_version, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py b/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py deleted file mode 100644 index 02c4b7b754b2..000000000000 --- a/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py +++ /dev/null @@ -1,134 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse - -import torch - -from transformers import ChineseCLIPConfig, ChineseCLIPModel - - -def copy_attn_layer(hf_attn_layer, pt_weights, prefix): - q_proj, k_proj, v_proj = pt_weights[f"{prefix}.in_proj_weight"].chunk(3, dim=0) - q_proj_bias, k_proj_bias, v_proj_bias = pt_weights[f"{prefix}.in_proj_bias"].chunk(3, dim=0) - - out_proj_weights = pt_weights[f"{prefix}.out_proj.weight"] - out_proj_bias = pt_weights[f"{prefix}.out_proj.bias"] - - hf_attn_layer.q_proj.weight.data = q_proj - hf_attn_layer.q_proj.bias.data = q_proj_bias - - hf_attn_layer.k_proj.weight.data = k_proj - hf_attn_layer.k_proj.bias.data = k_proj_bias - - hf_attn_layer.v_proj.weight.data = v_proj - hf_attn_layer.v_proj.bias.data = v_proj_bias - - hf_attn_layer.out_proj.weight.data = out_proj_weights - hf_attn_layer.out_proj.bias.data = out_proj_bias - - -def copy_mlp(hf_mlp, pt_weights, prefix): - copy_linear(hf_mlp.fc1, pt_weights, f"{prefix}.c_fc") - copy_linear(hf_mlp.fc2, pt_weights, f"{prefix}.c_proj") - - -def copy_linear(hf_linear, pt_weights, prefix): - hf_linear.weight.data = pt_weights[f"{prefix}.weight"].data - hf_linear.bias.data = pt_weights[f"{prefix}.bias"].data - - -def copy_layer(hf_layer, pt_weights, prefix): - # copy layer norms - copy_linear(hf_layer.layer_norm1, pt_weights, f"{prefix}.ln_1") - copy_linear(hf_layer.layer_norm2, pt_weights, f"{prefix}.ln_2") - - # copy MLP - copy_mlp(hf_layer.mlp, pt_weights, f"{prefix}.mlp") - - # copy attn - copy_attn_layer(hf_layer.self_attn, pt_weights, f"{prefix}.attn") - - -def copy_layers(hf_layers, pt_weights, prefix): - for layer_id, hf_layer in enumerate(hf_layers): - copy_layer(hf_layer, pt_weights, f"{prefix}.{layer_id}") - - -def copy_text_model_and_projection(hf_model, pt_weights): - # copy projection - hf_model.text_projection.weight.data = pt_weights["text_projection"].data.T - - # copy text encoder - for name, param in hf_model.text_model.named_parameters(): - param.data = pt_weights[f"bert.{name}"].data - - -def copy_vision_model_and_projection(hf_model, pt_weights): - # copy projection - hf_model.visual_projection.weight.data = pt_weights["visual.proj"].data.T - - # copy layer norms - copy_linear(hf_model.vision_model.pre_layrnorm, pt_weights, "visual.ln_pre") - copy_linear(hf_model.vision_model.post_layernorm, pt_weights, "visual.ln_post") - - # copy embeddings - hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_weights["visual.conv1.weight"].data - hf_model.vision_model.embeddings.class_embedding.data = pt_weights["visual.class_embedding"].data - hf_model.vision_model.embeddings.position_embedding.weight.data = pt_weights["visual.positional_embedding"].data - - # copy encoder - copy_layers(hf_model.vision_model.encoder.layers, pt_weights, "visual.transformer.resblocks") - - -@torch.no_grad() -def convert_chinese_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. - """ - - assert config_path is not None, "Please specify the ChineseCLIP model config of the corresponding model size." 
- config = ChineseCLIPConfig.from_pretrained(config_path) - - hf_model = ChineseCLIPModel(config).eval() - - pt_weights = torch.load(checkpoint_path, map_location="cpu")["state_dict"] - pt_weights = {(name[7:] if name.startswith("module.") else name): value for name, value in pt_weights.items()} - - copy_text_model_and_projection(hf_model, pt_weights) - copy_vision_model_and_projection(hf_model, pt_weights) - hf_model.logit_scale.data = pt_weights["logit_scale"].data - - hf_model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output folder storing converted hf PyTorch model.", - ) - parser.add_argument( - "--checkpoint_path", default=None, type=str, help="Path to original github format ChineseCLIP checkpoint." - ) - parser.add_argument( - "--config_path", default=None, required=True, type=str, help="Path to hf config.json of model to convert." - ) - args = parser.parse_args() - - convert_chinese_clip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) - print("The conversion is finished!") diff --git a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py deleted file mode 100644 index d422bc45ab3d..000000000000 --- a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py +++ /dev/null @@ -1,133 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
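The Chinese-CLIP converter above maps the original fused attention projection onto the separate `q_proj`/`k_proj`/`v_proj` layers of the HF model by chunking `in_proj_weight` and `in_proj_bias` into three equal slices. A small self-contained illustration of that split, using a plain `torch.nn.MultiheadAttention` as a stand-in for the original checkpoint weights:

```python
import torch
from torch import nn

attn = nn.MultiheadAttention(embed_dim=8, num_heads=2)

# The fused projection has shape (3 * embed_dim, embed_dim); chunk it into q, k, v parts.
q_w, k_w, v_w = attn.in_proj_weight.chunk(3, dim=0)
q_b, k_b, v_b = attn.in_proj_bias.chunk(3, dim=0)

q_proj = nn.Linear(8, 8)
q_proj.weight.data = q_w
q_proj.bias.data = q_b

x = torch.randn(2, 8)
# The standalone q_proj now reproduces the query part of the fused projection.
assert torch.allclose(q_proj(x), x @ q_w.T + q_b, atol=1e-6)
```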
- -import argparse -import re - -from laion_clap import CLAP_Module - -from transformers import AutoFeatureExtractor, ClapConfig, ClapModel - - -KEYS_TO_MODIFY_MAPPING = { - "text_branch": "text_model", - "audio_branch": "audio_model.audio_encoder", - "attn": "attention.self", - "self.proj": "output.dense", - "attention.self_mask": "attn_mask", - "mlp.fc1": "intermediate.dense", - "mlp.fc2": "output.dense", - "norm1": "layernorm_before", - "norm2": "layernorm_after", - "bn0": "batch_norm", -} - -processor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused", truncation="rand_trunc") - - -def init_clap(checkpoint_path, model_type, enable_fusion=False): - model = CLAP_Module( - amodel=model_type, - enable_fusion=enable_fusion, - ) - model.load_ckpt(checkpoint_path) - return model - - -def get_config_from_original(clap_model): - audio_config = { - "patch_embeds_hidden_size": clap_model.model.audio_branch.embed_dim, - "depths": clap_model.model.audio_branch.depths, - "hidden_size": clap_model.model.audio_projection[0].in_features, - } - - text_config = {"hidden_size": clap_model.model.text_branch.pooler.dense.in_features} - - return ClapConfig(audio_config=audio_config, text_config=text_config) - - -def rename_state_dict(state_dict): - model_state_dict = {} - - sequential_layers_pattern = r".*sequential.(\d+).*" - text_projection_pattern = r".*_projection.(\d+).*" - - for key, value in state_dict.items(): - # check if any key needs to be modified - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - if re.match(sequential_layers_pattern, key): - # replace sequential layers with list - sequential_layer = re.match(sequential_layers_pattern, key).group(1) - - key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer)//3}.linear.") - elif re.match(text_projection_pattern, key): - projecton_layer = int(re.match(text_projection_pattern, key).group(1)) - - # Because in CLAP they use `nn.Sequential`... 
- transformers_projection_layer = 1 if projecton_layer == 0 else 2 - - key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.") - - if "audio" and "qkv" in key: - # split qkv into query key and value - mixed_qkv = value - qkv_dim = mixed_qkv.size(0) // 3 - - query_layer = mixed_qkv[:qkv_dim] - key_layer = mixed_qkv[qkv_dim : qkv_dim * 2] - value_layer = mixed_qkv[qkv_dim * 2 :] - - model_state_dict[key.replace("qkv", "query")] = query_layer - model_state_dict[key.replace("qkv", "key")] = key_layer - model_state_dict[key.replace("qkv", "value")] = value_layer - else: - model_state_dict[key] = value - - return model_state_dict - - -def convert_clap_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path, model_type, enable_fusion=False): - clap_model = init_clap(checkpoint_path, model_type, enable_fusion=enable_fusion) - - clap_model.eval() - state_dict = clap_model.model.state_dict() - state_dict = rename_state_dict(state_dict) - - transformers_config = get_config_from_original(clap_model) - transformers_config.audio_config.enable_fusion = enable_fusion - model = ClapModel(transformers_config) - - # ignore the spectrogram embedding layer - model.load_state_dict(state_dict, strict=False) - - model.save_pretrained(pytorch_dump_folder_path) - transformers_config.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - parser.add_argument("--enable_fusion", action="store_true", help="Whether to enable fusion or not") - parser.add_argument("--model_type", default="HTSAT-tiny", type=str, help="Whether to enable fusion or not") - args = parser.parse_args() - - convert_clap_checkpoint( - args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.model_type, args.enable_fusion - ) diff --git a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py b/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py deleted file mode 100644 index 3d88fc1929c3..000000000000 --- a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
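In the CLAP converter above, keys that live under the original `nn.Sequential` blocks are re-indexed onto flat `layers.N.linear` modules, and each fused `qkv` tensor is split into separate query/key/value entries (note that the guard `if "audio" and "qkv" in key:` reduces to `if "qkv" in key`, since a non-empty string literal is always truthy). A short, runnable sketch of the regex-based re-indexing step, using a hypothetical key name in the same format:

```python
import re

sequential_layers_pattern = r".*sequential.(\d+).*"

key = "audio_model.audio_encoder.sequential.3.weight"  # hypothetical example key
match = re.match(sequential_layers_pattern, key)
if match:
    idx = int(match.group(1))
    # the deleted script maps Sequential index n onto layers.{n // 3}.linear
    key = key.replace(f"sequential.{idx}.", f"layers.{idx // 3}.linear.")

print(key)  # audio_model.audio_encoder.layers.1.linear.weight
```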
- -import argparse - -import torch -from clip import load - -from transformers import CLIPConfig, CLIPModel - - -def copy_attn_layer(hf_attn_layer, pt_attn_layer): - q_proj, k_proj, v_proj = pt_attn_layer.in_proj_weight.chunk(3, dim=0) - q_proj_bias, k_proj_bias, v_proj_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0) - - out_proj_weights = pt_attn_layer.out_proj.weight - out_proj_bias = pt_attn_layer.out_proj.bias - - hf_attn_layer.q_proj.weight.data = q_proj - hf_attn_layer.q_proj.bias.data = q_proj_bias - - hf_attn_layer.k_proj.weight.data = k_proj - hf_attn_layer.k_proj.bias.data = k_proj_bias - - hf_attn_layer.v_proj.weight.data = v_proj - hf_attn_layer.v_proj.bias.data = v_proj_bias - - hf_attn_layer.out_proj.weight = out_proj_weights - hf_attn_layer.out_proj.bias = out_proj_bias - - -def copy_mlp(hf_mlp, pt_mlp): - copy_linear(hf_mlp.fc1, pt_mlp.c_fc) - copy_linear(hf_mlp.fc2, pt_mlp.c_proj) - - -def copy_linear(hf_linear, pt_linear): - hf_linear.weight = pt_linear.weight - hf_linear.bias = pt_linear.bias - - -def copy_layer(hf_layer, pt_layer): - # copy layer norms - copy_linear(hf_layer.layer_norm1, pt_layer.ln_1) - copy_linear(hf_layer.layer_norm2, pt_layer.ln_2) - - # copy MLP - copy_mlp(hf_layer.mlp, pt_layer.mlp) - - # copy attn - copy_attn_layer(hf_layer.self_attn, pt_layer.attn) - - -def copy_layers(hf_layers, pt_layers): - for hf_layer, pt_layer in zip(hf_layers, pt_layers): - copy_layer(hf_layer, pt_layer) - - -def copy_encoder(hf_encoder, pt_model): - # copy embeds - hf_encoder.embeddings.token_embedding.weight = pt_model.token_embedding.weight - hf_encoder.embeddings.position_embedding.weight.data = pt_model.positional_embedding - - # copy layer norm - copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final) - - # copy hidden layers - copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks) - - -def copy_text_model_and_projection(hf_model, pt_model): - # copy projection - hf_model.text_projection.weight.data = pt_model.text_projection.data.T.contiguous() - - # copy text encoder - copy_encoder(hf_model.text_model, pt_model) - - -def copy_vison_model_and_projection(hf_model, pt_model): - # copy projection - hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T.contiguous() - - # copy layer norms - copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre) - copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post) - - # copy embeds - hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data - hf_model.vision_model.embeddings.class_embedding = pt_model.visual.class_embedding - hf_model.vision_model.embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data - - # copy encoder - copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks) - - -@torch.no_grad() -def convert_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. 
- """ - if config_path is not None: - config = CLIPConfig.from_pretrained(config_path) - else: - config = CLIPConfig(projection_dim=512, text_config={}, vision_config={}) - - hf_model = CLIPModel(config).eval() - - pt_model, _ = load(checkpoint_path, device="cpu", jit=False) - pt_model = pt_model.eval() - - copy_text_model_and_projection(hf_model, pt_model) - copy_vison_model_and_projection(hf_model, pt_model) - hf_model.logit_scale = pt_model.logit_scale - - # Use `eos_token` so the example is more meaningful - input_ids = torch.tensor( - [ - [config.text_config.bos_token_id] - + list(range(3, 77)) - + [config.text_config.eos_token_id] - + [config.text_config.pad_token_id] - ] - ) - pixel_values = torch.randn(1, 3, 224, 224) - - hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values, return_dict=True) - hf_logits_per_image = hf_outputs.logits_per_image - hf_logits_per_text = hf_outputs.logits_per_text - pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids) - - assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3) - assert torch.allclose(hf_logits_per_text, pt_logits_per_text, atol=1e-3) - - hf_model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to OpenAI checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_clip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py deleted file mode 100644 index c614d61e5b3d..000000000000 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ /dev/null @@ -1,264 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert CLIPSeg checkpoints from the original repository. 
URL: https://github.com/timojl/clipseg.""" - -import argparse - -import requests -import torch -from PIL import Image - -from transformers import ( - CLIPSegConfig, - CLIPSegForImageSegmentation, - CLIPSegProcessor, - CLIPSegTextConfig, - CLIPSegVisionConfig, - CLIPTokenizer, - ViTImageProcessor, -) - - -def get_clipseg_config(model_name): - text_config = CLIPSegTextConfig() - vision_config = CLIPSegVisionConfig(patch_size=16) - - use_complex_transposed_convolution = True if "refined" in model_name else False - reduce_dim = 16 if "rd16" in model_name else 64 - - config = CLIPSegConfig.from_text_vision_configs( - text_config, - vision_config, - use_complex_transposed_convolution=use_complex_transposed_convolution, - reduce_dim=reduce_dim, - ) - return config - - -def rename_key(name): - # update prefixes - if "clip_model" in name: - name = name.replace("clip_model", "clip") - if "transformer" in name: - if "visual" in name: - name = name.replace("visual.transformer", "vision_model") - else: - name = name.replace("transformer", "text_model") - if "resblocks" in name: - name = name.replace("resblocks", "encoder.layers") - if "ln_1" in name: - name = name.replace("ln_1", "layer_norm1") - if "ln_2" in name: - name = name.replace("ln_2", "layer_norm2") - if "c_fc" in name: - name = name.replace("c_fc", "fc1") - if "c_proj" in name: - name = name.replace("c_proj", "fc2") - if "attn" in name and "self" not in name: - name = name.replace("attn", "self_attn") - # text encoder - if "token_embedding" in name: - name = name.replace("token_embedding", "text_model.embeddings.token_embedding") - if "positional_embedding" in name and "visual" not in name: - name = name.replace("positional_embedding", "text_model.embeddings.position_embedding.weight") - if "ln_final" in name: - name = name.replace("ln_final", "text_model.final_layer_norm") - # vision encoder - if "visual.class_embedding" in name: - name = name.replace("visual.class_embedding", "vision_model.embeddings.class_embedding") - if "visual.conv1" in name: - name = name.replace("visual.conv1", "vision_model.embeddings.patch_embedding") - if "visual.positional_embedding" in name: - name = name.replace("visual.positional_embedding", "vision_model.embeddings.position_embedding.weight") - if "visual.ln_pre" in name: - name = name.replace("visual.ln_pre", "vision_model.pre_layrnorm") - if "visual.ln_post" in name: - name = name.replace("visual.ln_post", "vision_model.post_layernorm") - # projection layers - if "visual.proj" in name: - name = name.replace("visual.proj", "visual_projection.weight") - if "text_projection" in name: - name = name.replace("text_projection", "text_projection.weight") - # decoder - if "trans_conv" in name: - name = name.replace("trans_conv", "transposed_convolution") - if "film_mul" in name or "film_add" in name or "reduce" in name or "transposed_convolution" in name: - name = "decoder." 
+ name - if "blocks" in name: - name = name.replace("blocks", "decoder.layers") - if "linear1" in name: - name = name.replace("linear1", "mlp.fc1") - if "linear2" in name: - name = name.replace("linear2", "mlp.fc2") - if "norm1" in name and "layer_" not in name: - name = name.replace("norm1", "layer_norm1") - if "norm2" in name and "layer_" not in name: - name = name.replace("norm2", "layer_norm2") - - return name - - -def convert_state_dict(orig_state_dict, config): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if key.startswith("clip_model") and "attn.in_proj" in key: - key_split = key.split(".") - if "visual" in key: - layer_num = int(key_split[4]) - dim = config.vision_config.hidden_size - prefix = "vision_model" - else: - layer_num = int(key_split[3]) - dim = config.text_config.hidden_size - prefix = "text_model" - - if "weight" in key: - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ - dim : dim * 2, : - ] - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] - else: - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] - elif "self_attn" in key and "out_proj" not in key: - key_split = key.split(".") - layer_num = int(key_split[1]) - dim = config.reduce_dim - if "weight" in key: - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[dim : dim * 2, :] - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] - else: - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] - else: - new_name = rename_key(key) - if "visual_projection" in new_name or "text_projection" in new_name: - val = val.T - orig_state_dict[new_name] = val - - return orig_state_dict - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - return image - - -def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path, push_to_hub): - config = get_clipseg_config(model_name) - model = CLIPSegForImageSegmentation(config) - model.eval() - - state_dict = torch.load(checkpoint_path, map_location="cpu") - - # remove some keys - for key in state_dict.copy().keys(): - if key.startswith("model"): - state_dict.pop(key, None) - - # rename some keys - state_dict = convert_state_dict(state_dict, config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - - if missing_keys != ["clip.text_model.embeddings.position_ids", "clip.vision_model.embeddings.position_ids"]: - raise ValueError("Missing keys that are not expected: {}".format(missing_keys)) - if unexpected_keys != ["decoder.reduce.weight", "decoder.reduce.bias"]: - raise ValueError(f"Unexpected keys: {unexpected_keys}") - - image_processor = ViTImageProcessor(size=352) - 
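The CLIPSeg converter above relies on `load_state_dict(..., strict=False)` returning the lists of missing and unexpected keys, which it then compares against the exact names it expects. A tiny self-contained example of that return value on a toy module:

```python
import torch
from torch import nn

model = nn.Sequential(nn.Linear(4, 4))

# one key is absent ("0.bias") and one stray key is present ("extra.weight")
state_dict = {"0.weight": torch.randn(4, 4), "extra.weight": torch.randn(4, 4)}
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

print(missing_keys)     # ['0.bias']
print(unexpected_keys)  # ['extra.weight']
```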
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") - processor = CLIPSegProcessor(image_processor=image_processor, tokenizer=tokenizer) - - image = prepare_img() - text = ["a glass", "something to fill", "wood", "a jar"] - - inputs = processor(text=text, images=[image] * len(text), padding="max_length", return_tensors="pt") - - with torch.no_grad(): - outputs = model(**inputs) - - # verify values - expected_conditional = torch.tensor([0.1110, -0.1882, 0.1645]) - expected_pooled_output = torch.tensor([0.2692, -0.7197, -0.1328]) - if model_name == "clipseg-rd64-refined": - expected_masks_slice = torch.tensor( - [[-10.0407, -9.9431, -10.2646], [-9.9751, -9.7064, -9.9586], [-9.6891, -9.5645, -9.9618]] - ) - elif model_name == "clipseg-rd64": - expected_masks_slice = torch.tensor( - [[-7.2877, -7.2711, -7.2463], [-7.2652, -7.2780, -7.2520], [-7.2239, -7.2204, -7.2001]] - ) - elif model_name == "clipseg-rd16": - expected_masks_slice = torch.tensor( - [[-6.3955, -6.4055, -6.4151], [-6.3911, -6.4033, -6.4100], [-6.3474, -6.3702, -6.3762]] - ) - else: - raise ValueError(f"Model name {model_name} not supported.") - - assert torch.allclose(outputs.logits[0, :3, :3], expected_masks_slice, atol=1e-3) - assert torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3) - assert torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and processor for {model_name} to the hub") - model.push_to_hub(f"CIDAS/{model_name}") - processor.push_to_hub(f"CIDAS/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="clipseg-rd64", - type=str, - choices=["clipseg-rd16", "clipseg-rd64", "clipseg-rd64-refined"], - help=( - "Name of the model. Supported models are: clipseg-rd64, clipseg-rd16 and clipseg-rd64-refined (rd meaning" - " reduce dimension)" - ), - ) - parser.add_argument( - "--checkpoint_path", - default="/Users/nielsrogge/Documents/CLIPSeg/clip_plus_rd64-uni.pth", - type=str, - help=( - "Path to the original checkpoint. Note that the script assumes that the checkpoint includes both CLIP and" - " the decoder weights." - ), - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - - args = parser.parse_args() - convert_clipseg_checkpoint(args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/clvp/convert_clvp_to_hf.py b/src/transformers/models/clvp/convert_clvp_to_hf.py deleted file mode 100644 index 4ae6fd425497..000000000000 --- a/src/transformers/models/clvp/convert_clvp_to_hf.py +++ /dev/null @@ -1,234 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Weights conversion script for CLVP -""" - -import argparse -import os - -import torch -from huggingface_hub import hf_hub_download - -from transformers import ClvpConfig, ClvpModelForConditionalGeneration - - -_MODELS = { - "clvp": "https://huggingface.co/jbetker/tortoise-tts-v2/blob/main/.models/clvp2.pth", - "decoder": "https://huggingface.co/jbetker/tortoise-tts-v2/blob/main/.models/autoregressive.pth", -} - -dim = 1024 -sub_dim = dim // 16 - -CLVP_ENCODERS_MAPPING = { - "text_transformer.transformer.attn_layers": "text_encoder_model", - "speech_transformer.transformer.attn_layers": "speech_encoder_model", - "text_transformer.transformer.norm": "text_encoder_model.final_layer_norm", - "speech_transformer.transformer.norm": "speech_encoder_model.final_layer_norm", - "to_text_latent": "text_encoder_model.projection", - "to_speech_latent": "speech_encoder_model.projection", - "text_emb": "text_encoder_model.token_embedding", - "speech_emb": "speech_encoder_model.token_embedding", - "1.wrap.net.0": "mlp.fc1", - "1.wrap.net.3": "mlp.fc2", - "1.wrap": "self_attn", - "to_out": "out_proj", - "to_q": "q_proj", - "to_k": "k_proj", - "to_v": "v_proj", - "temperature": "logit_scale", -} - -CLVP_DECODER_MAPPING = { - "conditioning_encoder.init": "conditioning_encoder.mel_conv", - "conditioning_encoder.attn": "conditioning_encoder.mel_attn_blocks", - "mel_attn_blocks": "group_norms", - ".norm.weight": ".weight", - ".norm.bias": ".bias", - "text_embedding": "conditioning_encoder.text_token_embedding", - "text_pos_embedding.emb": "conditioning_encoder.text_position_embedding", - "final_norm": "speech_decoder_model.final_norm", - "mel_head": "speech_decoder_model.lm_head", - "gpt.ln_f": "speech_decoder_model.model.decoder.layer_norm", - "mel_embedding": "speech_decoder_model.model.decoder.input_embeds_layer", - "mel_pos_embedding.emb": "speech_decoder_model.model.decoder.position_embeds_layer", - "gpt.h": "speech_decoder_model.model.decoder.layers", - "ln_1": "input_layernorm", - "ln_2": "post_attention_layernorm", -} - - -def update_index(present_index): - if present_index % 2 == 0: - return int(present_index / 2) - else: - return int((present_index - 1) / 2) - - -def convert_encoder_weights(original_weights): - converted_weights = {} - original_weights_keys = sorted(original_weights.keys()) - for original_key in original_weights_keys: - updated_key = original_key - # for input_rmsnorm.weight and post_attention_rmsnorm.weight - if "0.0.g" in updated_key: - present_index = updated_key.split(".")[4] - if int(present_index) % 2 == 0: - updated_key = updated_key.replace("0.0.g", "input_rmsnorm.weight") - else: - updated_key = updated_key.replace("0.0.g", "post_attention_rmsnorm.weight") - - if "transformer.attn_layers.layers" in updated_key: - present_index = updated_key.split(".")[4] - updated_index = update_index(int(present_index)) - updated_key = updated_key.replace( - f"transformer.attn_layers.layers.{present_index}", f"transformer.attn_layers.layers.{updated_index}" - ) - - for k, v in CLVP_ENCODERS_MAPPING.items(): - if k in updated_key: - updated_key = updated_key.replace(k, v) - - 
converted_weights[updated_key] = original_weights.pop(original_key) - - return converted_weights - - -def convert_decoder_weights(original_weights): - converted_weights = {} - original_weights_keys = sorted(original_weights.keys()) - for original_key in original_weights_keys: - updated_key = original_key - if len(updated_key.split(".")) > 3: - index, attr = updated_key.split(".")[2], updated_key.split(".")[-1] - - # for decoder attention - if "attn.c_attn" in updated_key: - if attr == "weight": - slice1, slice2, slice3 = original_weights[updated_key].squeeze(-1).T.split(split_size=dim, dim=0) - else: - slice1, slice2, slice3 = original_weights[updated_key].split(split_size=dim, dim=0) - converted_weights[f"speech_decoder_model.model.decoder.layers.{index}.attn.q_proj.{attr}"] = slice1 - converted_weights[f"speech_decoder_model.model.decoder.layers.{index}.attn.k_proj.{attr}"] = slice2 - converted_weights[f"speech_decoder_model.model.decoder.layers.{index}.attn.v_proj.{attr}"] = slice3 - continue - - if "attn.c_proj" in updated_key: - converted_weights[f"speech_decoder_model.model.decoder.layers.{index}.attn.out_proj.{attr}"] = ( - original_weights[updated_key].squeeze(-1).T - ) - continue - - if "attn.bias" in updated_key or "attn.masked_bias" in updated_key or "text_head" in updated_key: - original_weights.pop(updated_key) - continue - - # conditional encoder attention - if "qkv" in updated_key: - if attr == "weight": - slice1, slice2, slice3 = original_weights[updated_key].squeeze(-1).split(split_size=dim, dim=0) - else: - slice1, slice2, slice3 = original_weights[updated_key].split(split_size=dim, dim=0) - - indices = torch.arange(dim) - index1, index2, index3 = ( - indices.unfold(0, sub_dim, sub_dim * 3).flatten(), - indices[sub_dim:].unfold(0, sub_dim, sub_dim * 3).flatten(), - indices[2 * sub_dim :].unfold(0, sub_dim, sub_dim * 3).flatten(), - ) - - converted_weights[f"conditioning_encoder.mel_attn_blocks.{index}.q_proj.{attr}"] = torch.concatenate( - [slice1[index1], slice2[index3], slice3[index2]], - axis=0, - ) - converted_weights[f"conditioning_encoder.mel_attn_blocks.{index}.k_proj.{attr}"] = torch.concatenate( - [slice1[index2], slice2[index1], slice3[index3]], - axis=0, - ) - converted_weights[f"conditioning_encoder.mel_attn_blocks.{index}.v_proj.{attr}"] = torch.concatenate( - [slice1[index3], slice2[index2], slice3[index1]], - axis=0, - ) - continue - - if "proj_out" in updated_key: - converted_weights[f"conditioning_encoder.mel_attn_blocks.{index}.out_proj.{attr}"] = original_weights[ - updated_key - ].squeeze(-1) - continue - - for k, v in CLVP_DECODER_MAPPING.items(): - if k in updated_key: - updated_key = updated_key.replace(k, v) - - converted_weights[updated_key] = original_weights.pop(original_key) - - return converted_weights - - -def _download(url: str, root: str): - repo_id = f"{url.split('/')[3]}/{url.split('/')[4]}" - filename = f"{url.split('/')[-2]}/{url.split('/')[-1]}" - hf_hub_download( - repo_id=repo_id, - filename=filename, - force_filename=root, - local_dir_use_symlinks=False, - ) - - -def convert_clvp_weights(checkpoint_path, pytorch_dump_folder_path): - converted_checkpoint = {} - - for each_model_name, each_model_url in _MODELS.items(): - each_model_path = os.path.join(checkpoint_path, each_model_url.split("/")[-1]) - if not os.path.exists(each_model_path): - print(f"\n{each_model_name} was not found! 
Downloading it to {each_model_path}") - _download(url=each_model_url, root=each_model_path) - - if each_model_name == "clvp": - clvp_checkpoint = torch.load(each_model_path, map_location="cpu") - else: - decoder_checkpoint = torch.load(each_model_path, map_location="cpu") - - # Converting the weights - converted_checkpoint.update(**convert_encoder_weights(clvp_checkpoint)) - converted_checkpoint.update(**convert_decoder_weights(decoder_checkpoint)) - - config = ClvpConfig.from_pretrained("susnato/clvp_dev") - model = ClvpModelForConditionalGeneration(config) - - model.load_state_dict(converted_checkpoint, strict=True) - model.save_pretrained(pytorch_dump_folder_path) - print(f"Model saved at {pytorch_dump_folder_path}!") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # # Required parameters - parser.add_argument( - "--checkpoint_path", type=str, help="Path to the folder of downloaded checkpoints. (Please enter full path)" - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model. (Please enter full path)", - ) - args = parser.parse_args() - - convert_clvp_weights(args.checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/colpali/convert_colpali_weights_to_hf.py b/src/transformers/models/colpali/convert_colpali_weights_to_hf.py deleted file mode 100644 index 1b30f3f97acd..000000000000 --- a/src/transformers/models/colpali/convert_colpali_weights_to_hf.py +++ /dev/null @@ -1,214 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert ColPali weights from the original repository to the HF model format. - -Original repository: https://github.com/illuin-tech/colpali. 
- -NOTE: This script was originally run using `torch==2.5.1` and with: - -```bash -python src/transformers/models/colpali/convert_colpali_weights_to_hf.py \ - --model_id vidore/colpali-v1.2-merged \ - --revision 89fd9736194236a1ecb7a9ec9b04f537f6f896af \ - --original_vlm_name_or_path google/paligemma-3b-mix-448 \ - --output_dir vidore/colpali-v1.2-hf-internal \ - --push_to_hub - -python src/transformers/models/colpali/convert_colpali_weights_to_hf.py \ - --model_id vidore/colpali-v1.3-merged \ - --revision 5b955e3415a7c5468ab33119d98d6d45c3a5b2c3 \ - --original_vlm_name_or_path google/paligemma-3b-mix-448 \ - --output_dir vidore/colpali-v1.3-hf \ - --push_to_hub -``` -""" - -import argparse -import glob -from pathlib import Path -from typing import Any, Dict, Optional - -import torch -from huggingface_hub import snapshot_download -from safetensors import safe_open - -from transformers import AutoConfig -from transformers.models.colpali import ColPaliForRetrieval -from transformers.models.colpali.configuration_colpali import ColPaliConfig -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -ORIGINAL_DTYPE = torch.bfloat16 - - -def rename_state_dict_keys(state_dict: Dict[str, Any]) -> Dict[str, Any]: - new_state_dict = {} - for key, value in state_dict.items(): - new_key = key - if key.startswith("custom_text_proj"): - new_key = key.replace("custom_text_proj", "embedding_proj_layer") - if key.startswith("model."): - new_key = key.replace("model.", "vlm.", 1) - new_state_dict[new_key] = value - return new_state_dict - - -def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> Dict[str, torch.Tensor]: - directory_path = snapshot_download( - repo_id=model_id, - revision=revision, - allow_patterns=["*.safetensors"], - ) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - # Some weights are tied, so `lm.head`` is not saved. Let's clone to load state dict. - if "lm_head.weight" not in original_state_dict: - original_state_dict["vlm.language_model.lm_head.weight"] = original_state_dict[ - "model.language_model.model.embed_tokens.weight" - ].clone() - - return original_state_dict - - -@torch.no_grad() -def convert_colpali_weights_to_hf( - model_id: str, - output_dir: str, - push_to_hub: bool, - revision: Optional[str] = None, - original_vlm_name_or_path: Optional[str] = None, -): - # Load the original model data - original_config = AutoConfig.from_pretrained( - model_id, - revision=revision, - ) - if original_vlm_name_or_path is not None: - original_config._name_or_path = original_vlm_name_or_path - if hasattr(original_config, "architectures"): - delattr(original_config, "architectures") - - original_state_dict = load_original_state_dict(model_id, revision=revision) - - # Format the state_dict keys - original_state_dict = rename_state_dict_keys(original_state_dict) - - # Create the new config - config = ColPaliConfig( - vlm_config=original_config, - embedding_dim=128, # hardcoded in the original model - ) - config.model_type = "colpali" - config.is_composition = False - - # Load the untrained model - model = ColPaliForRetrieval(config=config).to("cpu").eval() - print("Created model with new config and randomly initialized weights") - - # NOTE: The model was initialized with float32 weights. 
We need to convert it to the desired precision. - # There are two ways to set the model's dtype: - # - Using `model.from_pretrained(..., torch_dtype=dtype_precision)` doesn't convert the hyperparameters to the desired precision. - # - Using `model.to(dtype_precision)` converts all values - including the hyperparameters - to the desired precision. - # The following snippet allows a fine-grained control over the model's dtype, making sure that all - # the new weights' dtypes match the original model. - for param in model.parameters(): - param.data = param.data.to(ORIGINAL_DTYPE) - print(f"Converted the new model weights to `{ORIGINAL_DTYPE}`") - - # Load the original weights - model.load_state_dict(original_state_dict) - print("Loaded original model weights") - - # Tie the weights (following ColPali's `__init__`` step) - if model.vlm.language_model._tied_weights_keys is not None: - model._tied_weights_keys = [f"vlm.language_model.{k}" for k in model.vlm.language_model._tied_weights_keys] - - # Sanity check: ensure all keys are the same - state_dict_keys_old = set(original_state_dict.keys()) - state_dict_keys_new = set(model.state_dict().keys()) - disjoint_keys = state_dict_keys_old.symmetric_difference(state_dict_keys_new) - if disjoint_keys: - raise ValueError(f"Incompatible keys: {disjoint_keys}") - - # Save the model - if push_to_hub: - model.push_to_hub(output_dir, private=True) - print(f"Model pushed to the hub at `{output_dir}`") - else: - Path(output_dir).mkdir(exist_ok=True, parents=True) - model.save_pretrained(output_dir) - print(f"Model saved to `{output_dir}`") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=""" - This script converts the original ColPali model to the HF model format. - - Example usage: - ```bash - python src/transformers/models/colpali/convert_colpali_weights_to_hf.py \ - --model_id vidore/colpali-v1.2-merged \ - --revision 89fd9736194236a1ecb7a9ec9b04f537f6f896af \ - --original_vlm_name_or_path google/paligemma-3b-mix-448 \ - --output_dir vidore/colpali-v1.2-hf \ - --push_to_hub - ``` - """ - ) - parser.add_argument( - "--model_id", - help="Model ID of the original model to convert", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally", - action="store_true", - default=False, - ) - parser.add_argument( - "--revision", - help="Revision of the model to download", - default=None, - ) - parser.add_argument( - "--original_vlm_name_or_path", - help="Name or path of the original VLM backbone model", - default=None, - ) - args = parser.parse_args() - - convert_colpali_weights_to_hf( - model_id=args.model_id, - output_dir=args.output_dir, - push_to_hub=args.push_to_hub, - revision=args.revision, - original_vlm_name_or_path=args.original_vlm_name_or_path, - ) diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 91f00668be69..000000000000 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,324 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Conditional DETR checkpoints.""" - -import argparse -import json -from collections import OrderedDict -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import ( - ConditionalDetrConfig, - ConditionalDetrForObjectDetection, - ConditionalDetrForSegmentation, - ConditionalDetrImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -# here we list all keys to be renamed (original name on the left, our name on the right) -rename_keys = [] -for i in range(6): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight") - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) - # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.cross_attn.out_proj.weight", - f"decoder.layers.{i}.encoder_attn.out_proj.weight", - ) - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.cross_attn.out_proj.bias", - f"decoder.layers.{i}.encoder_attn.out_proj.bias", - ) - ) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) - 
rename_keys.append( - (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) - - # q, k, v projections in self/cross-attention in decoder for conditional DETR - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.weight", f"decoder.layers.{i}.sa_v_proj.weight")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight") - ) - # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.weight", f"decoder.layers.{i}.ca_qpos_proj.weight")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.weight", f"decoder.layers.{i}.ca_v_proj.weight")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight") - ) - - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.bias", f"decoder.layers.{i}.sa_qpos_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.bias", f"decoder.layers.{i}.sa_kpos_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.bias", f"decoder.layers.{i}.sa_v_proj.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", f"decoder.layers.{i}.ca_qcontent_proj.bias") - ) - # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.bias", f"decoder.layers.{i}.ca_qpos_proj.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.bias", f"decoder.layers.{i}.ca_kpos_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.bias", f"decoder.layers.{i}.ca_v_proj.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", 
f"decoder.layers.{i}.ca_qpos_sine_proj.bias") - ) - -# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads -# for conditional DETR, also convert reference point head and query scale MLP -rename_keys.extend( - [ - ("input_proj.weight", "input_projection.weight"), - ("input_proj.bias", "input_projection.bias"), - ("query_embed.weight", "query_position_embeddings.weight"), - ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), - ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), - ("class_embed.weight", "class_labels_classifier.weight"), - ("class_embed.bias", "class_labels_classifier.bias"), - ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), - ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), - ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), - ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), - ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), - ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), - ("transformer.decoder.ref_point_head.layers.0.weight", "decoder.ref_point_head.layers.0.weight"), - ("transformer.decoder.ref_point_head.layers.0.bias", "decoder.ref_point_head.layers.0.bias"), - ("transformer.decoder.ref_point_head.layers.1.weight", "decoder.ref_point_head.layers.1.weight"), - ("transformer.decoder.ref_point_head.layers.1.bias", "decoder.ref_point_head.layers.1.bias"), - ("transformer.decoder.query_scale.layers.0.weight", "decoder.query_scale.layers.0.weight"), - ("transformer.decoder.query_scale.layers.0.bias", "decoder.query_scale.layers.0.bias"), - ("transformer.decoder.query_scale.layers.1.weight", "decoder.query_scale.layers.1.weight"), - ("transformer.decoder.query_scale.layers.1.bias", "decoder.query_scale.layers.1.bias"), - ("transformer.decoder.layers.0.ca_qpos_proj.weight", "decoder.layers.0.ca_qpos_proj.weight"), - ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias"), - ] -) - - -def rename_key(state_dict, old, new): - val = state_dict.pop(old) - state_dict[new] = val - - -def rename_backbone_keys(state_dict): - new_state_dict = OrderedDict() - for key, value in state_dict.items(): - if "backbone.0.body" in key: - new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model") - new_state_dict[new_key] = value - else: - new_state_dict[key] = value - - return new_state_dict - - -def read_in_q_k_v(state_dict, is_panoptic=False): - prefix = "" - if is_panoptic: - prefix = "conditional_detr." 
- - # first: transformer encoder - for i in range(6): - # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our CONDITIONAL_DETR structure. - """ - - # load default config - config = ConditionalDetrConfig() - # set backbone and dilation attributes - if "resnet101" in model_name: - config.backbone = "resnet101" - if "dc5" in model_name: - config.dilation = True - is_panoptic = "panoptic" in model_name - if is_panoptic: - config.num_labels = 250 - else: - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load image processor - format = "coco_panoptic" if is_panoptic else "coco_detection" - image_processor = ConditionalDetrImageProcessor(format=format) - - # prepare image - img = prepare_img() - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info(f"Converting model {model_name}...") - - # load original model from torch hub - conditional_detr = torch.hub.load("DeppMeng/ConditionalDETR", model_name, pretrained=True).eval() - state_dict = conditional_detr.state_dict() - # rename keys - for src, dest in rename_keys: - if is_panoptic: - src = "conditional_detr." + src - rename_key(state_dict, src, dest) - state_dict = rename_backbone_keys(state_dict) - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict, is_panoptic=is_panoptic) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "conditional_detr.model." if is_panoptic else "model." - for key in state_dict.copy().keys(): - if is_panoptic: - if ( - key.startswith("conditional_detr") - and not key.startswith("class_labels_classifier") - and not key.startswith("bbox_predictor") - ): - val = state_dict.pop(key) - state_dict["conditional_detr.model" + key[4:]] = val - elif "class_labels_classifier" in key or "bbox_predictor" in key: - val = state_dict.pop(key) - state_dict["conditional_detr." 
+ key] = val - elif key.startswith("bbox_attention") or key.startswith("mask_head"): - continue - else: - val = state_dict.pop(key) - state_dict[prefix + key] = val - else: - if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = ConditionalDetrForSegmentation(config) if is_panoptic else ConditionalDetrForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - model.push_to_hub(repo_id=model_name, organization="DepuMeng", commit_message="Add model") - # verify our conversion - original_outputs = conditional_detr(pixel_values) - outputs = model(pixel_values) - assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4) - assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4) - if is_panoptic: - assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) - - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", - default="conditional_detr_resnet50", - type=str, - help="Name of the CONDITIONAL_DETR model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - args = parser.parse_args() - convert_conditional_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py b/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py deleted file mode 100644 index 3d4ff779874b..000000000000 --- a/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py +++ /dev/null @@ -1,57 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
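The Conditional DETR converter above ends with a sanity check: it re-runs the original torch.hub model and the converted model on the same image and compares logits, boxes and (for panoptic checkpoints) masks with `torch.allclose(..., atol=1e-4)`. A generic version of that check, shown purely as an illustration (the function name is a placeholder; raising is slightly more robust than `assert`, which `python -O` strips):

```python
import torch

def check_outputs(converted: torch.Tensor, reference: torch.Tensor, atol: float = 1e-4) -> None:
    """Fail loudly if the converted model drifts from the original checkpoint."""
    if not torch.allclose(converted, reference, atol=atol):
        max_diff = (converted - reference).abs().max().item()
        raise ValueError(f"Converted outputs differ from the reference (max abs diff {max_diff:.2e})")
```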
-"""Convert ConvBERT checkpoint.""" - -import argparse - -from transformers import ConvBertConfig, ConvBertModel, TFConvBertModel, load_tf_weights_in_convbert -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_orig_tf1_checkpoint_to_pytorch(tf_checkpoint_path, convbert_config_file, pytorch_dump_path): - conf = ConvBertConfig.from_json_file(convbert_config_file) - model = ConvBertModel(conf) - - model = load_tf_weights_in_convbert(model, conf, tf_checkpoint_path) - model.save_pretrained(pytorch_dump_path) - - tf_model = TFConvBertModel.from_pretrained(pytorch_dump_path, from_pt=True) - tf_model.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--convbert_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained ConvBERT model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_orig_tf1_checkpoint_to_pytorch(args.tf_checkpoint_path, args.convbert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/convnext/convert_convnext_to_pytorch.py b/src/transformers/models/convnext/convert_convnext_to_pytorch.py deleted file mode 100644 index 27315ed73f91..000000000000 --- a/src/transformers/models/convnext/convert_convnext_to_pytorch.py +++ /dev/null @@ -1,242 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ConvNext checkpoints from the original repository. 
- -URL: https://github.com/facebookresearch/ConvNeXt""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import ConvNextConfig, ConvNextForImageClassification, ConvNextImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_convnext_config(checkpoint_url): - config = ConvNextConfig() - - if "tiny" in checkpoint_url: - depths = [3, 3, 9, 3] - hidden_sizes = [96, 192, 384, 768] - if "small" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [96, 192, 384, 768] - if "base" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [128, 256, 512, 1024] - if "large" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [192, 384, 768, 1536] - if "xlarge" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [256, 512, 1024, 2048] - - if "1k" in checkpoint_url: - num_labels = 1000 - filename = "imagenet-1k-id2label.json" - expected_shape = (1, 1000) - else: - num_labels = 21841 - filename = "imagenet-22k-id2label.json" - expected_shape = (1, 21841) - - repo_id = "huggingface/label-files" - config.num_labels = num_labels - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - if "1k" not in checkpoint_url: - # this dataset contains 21843 labels but the model only has 21841 - # we delete the classes as mentioned in https://github.com/google-research/big_transfer/issues/18 - del id2label[9205] - del id2label[15027] - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.hidden_sizes = hidden_sizes - config.depths = depths - - return config, expected_shape - - -def rename_key(name): - if "downsample_layers.0.0" in name: - name = name.replace("downsample_layers.0.0", "embeddings.patch_embeddings") - if "downsample_layers.0.1" in name: - name = name.replace("downsample_layers.0.1", "embeddings.norm") # we rename to layernorm later on - if "downsample_layers.1.0" in name: - name = name.replace("downsample_layers.1.0", "stages.1.downsampling_layer.0") - if "downsample_layers.1.1" in name: - name = name.replace("downsample_layers.1.1", "stages.1.downsampling_layer.1") - if "downsample_layers.2.0" in name: - name = name.replace("downsample_layers.2.0", "stages.2.downsampling_layer.0") - if "downsample_layers.2.1" in name: - name = name.replace("downsample_layers.2.1", "stages.2.downsampling_layer.1") - if "downsample_layers.3.0" in name: - name = name.replace("downsample_layers.3.0", "stages.3.downsampling_layer.0") - if "downsample_layers.3.1" in name: - name = name.replace("downsample_layers.3.1", "stages.3.downsampling_layer.1") - if "stages" in name and "downsampling_layer" not in name: - # stages.0.0. for instance should be renamed to stages.0.layers.0. 
- name = name[: len("stages.0")] + ".layers" + name[len("stages.0") :] - if "stages" in name: - name = name.replace("stages", "encoder.stages") - if "norm" in name: - name = name.replace("norm", "layernorm") - if "gamma" in name: - name = name.replace("gamma", "layer_scale_parameter") - if "head" in name: - name = name.replace("head", "classifier") - - return name - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_convnext_checkpoint(checkpoint_url, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our ConvNext structure. - """ - - # define ConvNext configuration based on URL - config, expected_shape = get_convnext_config(checkpoint_url) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"] - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # add prefix to all keys expect classifier head - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - if not key.startswith("classifier"): - key = "convnext." + key - state_dict[key] = val - - # load HuggingFace model - model = ConvNextForImageClassification(config) - model.load_state_dict(state_dict) - model.eval() - - # Check outputs on an image, prepared by ConvNextImageProcessor - size = 224 if "224" in checkpoint_url else 384 - image_processor = ConvNextImageProcessor(size=size) - pixel_values = image_processor(images=prepare_img(), return_tensors="pt").pixel_values - - logits = model(pixel_values).logits - - # note: the logits below were obtained without center cropping - if checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth": - expected_logits = torch.tensor([-0.1210, -0.6605, 0.1918]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth": - expected_logits = torch.tensor([-0.4473, -0.1847, -0.6365]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth": - expected_logits = torch.tensor([0.4525, 0.7539, 0.0308]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_384.pth": - expected_logits = torch.tensor([0.3561, 0.6350, -0.0384]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth": - expected_logits = torch.tensor([0.4174, -0.0989, 0.1489]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_384.pth": - expected_logits = torch.tensor([0.2513, -0.1349, -0.1613]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth": - expected_logits = torch.tensor([1.2980, 0.3631, -0.1198]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth": - expected_logits = torch.tensor([1.2963, 0.1227, 0.1723]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth": - expected_logits = torch.tensor([1.7956, 0.8390, 0.2820]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth": - expected_logits = torch.tensor([-0.2822, -0.0502, -0.0878]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_384.pth": - expected_logits = torch.tensor([-0.5672, -0.0730, -0.4348]) - elif checkpoint_url == 
"https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_224.pth": - expected_logits = torch.tensor([0.2681, 0.2365, 0.6246]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth": - expected_logits = torch.tensor([-0.2642, 0.3931, 0.5116]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth": - expected_logits = torch.tensor([-0.6677, -0.1873, -0.8379]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_384_ema.pth": - expected_logits = torch.tensor([-0.7749, -0.2967, -0.6444]) - else: - raise ValueError(f"Unknown URL: {checkpoint_url}") - - assert torch.allclose(logits[0, :3], expected_logits, atol=1e-3) - assert logits.shape == expected_shape - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - print("Pushing model to the hub...") - model_name = "convnext" - if "tiny" in checkpoint_url: - model_name += "-tiny" - elif "small" in checkpoint_url: - model_name += "-small" - elif "base" in checkpoint_url: - model_name += "-base" - elif "xlarge" in checkpoint_url: - model_name += "-xlarge" - elif "large" in checkpoint_url: - model_name += "-large" - if "224" in checkpoint_url: - model_name += "-224" - elif "384" in checkpoint_url: - model_name += "-384" - if "22k" in checkpoint_url and "1k" not in checkpoint_url: - model_name += "-22k" - if "22k" in checkpoint_url and "1k" in checkpoint_url: - model_name += "-22k-1k" - - model.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add model", - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth", - type=str, - help="URL of the original ConvNeXT checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the output PyTorch model directory.", - ) - - args = parser.parse_args() - convert_convnext_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py b/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py deleted file mode 100644 index 8094ecf0d615..000000000000 --- a/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py +++ /dev/null @@ -1,286 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ConvNeXTV2 checkpoints from the original repository. 
- -URL: https://github.com/facebookresearch/ConvNeXt""" - -import argparse -import json -import os - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import ConvNextImageProcessor, ConvNextV2Config, ConvNextV2ForImageClassification -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_convnextv2_config(checkpoint_url): - config = ConvNextV2Config() - - if "atto" in checkpoint_url: - depths = [2, 2, 6, 2] - hidden_sizes = [40, 80, 160, 320] - if "femto" in checkpoint_url: - depths = [2, 2, 6, 2] - hidden_sizes = [48, 96, 192, 384] - if "pico" in checkpoint_url: - depths = [2, 2, 6, 2] - hidden_sizes = [64, 128, 256, 512] - if "nano" in checkpoint_url: - depths = [2, 2, 8, 2] - hidden_sizes = [80, 160, 320, 640] - if "tiny" in checkpoint_url: - depths = [3, 3, 9, 3] - hidden_sizes = [96, 192, 384, 768] - if "base" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [128, 256, 512, 1024] - if "large" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [192, 384, 768, 1536] - if "huge" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [352, 704, 1408, 2816] - - num_labels = 1000 - filename = "imagenet-1k-id2label.json" - expected_shape = (1, 1000) - - repo_id = "huggingface/label-files" - config.num_labels = num_labels - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.hidden_sizes = hidden_sizes - config.depths = depths - - return config, expected_shape - - -def rename_key(name): - if "downsample_layers.0.0" in name: - name = name.replace("downsample_layers.0.0", "embeddings.patch_embeddings") - if "downsample_layers.0.1" in name: - name = name.replace("downsample_layers.0.1", "embeddings.norm") # we rename to layernorm later on - if "downsample_layers.1.0" in name: - name = name.replace("downsample_layers.1.0", "stages.1.downsampling_layer.0") - if "downsample_layers.1.1" in name: - name = name.replace("downsample_layers.1.1", "stages.1.downsampling_layer.1") - if "downsample_layers.2.0" in name: - name = name.replace("downsample_layers.2.0", "stages.2.downsampling_layer.0") - if "downsample_layers.2.1" in name: - name = name.replace("downsample_layers.2.1", "stages.2.downsampling_layer.1") - if "downsample_layers.3.0" in name: - name = name.replace("downsample_layers.3.0", "stages.3.downsampling_layer.0") - if "downsample_layers.3.1" in name: - name = name.replace("downsample_layers.3.1", "stages.3.downsampling_layer.1") - if "stages" in name and "downsampling_layer" not in name: - # stages.0.0. for instance should be renamed to stages.0.layers.0. 
- name = name[: len("stages.0")] + ".layers" + name[len("stages.0") :] - if "gamma" in name: - name = name.replace("gamma", "weight") - if "beta" in name: - name = name.replace("beta", "bias") - if "stages" in name: - name = name.replace("stages", "encoder.stages") - if "norm" in name: - name = name.replace("norm", "layernorm") - if "head" in name: - name = name.replace("head", "classifier") - - return name - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def convert_preprocessor(checkpoint_url): - if "224" in checkpoint_url: - size = 224 - crop_pct = 224 / 256 - elif "384" in checkpoint_url: - size = 384 - crop_pct = None - else: - size = 512 - crop_pct = None - - return ConvNextImageProcessor( - size=size, - crop_pct=crop_pct, - image_mean=[0.485, 0.456, 0.406], - image_std=[0.229, 0.224, 0.225], - resample=PILImageResampling.BICUBIC, - ) - - -@torch.no_grad() -def convert_convnextv2_checkpoint(checkpoint_url, pytorch_dump_folder_path, save_model, push_to_hub): - """ - Copy/paste/tweak model's weights to our ConvNeXTV2 structure. - """ - print("Downloading original model from checkpoint...") - # define ConvNeXTV2 configuration based on URL - config, expected_shape = get_convnextv2_config(checkpoint_url) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"] - - print("Converting model parameters...") - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # add prefix to all keys expect classifier head - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - if not key.startswith("classifier"): - key = "convnextv2." 
+ key - state_dict[key] = val - - # load HuggingFace model - model = ConvNextV2ForImageClassification(config) - model.load_state_dict(state_dict) - model.eval() - - # Check outputs on an image, prepared by ConvNextImageProcessor - preprocessor = convert_preprocessor(checkpoint_url) - inputs = preprocessor(images=prepare_img(), return_tensors="pt") - logits = model(**inputs).logits - - # note: the logits below were obtained without center cropping - if checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_atto_1k_224_ema.pt": - expected_logits = torch.tensor([-0.3930, 0.1747, -0.5246, 0.4177, 0.4295]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_femto_1k_224_ema.pt": - expected_logits = torch.tensor([-0.1727, -0.5341, -0.7818, -0.4745, -0.6566]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_pico_1k_224_ema.pt": - expected_logits = torch.tensor([-0.0333, 0.1563, -0.9137, 0.1054, 0.0381]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_nano_1k_224_ema.pt": - expected_logits = torch.tensor([-0.1744, -0.1555, -0.0713, 0.0950, -0.1431]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_tiny_1k_224_ema.pt": - expected_logits = torch.tensor([0.9996, 0.1966, -0.4386, -0.3472, 0.6661]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_base_1k_224_ema.pt": - expected_logits = torch.tensor([-0.2553, -0.6708, -0.1359, 0.2518, -0.2488]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_large_1k_224_ema.pt": - expected_logits = torch.tensor([-0.0673, -0.5627, -0.3753, -0.2722, 0.0178]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_huge_1k_224_ema.pt": - expected_logits = torch.tensor([-0.6377, -0.7458, -0.2150, 0.1184, -0.0597]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_nano_22k_224_ema.pt": - expected_logits = torch.tensor([1.0799, 0.2322, -0.8860, 1.0219, 0.6231]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_nano_22k_384_ema.pt": - expected_logits = torch.tensor([0.3766, 0.4917, -1.1426, 0.9942, 0.6024]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_tiny_22k_224_ema.pt": - expected_logits = torch.tensor([0.4220, -0.6919, -0.4317, -0.2881, -0.6609]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_tiny_22k_384_ema.pt": - expected_logits = torch.tensor([0.1082, -0.8286, -0.5095, 0.4681, -0.8085]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_base_22k_224_ema.pt": - expected_logits = torch.tensor([-0.2419, -0.6221, 0.2176, -0.0980, -0.7527]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_base_22k_384_ema.pt": - expected_logits = torch.tensor([0.0391, -0.4371, 0.3786, 0.1251, -0.2784]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_large_22k_224_ema.pt": - expected_logits = torch.tensor([-0.0504, 0.5636, -0.1729, -0.6507, -0.3949]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_large_22k_384_ema.pt": - expected_logits = torch.tensor([0.3560, 0.9486, 0.3149, -0.2667, -0.5138]) - elif 
checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_huge_22k_384_ema.pt": - expected_logits = torch.tensor([-0.2469, -0.4550, -0.5853, -0.0810, 0.0309]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_huge_22k_512_ema.pt": - expected_logits = torch.tensor([-0.3090, 0.0802, -0.0682, -0.1979, -0.2826]) - else: - raise ValueError(f"Unknown URL: {checkpoint_url}") - - assert torch.allclose(logits[0, :5], expected_logits, atol=1e-3) - assert logits.shape == expected_shape - print("Model outputs match the original results!") - - if save_model: - print("Saving model to local...") - # Create folder to save model - if not os.path.isdir(pytorch_dump_folder_path): - os.mkdir(pytorch_dump_folder_path) - - model.save_pretrained(pytorch_dump_folder_path) - preprocessor.save_pretrained(pytorch_dump_folder_path) - - model_name = "convnextv2" - if "atto" in checkpoint_url: - model_name += "-atto" - if "femto" in checkpoint_url: - model_name += "-femto" - if "pico" in checkpoint_url: - model_name += "-pico" - if "nano" in checkpoint_url: - model_name += "-nano" - elif "tiny" in checkpoint_url: - model_name += "-tiny" - elif "base" in checkpoint_url: - model_name += "-base" - elif "large" in checkpoint_url: - model_name += "-large" - elif "huge" in checkpoint_url: - model_name += "-huge" - if "22k" in checkpoint_url and "1k" not in checkpoint_url: - model_name += "-22k" - elif "22k" in checkpoint_url and "1k" in checkpoint_url: - model_name += "-22k-1k" - elif "1k" in checkpoint_url: - model_name += "-1k" - if "224" in checkpoint_url: - model_name += "-224" - elif "384" in checkpoint_url: - model_name += "-384" - elif "512" in checkpoint_url: - model_name += "-512" - - if push_to_hub: - print(f"Pushing {model_name} to the hub...") - model.push_to_hub(model_name) - preprocessor.push_to_hub(model_name) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_atto_1k_224_ema.pt", - type=str, - help="URL of the original ConvNeXTV2 checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="model", - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--save_model", action="store_true", help="Save model to local") - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image preprocessor to the hub") - - args = parser.parse_args() - convert_convnextv2_checkpoint( - args.checkpoint_url, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub - ) diff --git a/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 9f76c92887f4..000000000000 --- a/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,362 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert CvT checkpoints from the original repository. - -URL: https://github.com/microsoft/CvT""" - -import argparse -import json -from collections import OrderedDict -from pathlib import Path - -import torch -from huggingface_hub import hf_hub_download - -from transformers import AutoImageProcessor, CvtConfig, CvtForImageClassification - - -def embeddings(idx): - """ - The function helps in renaming embedding layer weights. - - Args: - idx: stage number in original model - """ - embed = [] - embed.append( - ( - f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.projection.weight", - f"stage{idx}.patch_embed.proj.weight", - ) - ) - embed.append( - ( - f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.projection.bias", - f"stage{idx}.patch_embed.proj.bias", - ) - ) - embed.append( - ( - f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.normalization.weight", - f"stage{idx}.patch_embed.norm.weight", - ) - ) - embed.append( - ( - f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.normalization.bias", - f"stage{idx}.patch_embed.norm.bias", - ) - ) - return embed - - -def attention(idx, cnt): - """ - The function helps in renaming attention block layers weights. - - Args: - idx: stage number in original model - cnt: count of blocks in each stage - """ - attention_weights = [] - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.convolution.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.conv.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.bias", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.running_mean", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.running_mean", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.running_var", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.running_var", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.num_batches_tracked", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.num_batches_tracked", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.convolution.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.conv.weight", - ) - ) - attention_weights.append( - ( - 
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.bias", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.running_mean", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.running_mean", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.running_var", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.running_var", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.num_batches_tracked", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.num_batches_tracked", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.convolution.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.conv.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.bias", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.running_mean", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.running_mean", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.running_var", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.running_var", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.num_batches_tracked", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.num_batches_tracked", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_query.weight", - f"stage{idx}.blocks.{cnt}.attn.proj_q.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_query.bias", - f"stage{idx}.blocks.{cnt}.attn.proj_q.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_key.weight", - f"stage{idx}.blocks.{cnt}.attn.proj_k.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_key.bias", - f"stage{idx}.blocks.{cnt}.attn.proj_k.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_value.weight", - f"stage{idx}.blocks.{cnt}.attn.proj_v.weight", - ) - ) - attention_weights.append( - ( - 
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_value.bias", - f"stage{idx}.blocks.{cnt}.attn.proj_v.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.output.dense.weight", - f"stage{idx}.blocks.{cnt}.attn.proj.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.output.dense.bias", - f"stage{idx}.blocks.{cnt}.attn.proj.bias", - ) - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.intermediate.dense.weight", f"stage{idx}.blocks.{cnt}.mlp.fc1.weight") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.intermediate.dense.bias", f"stage{idx}.blocks.{cnt}.mlp.fc1.bias") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.output.dense.weight", f"stage{idx}.blocks.{cnt}.mlp.fc2.weight") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.output.dense.bias", f"stage{idx}.blocks.{cnt}.mlp.fc2.bias") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_before.weight", f"stage{idx}.blocks.{cnt}.norm1.weight") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_before.bias", f"stage{idx}.blocks.{cnt}.norm1.bias") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_after.weight", f"stage{idx}.blocks.{cnt}.norm2.weight") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_after.bias", f"stage{idx}.blocks.{cnt}.norm2.bias") - ) - return attention_weights - - -def cls_token(idx): - """ - Function helps in renaming cls_token weights - """ - token = [] - token.append((f"cvt.encoder.stages.{idx}.cls_token", "stage2.cls_token")) - return token - - -def final(): - """ - Function helps in renaming final classification layer - """ - head = [] - head.append(("layernorm.weight", "norm.weight")) - head.append(("layernorm.bias", "norm.bias")) - head.append(("classifier.weight", "head.weight")) - head.append(("classifier.bias", "head.bias")) - return head - - -def convert_cvt_checkpoint(cvt_model, image_size, cvt_file_name, pytorch_dump_folder): - """ - Fucntion to convert the microsoft cvt checkpoint to huggingface checkpoint - """ - img_labels_file = "imagenet-1k-id2label.json" - num_labels = 1000 - - repo_id = "huggingface/label-files" - num_labels = num_labels - id2label = json.loads(Path(hf_hub_download(repo_id, img_labels_file, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - - id2label = id2label - label2id = {v: k for k, v in id2label.items()} - - config = config = CvtConfig(num_labels=num_labels, id2label=id2label, label2id=label2id) - - # For depth size 13 (13 = 1+2+10) - if cvt_model.rsplit("/", 1)[-1][4:6] == "13": - config.depth = [1, 2, 10] - - # For depth size 21 (21 = 1+4+16) - elif cvt_model.rsplit("/", 1)[-1][4:6] == "21": - config.depth = [1, 4, 16] - - # For wide cvt (similar to wide-resnet) depth size 24 (w24 = 2 + 2 20) - else: - config.depth = [2, 2, 20] - config.num_heads = [3, 12, 16] - config.embed_dim = [192, 768, 1024] - - model = CvtForImageClassification(config) - image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k") - image_processor.size["shortest_edge"] = image_size - original_weights = torch.load(cvt_file_name, map_location=torch.device("cpu")) - - huggingface_weights = OrderedDict() - list_of_state_dict = [] - - for idx in range(len(config.depth)): - if 
config.cls_token[idx]: - list_of_state_dict = list_of_state_dict + cls_token(idx) - list_of_state_dict = list_of_state_dict + embeddings(idx) - for cnt in range(config.depth[idx]): - list_of_state_dict = list_of_state_dict + attention(idx, cnt) - - list_of_state_dict = list_of_state_dict + final() - for gg in list_of_state_dict: - print(gg) - for i in range(len(list_of_state_dict)): - huggingface_weights[list_of_state_dict[i][0]] = original_weights[list_of_state_dict[i][1]] - - model.load_state_dict(huggingface_weights) - model.save_pretrained(pytorch_dump_folder) - image_processor.save_pretrained(pytorch_dump_folder) - - -# Download the weights from zoo: https://1drv.ms/u/s!AhIXJn_J-blW9RzF3rMW7SsLHa8h?e=blQ0Al - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--cvt_model", - default="cvt-w24", - type=str, - help="Name of the cvt model you'd like to convert.", - ) - parser.add_argument( - "--image_size", - default=384, - type=int, - help="Input Image Size", - ) - parser.add_argument( - "--cvt_file_name", - default=r"cvtmodels\CvT-w24-384x384-IN-22k.pth", - type=str, - help="Input Image Size", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - - args = parser.parse_args() - convert_cvt_checkpoint(args.cvt_model, args.image_size, args.cvt_file_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/dac/convert_dac_checkpoint.py b/src/transformers/models/dac/convert_dac_checkpoint.py deleted file mode 100644 index bfeb96fbdd4e..000000000000 --- a/src/transformers/models/dac/convert_dac_checkpoint.py +++ /dev/null @@ -1,261 +0,0 @@ -# coding=utf-8 -# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
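The DAC converter that follows has one step worth calling out: the original Descript checkpoints store weight-normalised parameters (`weight_g`/`weight_v` pairs rather than a single `weight`), so the script calls `model.apply_weight_norm()` before copying weights and `model.remove_weight_norm()` afterwards to fold everything back into plain `weight` tensors. A self-contained illustration of the mechanism (a sketch, not taken from the removed file):

```python
from torch import nn

# A weight-normalised layer exposes weight_g / weight_v instead of weight.
conv = nn.utils.weight_norm(nn.Conv1d(4, 8, kernel_size=3))
print(sorted(name for name, _ in conv.named_parameters()))  # ['bias', 'weight_g', 'weight_v']

# Folds g * v / ||v|| back into a single weight tensor, as the converter does
# once the original parameters have been copied in.
nn.utils.remove_weight_norm(conv)
print(sorted(name for name, _ in conv.named_parameters()))  # ['bias', 'weight']
```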
-import argparse -import fnmatch -import re - -import torch - -from transformers import ( - DacConfig, - DacFeatureExtractor, - DacModel, - logging, -) - - -# checkpoints downloaded using: -# pip install descript-audio-codec -# python3 -m dac download # downloads the default 44kHz variant -# python3 -m dac download --model_type 44khz # downloads the 44kHz variant -# python3 -m dac download --model_type 24khz # downloads the 24kHz variant -# python3 -m dac download --model_type 16khz # downloads the 16kHz variant -# More informations: https://github.com/descriptinc/descript-audio-codec/tree/main - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.dac") - - -def match_pattern(string, pattern): - # Split the pattern into parts - pattern_parts = pattern.split(".") - string_parts = string.split(".") - - pattern_block_count = string_block_count = 0 - - for part in pattern_parts: - if part.startswith("block"): - pattern_block_count += 1 - - for part in string_parts: - if part.startswith("block"): - string_block_count += 1 - - return fnmatch.fnmatch(string, pattern) and string_block_count == pattern_block_count - - -TOP_LEVEL_KEYS = [] -IGNORE_KEYS = [] - - -MAPPING_ENCODER = { - "encoder.block.0": ["encoder.conv1"], - "encoder.block.5": ["encoder.snake1"], - "encoder.block.6": ["encoder.conv2"], - "encoder.block.*.block.*.block.0".replace("*", r"\d+"): ["encoder.block", "res_unit", "snake1"], - "encoder.block.*.block.*.block.1".replace("*", r"\d+"): ["encoder.block", "res_unit", "conv1"], - "encoder.block.*.block.*.block.2".replace("*", r"\d+"): ["encoder.block", "res_unit", "snake2"], - "encoder.block.*.block.*.block.3".replace("*", r"\d+"): ["encoder.block", "res_unit", "conv2"], - "encoder.block.*.block.3".replace("*", r"\d+"): ["encoder.block", "snake1"], - "encoder.block.*.block.4".replace("*", r"\d+"): ["encoder.block", "conv1"], -} - -MAPPING_QUANTIZER = { - "quantizer.quantizers.*": ["quantizer.quantizers.*"], -} - -MAPPING_DECODER = { - "decoder.model.0": ["decoder.conv1"], - "decoder.model.5": ["decoder.snake1"], - "decoder.model.6": ["decoder.conv2"], - "decoder.model.*.block.0".replace("*", r"\d+"): ["decoder.block", "snake1"], - "decoder.model.*.block.1".replace("*", r"\d+"): ["decoder.block", "conv_t1"], - "decoder.model.*.block.*.block.0".replace("*", r"\d+"): ["decoder.block", "res_unit", "snake1"], - "decoder.model.*.block.*.block.1".replace("*", r"\d+"): ["decoder.block", "res_unit", "conv1"], - "decoder.model.*.block.*.block.2".replace("*", r"\d+"): ["decoder.block", "res_unit", "snake2"], - "decoder.model.*.block.*.block.3".replace("*", r"\d+"): ["decoder.block", "res_unit", "conv2"], -} - - -MAPPING = { - **MAPPING_ENCODER, - **MAPPING_QUANTIZER, - **MAPPING_DECODER, -} - - -def set_recursively(hf_pointer, key, value, full_name, weight_type): - for attribute in key.split("."): - hf_pointer = getattr(hf_pointer, attribute) - - if weight_type is not None: - hf_shape = getattr(hf_pointer, weight_type).shape - else: - hf_shape = hf_pointer.shape - - if hf_shape != value.shape: - raise ValueError( - f"Shape of hf {key + '.' 
+ weight_type if weight_type is not None else ''} is {hf_shape}, but should be" - f" {value.shape} for {full_name}" - ) - - if weight_type == "weight": - hf_pointer.weight.data = value - elif weight_type == "weight_g": - hf_pointer.weight_g.data = value - elif weight_type == "weight_v": - hf_pointer.weight_v.data = value - elif weight_type == "bias": - hf_pointer.bias.data = value - elif weight_type == "alpha": - hf_pointer.alpha.data = value - logger.info(f"{key + ('.' + weight_type if weight_type is not None else '')} was initialized from {full_name}.") - - -def should_ignore(name, ignore_keys): - for key in ignore_keys: - if key.endswith(".*"): - if name.startswith(key[:-1]): - return True - elif ".*." in key: - prefix, suffix = key.split(".*.") - if prefix in name and suffix in name: - return True - elif key in name: - return True - return False - - -def recursively_load_weights(orig_dict, hf_model, model_name): - unused_weights = [] - - if model_name not in ["dac_16khz", "dac_24khz", "dac_44khz"]: - raise ValueError(f"Unsupported model: {model_name}") - - for name, value in orig_dict.items(): - is_used = False - for key, mapped_key in MAPPING.items(): - regex = re.compile(key) - if regex.search(name): - if len(mapped_key) == 1: - if mapped_key[0][0] == "q": - mapped_key = ".".join(name.split(".")[:-1]) - else: - mapped_key = mapped_key[0] - elif len(mapped_key) == 3: - integers = re.findall(r"\b\d+\b", name) - if mapped_key[0][0] == "d": - mapped_key = "{}.{}.{}{}.{}".format( - mapped_key[0], - str(int(integers[0]) - 1), - mapped_key[1], - str(int(integers[1]) - 1), - mapped_key[2], - ) - else: - mapped_key = "{}.{}.{}{}.{}".format( - mapped_key[0], - str(int(integers[0]) - 1), - mapped_key[1], - str(int(integers[1]) + 1), - mapped_key[2], - ) - elif len(mapped_key) == 2: - integers = re.findall(r"\b\d+\b", name) - mapped_key = "{}.{}.{}".format(mapped_key[0], str(int(integers[0]) - 1), mapped_key[1]) - - is_used = True - if "weight_g" in name: - weight_type = "weight_g" - elif "weight_v" in name: - weight_type = "weight_v" - elif "bias" in name: - weight_type = "bias" - elif "alpha" in name: - weight_type = "alpha" - elif "weight" in name: - weight_type = "weight" - set_recursively(hf_model, mapped_key, value, name, weight_type) - - if not is_used: - unused_weights.append(name) - - print(list(set(unused_weights))) - - logger.warning(f"Unused weights: {unused_weights}") - - -@torch.no_grad() -def convert_checkpoint( - model_name, - checkpoint_path, - pytorch_dump_folder_path, - sample_rate=16000, - repo_id=None, -): - model_dict = torch.load(checkpoint_path, "cpu") - - config = DacConfig() - - metadata = model_dict["metadata"]["kwargs"] - config.encoder_hidden_size = metadata["encoder_dim"] - config.downsampling_ratios = metadata["encoder_rates"] - config.codebook_size = metadata["codebook_size"] - config.n_codebooks = metadata["n_codebooks"] - config.codebook_dim = metadata["codebook_dim"] - config.decoder_hidden_size = metadata["decoder_dim"] - config.upsampling_ratios = metadata["decoder_rates"] - config.quantizer_dropout = float(metadata["quantizer_dropout"]) - config.sampling_rate = sample_rate - - model = DacModel(config) - feature_extractor = DacFeatureExtractor() - feature_extractor.sampling_rate = sample_rate - - original_checkpoint = model_dict["state_dict"] - - model.apply_weight_norm() - recursively_load_weights(original_checkpoint, model, model_name) - model.remove_weight_norm() - - model.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the 
hub...") - feature_extractor.push_to_hub(repo_id) - model.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model", - default="dac_44khz", - type=str, - help="The model to convert. Should be one of 'dac_16khz', 'dac_24khz', 'dac_44khz'.", - ) - parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") - parser.add_argument( - "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the đŸ€— hub." - ) - parser.add_argument("--sample_rate", default=None, type=str, help="Sample rate used by DacFeatureExtractor") - args = parser.parse_args() - - convert_checkpoint( - args.model, args.checkpoint_path, args.pytorch_dump_folder_path, args.sample_rate, args.push_to_hub - ) diff --git a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 5339f1671b07..000000000000 --- a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,285 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert Wav2Vec2 checkpoint.""" - -import argparse -import os -from functools import reduce - -import fairseq -import torch -from datasets import load_dataset - -from transformers import Wav2Vec2Processor, logging -from transformers.models.data2vec.configuration_data2vec_audio import Data2VecAudioConfig - -# Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_audio.py -from transformers.models.data2vec.data2vec_audio import Data2VecAudioModel as Dummy # noqa: F401 -from transformers.models.data2vec.modeling_data2vec_audio import Data2VecAudioForCTC, Data2VecAudioModel - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -MAPPING = { - "post_extract_proj": "feature_projection.projection", - "models.0.layer_norm": "feature_projection.layer_norm", - "self_attn.k_proj": "encoder.layers.*.attention.k_proj", - "self_attn.v_proj": "encoder.layers.*.attention.v_proj", - "self_attn.q_proj": "encoder.layers.*.attention.q_proj", - "self_attn.out_proj": "encoder.layers.*.attention.out_proj", - "self_attn_layer_norm": "encoder.layers.*.layer_norm", - "fc1": "encoder.layers.*.feed_forward.intermediate_dense", - "fc2": "encoder.layers.*.feed_forward.output_dense", - "final_layer_norm": "encoder.layers.*.final_layer_norm", - "encoder.layer_norm": "encoder.layer_norm", - "w2v_model.layer_norm": "feature_projection.layer_norm", - "w2v_encoder.proj": "lm_head", - "mask_emb": "masked_spec_embed", -} -TOP_LEVEL_KEYS = [ - "lm_head", -] - - -def set_recursively(hf_pointer, key, value, full_name, weight_type): - for attribute in key.split("."): - hf_pointer = getattr(hf_pointer, attribute) - - if weight_type is not None: - hf_shape = getattr(hf_pointer, weight_type).shape - else: - hf_shape = hf_pointer.shape - - if hf_shape != value.shape: - raise ValueError( - f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be" - f" {value.shape} for {full_name}" - ) - - if weight_type == "weight": - hf_pointer.weight.data = value - elif weight_type == "weight_g": - hf_pointer.weight_g.data = value - elif weight_type == "weight_v": - hf_pointer.weight_v.data = value - elif weight_type == "bias": - hf_pointer.bias.data = value - else: - hf_pointer.data = value - - logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.") - - -def recursively_load_weights(fairseq_model, hf_model, is_headless): - unused_weights = [] - fairseq_dict = fairseq_model.state_dict() - - if not is_headless: - feature_extractor = hf_model.data2vec_audio.feature_extractor - pos_conv_embedding = hf_model.data2vec_audio.encoder.pos_conv_embed - - else: - feature_extractor = hf_model.feature_extractor - pos_conv_embedding = hf_model.encoder.pos_conv_embed - - for name, value in fairseq_dict.items(): - is_used = False - if "conv_layers" in name: - load_conv_layer( - name, - value, - feature_extractor, - unused_weights, - ) - is_used = True - elif "pos_conv" in name: - load_pos_conv_layer( - name, - value, - pos_conv_embedding, - unused_weights, - ) - is_used = True - else: - for key, mapped_key in MAPPING.items(): - if not is_headless: - mapped_key = "data2vec_audio." 
+ mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key - if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]: - is_used = True - if "*" in mapped_key: - layer_index = name.split(key)[0].split(".")[-2] - mapped_key = mapped_key.replace("*", layer_index) - if "weight_g" in name: - weight_type = "weight_g" - elif "weight_v" in name: - weight_type = "weight_v" - elif "bias" in name: - weight_type = "bias" - elif "weight" in name: - # TODO: don't match quantizer.weight_proj - weight_type = "weight" - else: - weight_type = None - set_recursively(hf_model, mapped_key, value, name, weight_type) - continue - if not is_used: - unused_weights.append(name) - - logger.warning(f"Unused weights: {unused_weights}") - - -def access_by_string(module, path): - names = path.split(".") - return reduce(getattr, names, module) - - -def set_weights(full_name, module, fsq_value, hf_weight_path): - hf_weight = access_by_string(module, hf_weight_path) - hf_value = hf_weight.data - - if fsq_value.shape != hf_value.shape: - raise ValueError(f"{full_name} has size {fsq_value.shape}, but {hf_value.shape} was found.") - hf_weight.data = fsq_value - logger.info(f"{full_name} was correctly initialized from {hf_weight_path}.") - - -def load_conv_layer(full_name, value, feature_extractor, unused_weights): - name = full_name.split("conv_layers.")[-1] - items = name.split(".") - layer_id = int(items[0]) - type_id = int(items[1]) - - weight_type = name.split(".")[-1] - if type_id == 0: - layer_type = "conv" - elif type_id == 2: - layer_type = "layer_norm" - else: - unused_weights.append(full_name) - return - - set_weights(full_name, feature_extractor, value, f"conv_layers.{layer_id}.{layer_type}.{weight_type}") - - -def load_pos_conv_layer(full_name, value, pos_conv_embeddings, unused_weights): - name = full_name.split("pos_conv.")[-1] - items = name.split(".") - layer_id = int(items[0]) - type_id = int(items[1]) - - weight_type = name.split(".")[-1] - if type_id != 0: - unused_weights.append(full_name) - return - else: - layer_type = "conv" - - set_weights(full_name, pos_conv_embeddings, value, f"layers.{layer_id}.{layer_type}.{weight_type}") - - -@torch.no_grad() -def convert_wav2vec2_checkpoint( - checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True -): - """ - Copy/paste/tweak model's weights to transformers design. 
- """ - if config_path is not None: - config = Data2VecAudioConfig.from_pretrained(config_path) - else: - config = Data2VecAudioConfig() - - if not is_finetuned: - # Modify final_proj layer name - hf_wav2vec = Data2VecAudioModel(config) - data2vec_checkpoint_dir = os.path.dirname(checkpoint_path) - - state_dict = torch.load(checkpoint_path) - state_dict["model"]["final_proj.weight"] = state_dict["model"].pop("final_proj.0.weight") - state_dict["model"]["final_proj.bias"] = state_dict["model"].pop("final_proj.0.bias") - converted_ckpt = os.path.join(data2vec_checkpoint_dir, "converted.pt") - torch.save(state_dict, converted_ckpt) - else: - hf_wav2vec = Data2VecAudioForCTC(config) - converted_ckpt = checkpoint_path - - def load_data2vec(path): - model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([path]) - return model[0].eval() - - model = load_data2vec(converted_ckpt) - - recursively_load_weights(model, hf_wav2vec, not is_finetuned) - - processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60") - - ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) - input_audio = [x["array"] for x in ds[:4]["audio"]] - - inputs = processor(input_audio, return_tensors="pt", padding=True) - - input_values = inputs.input_values - attention_mask = inputs.attention_mask - # input_values = inputs.input_values[:, :-1] - # attention_mask = inputs.attention_mask[:, :-1] - - hf_wav2vec.eval() - model.eval() - if is_finetuned: - their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)[ - "encoder_out" - ].transpose(0, 1) - our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["logits"] - - pred_ids = torch.argmax(our_output, dim=-1) - output_string = processor.batch_decode(pred_ids) - - print(f"Expected Output: {ds[:4]['text']}, Pred: {output_string}") - else: - their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)[ - "layer_results" - ][-1][0].transpose(0, 1) - our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["last_hidden_state"] - - print(our_output.shape, their_output.shape) - max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 - success = torch.allclose(our_output, their_output, atol=1e-3) - print("Do both models output the same tensors?", "đŸ”„" if success else "đŸ’©") - if not success: - raise Exception("Something went wRoNg") - - hf_wav2vec.save_pretrained(pytorch_dump_folder_path) - - if is_finetuned: - processor.save_pretrained(pytorch_dump_folder_path) - else: - processor.feature_extractor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint") - parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - parser.add_argument( - "--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not" - ) - args = parser.parse_args() - convert_wav2vec2_checkpoint( - args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, 
args.dict_path, not args.not_finetuned - ) diff --git a/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 10b97dc93d0a..000000000000 --- a/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,207 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert data2vec checkpoint.""" - -import argparse -import os -import pathlib - -import fairseq -import torch -from fairseq.modules import TransformerSentenceEncoderLayer -from packaging import version - -from transformers import ( - Data2VecTextConfig, - Data2VecTextForMaskedLM, - Data2VecTextForSequenceClassification, - Data2VecTextModel, -) -from transformers.models.bert.modeling_bert import ( - BertIntermediate, - BertLayer, - BertOutput, - BertSelfAttention, - BertSelfOutput, -) - -# IMPORTANT: In order for this script to run, please make sure to download the dictionary: `dict.txt` from wget https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz -# File copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_text.py -from transformers.utils import logging - - -if version.parse(fairseq.__version__) < version.parse("0.9.0"): - raise Exception("requires fairseq >= 0.9.0") - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_TEXT = "Hello world! cĂ©cĂ© herlolip" - - -def convert_data2vec_checkpoint_to_pytorch( - data2vec_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool -): - """ - Copy/paste/tweak data2vec's weights to our BERT structure. - """ - data2vec_checkpoint_dir, data2vec_checkpoint_file_name = os.path.split(data2vec_checkpoint_path) - data2vec = Data2VecTextModel.from_pretrained( - data2vec_checkpoint_dir, checkpoint_file=data2vec_checkpoint_file_name - ) - data2vec.eval() # disable dropout - data2vec_model = data2vec.models[0] - data2vec_sent_encoder = data2vec_model.encoder.sentence_encoder - config = Data2VecTextConfig( - vocab_size=data2vec_sent_encoder.embed_tokens.num_embeddings, - hidden_size=data2vec_model.args.encoder_embed_dim, - num_hidden_layers=data2vec_model.args.encoder_layers, - num_attention_heads=data2vec_model.args.encoder_attention_heads, - intermediate_size=data2vec_model.args.encoder_ffn_embed_dim, - max_position_embeddings=514, - type_vocab_size=1, - layer_norm_eps=1e-5, # PyTorch default used in fairseq - ) - if classification_head: - config.num_labels = data2vec.model.classification_heads["mnli"].out_proj.weight.shape[0] - print("Our BERT config:", config) - - model = Data2VecTextForSequenceClassification(config) if classification_head else Data2VecTextForMaskedLM(config) - model.eval() - - # Now let's copy all the weights. 
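# Reading aid for the layer loop further below: it copies one fairseq TransformerSentenceEncoderLayer
# into one HF BertLayer. The correspondence, inferred from the assignments that follow, is:
#   self_attn.{q,k,v}_proj   -> attention.self.{query,key,value}
#   self_attn.out_proj       -> attention.output.dense
#   self_attn_layer_norm     -> attention.output.LayerNorm
#   fc1 / fc2                -> intermediate.dense / output.dense
#   final_layer_norm         -> output.LayerNorm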
- # Embeddings - model.data2vec_text.embeddings.word_embeddings.weight = data2vec_sent_encoder.embed_tokens.weight - model.data2vec_text.embeddings.position_embeddings.weight = data2vec_sent_encoder.embed_positions.weight - model.data2vec_text.embeddings.token_type_embeddings.weight.data = torch.zeros_like( - model.data2vec_text.embeddings.token_type_embeddings.weight - ) # just zero them out b/c data2vec doesn't use them. - model.data2vec_text.embeddings.LayerNorm.weight = data2vec_sent_encoder.layernorm_embedding.weight - model.data2vec_text.embeddings.LayerNorm.bias = data2vec_sent_encoder.layernorm_embedding.bias - - for i in range(config.num_hidden_layers): - # Encoder: start of layer - layer: BertLayer = model.data2vec_text.encoder.layer[i] - data2vec_layer: TransformerSentenceEncoderLayer = data2vec_sent_encoder.layers[i] - - # self attention - self_attn: BertSelfAttention = layer.attention.self - assert data2vec_layer.self_attn.k_proj.weight.data.shape == torch.Size( - (config.hidden_size, config.hidden_size) - ), ( - "Shape for data2vec_layer.self_attn.k_proj.weight.data should be" - f" {torch.Size((config.hidden_size, config.hidden_size))}" - ) - assert data2vec_layer.self_attn.q_proj.weight.data.shape == torch.Size( - (config.hidden_size, config.hidden_size) - ), ( - "Shape for data2vec_layer.self_attn.q_proj.weight.data should be" - f" {torch.Size((config.hidden_size, config.hidden_size))}" - ) - assert data2vec_layer.self_attn.v_proj.weight.data.shape == torch.Size( - (config.hidden_size, config.hidden_size) - ), ( - "Shape for data2vec_layer.self_attn.v_proj.weight.data should be" - f" {torch.Size((config.hidden_size, config.hidden_size))}" - ) - - self_attn.query.weight.data = data2vec_layer.self_attn.q_proj.weight - self_attn.query.bias.data = data2vec_layer.self_attn.q_proj.bias - self_attn.key.weight.data = data2vec_layer.self_attn.k_proj.weight - self_attn.key.bias.data = data2vec_layer.self_attn.k_proj.bias - self_attn.value.weight.data = data2vec_layer.self_attn.v_proj.weight - self_attn.value.bias.data = data2vec_layer.self_attn.v_proj.bias - - # self-attention output - self_output: BertSelfOutput = layer.attention.output - assert ( - self_output.dense.weight.shape == data2vec_layer.self_attn.out_proj.weight.shape - ), f"Shape for self_output.dense.weight should be {data2vec_layer.self_attn.out_proj.weight.shape}" - self_output.dense.weight = data2vec_layer.self_attn.out_proj.weight - self_output.dense.bias = data2vec_layer.self_attn.out_proj.bias - self_output.LayerNorm.weight = data2vec_layer.self_attn_layer_norm.weight - self_output.LayerNorm.bias = data2vec_layer.self_attn_layer_norm.bias - - # intermediate - intermediate: BertIntermediate = layer.intermediate - assert ( - intermediate.dense.weight.shape == data2vec_layer.fc1.weight.shape - ), f"Shape for intermediate.dense.weight should be {data2vec_layer.fc1.weight.shape}" - intermediate.dense.weight = data2vec_layer.fc1.weight - intermediate.dense.bias = data2vec_layer.fc1.bias - - # output - bert_output: BertOutput = layer.output - assert ( - bert_output.dense.weight.shape == data2vec_layer.fc2.weight.shape - ), f"Shape for bert_output.dense.weight should be {data2vec_layer.fc2.weight.shape}" - bert_output.dense.weight = data2vec_layer.fc2.weight - bert_output.dense.bias = data2vec_layer.fc2.bias - bert_output.LayerNorm.weight = data2vec_layer.final_layer_norm.weight - bert_output.LayerNorm.bias = data2vec_layer.final_layer_norm.bias - # end of layer - - if classification_head: - model.classifier.dense.weight 
= data2vec.model.classification_heads["mnli"].dense.weight - model.classifier.dense.bias = data2vec.model.classification_heads["mnli"].dense.bias - model.classifier.out_proj.weight = data2vec.model.classification_heads["mnli"].out_proj.weight - model.classifier.out_proj.bias = data2vec.model.classification_heads["mnli"].out_proj.bias - else: - # LM Head - model.lm_head.dense.weight = data2vec_model.encoder.lm_head.dense.weight - model.lm_head.dense.bias = data2vec_model.encoder.lm_head.dense.bias - model.lm_head.layer_norm.weight = data2vec_model.encoder.lm_head.layer_norm.weight - model.lm_head.layer_norm.bias = data2vec_model.encoder.lm_head.layer_norm.bias - model.lm_head.decoder.weight = data2vec_model.encoder.lm_head.weight - model.lm_head.decoder.bias = data2vec_model.encoder.lm_head.bias - - # Let's check that we get the same results. - input_ids: torch.Tensor = data2vec.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 - - our_output = model(input_ids)[0] - if classification_head: - their_output = data2vec.model.classification_heads["mnli"](data2vec.extract_features(input_ids)) - else: - their_output = data2vec_model(input_ids)[0] - print(our_output.shape, their_output.shape) - max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 - success = torch.allclose(our_output, their_output, atol=1e-3) - print("Do both models output the same tensors?", "đŸ”„" if success else "đŸ’©") - if not success: - raise Exception("Something went wRoNg") - - pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--classification_head", action="store_true", help="Whether to convert a final classification head." - ) - args = parser.parse_args() - convert_data2vec_checkpoint_to_pytorch( - args.checkpoint_path, args.pytorch_dump_folder_path, args.classification_head - ) diff --git a/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100755 index 0c6f42f4ba7f..000000000000 --- a/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,374 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import json - -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from timm.models import create_model - -from transformers import ( - BeitImageProcessor, - Data2VecVisionConfig, - Data2VecVisionForImageClassification, - Data2VecVisionModel, -) - - -def create_rename_keys(config, has_lm_head=False, is_semantic=False, hf_prefix="data2vec."): - prefix = "backbone." 
if is_semantic else "" - - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append( - (f"{prefix}blocks.{i}.norm1.weight", f"{hf_prefix}encoder.layer.{i}.layernorm_before.weight") - ) - rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"{hf_prefix}encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.weight", f"{hf_prefix}encoder.layer.{i}.attention.output.dense.weight") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.bias", f"{hf_prefix}encoder.layer.{i}.attention.output.dense.bias") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.norm2.weight", f"{hf_prefix}encoder.layer.{i}.layernorm_after.weight") - ) - rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"{hf_prefix}encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append( - (f"{prefix}blocks.{i}.mlp.fc1.weight", f"{hf_prefix}encoder.layer.{i}.intermediate.dense.weight") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.mlp.fc1.bias", f"{hf_prefix}encoder.layer.{i}.intermediate.dense.bias") - ) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"{hf_prefix}encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"{hf_prefix}encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - (f"{prefix}cls_token", f"{hf_prefix}embeddings.cls_token"), - (f"{prefix}patch_embed.proj.weight", f"{hf_prefix}embeddings.patch_embeddings.projection.weight"), - (f"{prefix}patch_embed.proj.bias", f"{hf_prefix}embeddings.patch_embeddings.projection.bias"), - ] - ) - - if has_lm_head: - # mask token + shared relative position bias + layernorm - rename_keys.extend( - [ - ("mask_token", f"{hf_prefix}embeddings.mask_token"), - ( - "rel_pos_bias.relative_position_bias_table", - f"{hf_prefix}encoder.relative_position_bias.relative_position_bias_table", - ), - ( - "rel_pos_bias.relative_position_index", - f"{hf_prefix}encoder.relative_position_bias.relative_position_index", - ), - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ] - ) - elif is_semantic: - # semantic segmentation classification heads - rename_keys.extend( - [ - ("decode_head.conv_seg.weight", "decode_head.classifier.weight"), - ("decode_head.conv_seg.bias", "decode_head.classifier.bias"), - ("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"), - ("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"), - ] - ) - else: - # layernorm + classification head - rename_keys.extend( - [ - ("fc_norm.weight", f"{hf_prefix}pooler.layernorm.weight"), - ("fc_norm.bias", f"{hf_prefix}pooler.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - - return rename_keys - - -def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False, hf_prefix="data2vec_vision."): - for i in range(config.num_hidden_layers): - prefix = "backbone." 
if is_semantic else "" - # queries, keys and values - in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight") - q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias") - - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.query.bias"] = q_bias - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.value.bias"] = v_bias - - # gamma_1 and gamma_2 - # we call them lambda because otherwise they are renamed when using .from_pretrained - gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1") - gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2") - - state_dict[f"{hf_prefix}encoder.layer.{i}.lambda_1"] = gamma_1 - state_dict[f"{hf_prefix}encoder.layer.{i}.lambda_2"] = gamma_2 - - # relative_position bias table + index - if not has_lm_head: - # each layer has its own relative position bias - table = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_bias_table") - index = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_index") - - state_dict[ - f"{hf_prefix}encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table" - ] = table - state_dict[ - f"{hf_prefix}encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index" - ] = index - - -def get_args(): - parser = argparse.ArgumentParser( - "Convert Data2VecVision to HF for image classification and pretraining", add_help=False - ) - parser.add_argument("--hf_checkpoint_name", type=str) - parser.add_argument("--input_size", default=224, type=int, help="images input size") - parser.add_argument("--beit_checkpoint", default="", help="beit checkpoint") - - return parser.parse_args() - - -def load_beit_model(args, is_finetuned, is_large): - def load_state_dict(model, state_dict, prefix="", ignore_missing="relative_position_index"): - missing_keys = [] - unexpected_keys = [] - error_msgs = [] - # copy state_dict so _load_from_state_dict can modify it - metadata = getattr(state_dict, "_metadata", None) - state_dict = state_dict.copy() - if metadata is not None: - state_dict._metadata = metadata - - def load(module, prefix=""): - local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - module._load_from_state_dict( - state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs - ) - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + ".") - - load(model, prefix=prefix) - - warn_missing_keys = [] - ignore_missing_keys = [] - for key in missing_keys: - keep_flag = True - for ignore_key in ignore_missing.split("|"): - if ignore_key in key: - keep_flag = False - break - if keep_flag: - warn_missing_keys.append(key) - else: - ignore_missing_keys.append(key) - - missing_keys = warn_missing_keys - - if len(missing_keys) > 0: - print( - "Weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, missing_keys - ) - ) - if len(unexpected_keys) > 0: - print("Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)) - if 
len(ignore_missing_keys) > 0: - print( - "Ignored weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, ignore_missing_keys - ) - ) - if len(error_msgs) > 0: - print("\n".join(error_msgs)) - - model_kwargs = { - "pretrained": False, - "use_shared_rel_pos_bias": True, - "use_abs_pos_emb": False, - "init_values": 0.1, - } - - if is_finetuned: - model_kwargs.update( - { - "num_classes": 1000, - "use_mean_pooling": True, - "init_scale": 0.001, - "use_rel_pos_bias": True, - } - ) - - model = create_model( - "beit_large_patch16_224" if is_large else "beit_base_patch16_224", - **model_kwargs, - ) - patch_size = model.patch_embed.patch_size - args.window_size = (args.input_size // patch_size[0], args.input_size // patch_size[1]) - checkpoint = torch.load(args.beit_checkpoint, map_location="cpu") - - print(f"Load ckpt from {args.beit_checkpoint}") - checkpoint_model = None - for model_key in ("model", "module"): - if model_key in checkpoint: - checkpoint_model = checkpoint[model_key] - print(f"Load state_dict by model_key = {model_key}") - break - - all_keys = list(checkpoint_model.keys()) - for key in all_keys: - if "relative_position_index" in key: - checkpoint_model.pop(key) - - if "relative_position_bias_table" in key: - rel_pos_bias = checkpoint_model[key] - src_num_pos, num_attn_heads = rel_pos_bias.size() - dst_num_pos, _ = model.state_dict()[key].size() - dst_patch_shape = model.patch_embed.patch_shape - if dst_patch_shape[0] != dst_patch_shape[1]: - raise NotImplementedError() - - load_state_dict(model, checkpoint_model, prefix="") - - return model - - -def main(): - args = get_args() - - is_finetuned = "ft1k" in args.hf_checkpoint_name - is_large = "large" in args.hf_checkpoint_name - - if is_finetuned: - # To convert Beit's data2vec_vision to HF you need to copy - # https://github.com/facebookresearch/data2vec_vision/blob/main/beit/modeling_finetune.py - # into this folder. - import modeling_finetune # noqa: F401 - else: - # To convert Beit's data2vec_vision to HF you need to copy - # https://github.com/facebookresearch/data2vec_vision/blob/main/beit/modeling_cyclical.py - # into this folder - # IMPORTANT: Note that for now we've only converted the down-stream - # model and not the full pretrained model. This means for the integration - # test you need to add a `return x` after the following line: - # https://github.com/facebookresearch/data2vec_vision/blob/af9a36349aaed59ae66e69b5dabeef2d62fdc5da/beit/modeling_cyclical.py#L197 - # to make the integration test pass. - import modeling_cyclical # noqa: F401 - - # 1. Create model config - config = Data2VecVisionConfig() - if is_finetuned: - config.use_relative_position_bias = True - config.use_shared_relative_position_bias = False - config.use_mean_pooling = True - config.num_labels = 1000 - - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - else: - config.use_relative_position_bias = False - config.use_shared_relative_position_bias = True - config.use_mean_pooling = False - - if is_large: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - - # 2. Load Beit model - orig_model = load_beit_model(args, is_finetuned, is_large) - orig_model.eval() - - # 3. 
Forward Beit model - image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False) - image = Image.open("../../../../tests/fixtures/tests_samples/COCO/000000039769.png") - encoding = image_processor(images=image, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - orig_args = (pixel_values,) if is_finetuned else (pixel_values, None) - with torch.no_grad(): - orig_model_output = orig_model(*orig_args) - - # 4. Load HF Data2VecVision model - if is_finetuned: - hf_model = Data2VecVisionForImageClassification(config) - hf_model.eval() - has_lm_head = False - hf_prefix = "data2vec_vision." - else: - hf_model = Data2VecVisionModel(config) - hf_model.eval() - has_lm_head = True - hf_prefix = "" - - rename_keys = create_rename_keys(config, hf_prefix=hf_prefix, has_lm_head=has_lm_head) - state_dict = orig_model.state_dict() - for src, dest in rename_keys: - val = state_dict.pop(src) - state_dict[dest] = val - - read_in_q_k_v(state_dict, config, hf_prefix=hf_prefix, has_lm_head=has_lm_head) - missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False) - print("HF missing", missing_keys) - print("HF unexpected_keys", unexpected_keys) - - # 5. Forward HF Data2VecVision model - with torch.no_grad(): - hf_model_output = hf_model(pixel_values) - - hf_output = hf_model_output.logits if is_finetuned else hf_model_output.last_hidden_state - - # 6. Compare - max_absolute_diff = torch.max(torch.abs(hf_output - orig_model_output)).item() - - print(f"max_absolute_diff = {max_absolute_diff}") - success = torch.allclose(hf_output, orig_model_output, atol=1e-3) - print("Do both models output the same tensors?", "đŸ”„" if success else "đŸ’©") - if not success: - raise Exception("Something went wRoNg") - - # 7. Save - print(f"Saving to {args.hf_checkpoint_name}") - hf_model.save_pretrained(args.hf_checkpoint_name) - image_processor.save_pretrained(args.hf_checkpoint_name) - - -if __name__ == "__main__": - main() - # Run the following to convert checkpoints - # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \ - # --beit_checkpoint ./pretrained_base.pt \ - # --hf_checkpoint_name "./data2vec-vision-base" - # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \ - # --beit_checkpoint ./finetuned_base.pt \ - # --hf_checkpoint_name "./data2vec-vision-base-ft1k" - # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \ - # --beit_checkpoint ./pretrained_large.pt \ - # --hf_checkpoint_name "./data2vec-vision-large" - # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \ - # --beit_checkpoint ./finetuned_large.pt \ - # --hf_checkpoint_name "./data2vec-vision-large-ft1k" diff --git a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py b/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py deleted file mode 100644 index 781b823e96f3..000000000000 --- a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py +++ /dev/null @@ -1,236 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Deformable DETR checkpoints.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DeformableDetrConfig, DeformableDetrForObjectDetection, DeformableDetrImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def rename_key(orig_key): - if "backbone.0.body" in orig_key: - orig_key = orig_key.replace("backbone.0.body", "backbone.conv_encoder.model") - if "transformer" in orig_key: - orig_key = orig_key.replace("transformer.", "") - if "norm1" in orig_key: - if "encoder" in orig_key: - orig_key = orig_key.replace("norm1", "self_attn_layer_norm") - else: - orig_key = orig_key.replace("norm1", "encoder_attn_layer_norm") - if "norm2" in orig_key: - if "encoder" in orig_key: - orig_key = orig_key.replace("norm2", "final_layer_norm") - else: - orig_key = orig_key.replace("norm2", "self_attn_layer_norm") - if "norm3" in orig_key: - orig_key = orig_key.replace("norm3", "final_layer_norm") - if "linear1" in orig_key: - orig_key = orig_key.replace("linear1", "fc1") - if "linear2" in orig_key: - orig_key = orig_key.replace("linear2", "fc2") - if "query_embed" in orig_key: - orig_key = orig_key.replace("query_embed", "query_position_embeddings") - if "cross_attn" in orig_key: - orig_key = orig_key.replace("cross_attn", "encoder_attn") - - return orig_key - - -def read_in_q_k_v(state_dict): - # transformer decoder self-attention layers - for i in range(6): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_deformable_detr_checkpoint( - checkpoint_path, - single_scale, - dilation, - with_box_refine, - two_stage, - pytorch_dump_folder_path, - push_to_hub, -): - """ - Copy/paste/tweak model's weights to our Deformable DETR structure. 
- """ - - # load default config - config = DeformableDetrConfig() - # set config attributes - if single_scale: - config.num_feature_levels = 1 - config.dilation = dilation - config.with_box_refine = with_box_refine - config.two_stage = two_stage - # set labels - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load image processor - image_processor = DeformableDetrImageProcessor(format="coco_detection") - - # prepare image - img = prepare_img() - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info("Converting model...") - - # load original state dict - state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "model." - for key in state_dict.copy().keys(): - if not key.startswith("class_embed") and not key.startswith("bbox_embed"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = DeformableDetrForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - # verify our conversion - outputs = model(pixel_values.to(device)) - - expected_logits = torch.tensor( - [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] - ) - expected_boxes = torch.tensor([[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]]) - - if single_scale: - expected_logits = torch.tensor( - [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] - ) - expected_boxes = torch.tensor([[0.7292, 0.4991, 0.5532], [0.7959, 0.2426, 0.4236], [0.7582, 0.3518, 0.4451]]) - - if single_scale and dilation: - expected_logits = torch.tensor( - [[-8.9652, -4.1074, -5.6635], [-9.0596, -4.9447, -6.6075], [-10.1178, -4.5275, -6.2671]] - ) - expected_boxes = torch.tensor([[0.7665, 0.4130, 0.4769], [0.8364, 0.1841, 0.3391], [0.6261, 0.3895, 0.7978]]) - - if with_box_refine: - expected_logits = torch.tensor( - [[-8.8895, -5.4187, -6.8153], [-8.4706, -6.1668, -7.6184], [-9.0042, -5.5359, -6.9141]] - ) - expected_boxes = torch.tensor([[0.7828, 0.2208, 0.4323], [0.0892, 0.5996, 0.1319], [0.5524, 0.6389, 0.8914]]) - - if with_box_refine and two_stage: - expected_logits = torch.tensor( - [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] - ) - expected_boxes = torch.tensor([[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]]) - - print("Logits:", outputs.logits[0, :3, :3]) - - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) - - print("Everything ok!") - - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - 
Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - # Push to hub - if push_to_hub: - model_name = "deformable-detr" - model_name += "-single-scale" if single_scale else "" - model_name += "-dc5" if dilation else "" - model_name += "-with-box-refine" if with_box_refine else "" - model_name += "-two-stage" if two_stage else "" - print("Pushing model to hub...") - model.push_to_hub(repo_path_or_name=model_name, organization="nielsr", commit_message="Add model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", - type=str, - default="/home/niels/checkpoints/deformable_detr/r50_deformable_detr-checkpoint.pth", - help="Path to Pytorch checkpoint (.pth file) you'd like to convert.", - ) - parser.add_argument("--single_scale", action="store_true", help="Whether to set config.num_features_levels = 1.") - parser.add_argument("--dilation", action="store_true", help="Whether to set config.dilation=True.") - parser.add_argument("--with_box_refine", action="store_true", help="Whether to set config.with_box_refine=True.") - parser.add_argument("--two_stage", action="store_true", help="Whether to set config.two_stage=True.") - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the folder to output PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - args = parser.parse_args() - convert_deformable_detr_checkpoint( - args.checkpoint_path, - args.single_scale, - args.dilation, - args.with_box_refine, - args.two_stage, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/deit/convert_deit_timm_to_pytorch.py b/src/transformers/models/deit/convert_deit_timm_to_pytorch.py deleted file mode 100644 index e7bf3e7a12e8..000000000000 --- a/src/transformers/models/deit/convert_deit_timm_to_pytorch.py +++ /dev/null @@ -1,218 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
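# A minimal usage sketch for a checkpoint produced by the removed script below, assuming the
# "facebook/deit-base-distilled-patch16-224" hub repo id (an assumption, not taken from this file):

import requests
import torch
from PIL import Image

from transformers import DeiTForImageClassificationWithTeacher, DeiTImageProcessor

processor = DeiTImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
model = DeiTForImageClassificationWithTeacher.from_pretrained("facebook/deit-base-distilled-patch16-224")

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # average of the class and distillation head logits
print(model.config.id2label[logits.argmax(-1).item()])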
-"""Convert DeiT distilled checkpoints from the timm library.""" - -import argparse -import json -from pathlib import Path - -import requests -import timm -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DeiTConfig, DeiTForImageClassificationWithTeacher, DeiTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, base_model=False): - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"deit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"deit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"deit.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"deit.encoder.layer.{i}.attention.output.dense.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"deit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"deit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"deit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"deit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"deit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"deit.encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - ("cls_token", "deit.embeddings.cls_token"), - ("dist_token", "deit.embeddings.distillation_token"), - ("patch_embed.proj.weight", "deit.embeddings.patch_embeddings.projection.weight"), - ("patch_embed.proj.bias", "deit.embeddings.patch_embeddings.projection.bias"), - ("pos_embed", "deit.embeddings.position_embeddings"), - ] - ) - - if base_model: - # layernorm + pooler - rename_keys.extend( - [ - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ("pre_logits.fc.weight", "pooler.dense.weight"), - ("pre_logits.fc.bias", "pooler.dense.bias"), - ] - ) - - # if just the base model, we should remove "deit" from all keys that start with "deit" - rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("deit") else pair for pair in rename_keys] - else: - # layernorm + classification heads - rename_keys.extend( - [ - ("norm.weight", "deit.layernorm.weight"), - ("norm.bias", "deit.layernorm.bias"), - ("head.weight", "cls_classifier.weight"), - ("head.bias", "cls_classifier.bias"), - ("head_dist.weight", "distillation_classifier.weight"), - ("head_dist.bias", "distillation_classifier.bias"), - ] - ) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, base_model=False): - for i in range(config.num_hidden_layers): - if base_model: - prefix = "" - else: - prefix = "deit." 
- # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_deit_checkpoint(deit_name, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our DeiT structure. - """ - - # define default DeiT configuration - config = DeiTConfig() - # all deit models have fine-tuned heads - base_model = False - # dataset (fine-tuned on ImageNet 2012), patch_size and image_size - config.num_labels = 1000 - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.patch_size = int(deit_name[-6:-4]) - config.image_size = int(deit_name[-3:]) - # size of the architecture - if deit_name[9:].startswith("tiny"): - config.hidden_size = 192 - config.intermediate_size = 768 - config.num_hidden_layers = 12 - config.num_attention_heads = 3 - elif deit_name[9:].startswith("small"): - config.hidden_size = 384 - config.intermediate_size = 1536 - config.num_hidden_layers = 12 - config.num_attention_heads = 6 - if deit_name[9:].startswith("base"): - pass - elif deit_name[4:].startswith("large"): - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - - # load original model from timm - timm_model = timm.create_model(deit_name, pretrained=True) - timm_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = timm_model.state_dict() - rename_keys = create_rename_keys(config, base_model) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, base_model) - - # load HuggingFace model - model = DeiTForImageClassificationWithTeacher(config).eval() - model.load_state_dict(state_dict) - - # Check outputs on an image, prepared by DeiTImageProcessor - size = int( - (256 / 224) * config.image_size - ) # to maintain same ratio w.r.t. 
224 images, see https://github.com/facebookresearch/deit/blob/ab5715372db8c6cad5740714b2216d55aeae052e/datasets.py#L103 - image_processor = DeiTImageProcessor(size=size, crop_size=config.image_size) - encoding = image_processor(images=prepare_img(), return_tensors="pt") - pixel_values = encoding["pixel_values"] - outputs = model(pixel_values) - - timm_logits = timm_model(pixel_values) - assert timm_logits.shape == outputs.logits.shape - assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {deit_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--deit_name", - default="vit_deit_base_distilled_patch16_224", - type=str, - help="Name of the DeiT timm model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - - args = parser.parse_args() - convert_deit_checkpoint(args.deit_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py deleted file mode 100644 index e2f64e9c3cd1..000000000000 --- a/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py +++ /dev/null @@ -1,318 +0,0 @@ -# coding=utf-8 -# Copyright 2020, The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Bort checkpoint.""" - -import argparse -import os - -import gluonnlp as nlp -import mxnet as mx -import numpy as np -import torch -from gluonnlp.base import get_home_dir -from gluonnlp.model.bert import BERTEncoder -from gluonnlp.model.utils import _load_vocab -from gluonnlp.vocab import Vocab -from packaging import version -from torch import nn - -from transformers import BertConfig, BertForMaskedLM, BertModel, RobertaTokenizer -from transformers.models.bert.modeling_bert import ( - BertIntermediate, - BertLayer, - BertOutput, - BertSelfAttention, - BertSelfOutput, -) -from transformers.utils import logging - - -if version.parse(nlp.__version__) != version.parse("0.8.3"): - raise Exception("requires gluonnlp == 0.8.3") - -if version.parse(mx.__version__) != version.parse("1.5.0"): - raise Exception("requires mxnet == 1.5.0") - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_TEXT = "The Nymphenburg Palace is a beautiful palace in Munich!" 
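# Naming note, derived from the hyperparameter mapping in the function below: gluonnlp's
# "units"/"embed_size" (1024) become the HF hidden_size, while gluonnlp's "hidden_size" (768)
# is the feed-forward width and becomes the HF intermediate_size, so the two names are swapped
# relative to their usual BertConfig meaning.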
- - -def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_folder_path: str): - """ - Convert the original Bort checkpoint (based on MXNET and Gluonnlp) to our BERT structure- - """ - - # Original Bort configuration - bort_4_8_768_1024_hparams = { - "attention_cell": "multi_head", - "num_layers": 4, - "units": 1024, - "hidden_size": 768, - "max_length": 512, - "num_heads": 8, - "scaled": True, - "dropout": 0.1, - "use_residual": True, - "embed_size": 1024, - "embed_dropout": 0.1, - "word_embed": None, - "layer_norm_eps": 1e-5, - "token_type_vocab_size": 2, - } - - predefined_args = bort_4_8_768_1024_hparams - - # Let's construct the original Bort model here - # Taken from official BERT implementation, see: - # https://github.com/alexa/bort/blob/master/bort/bort.py - encoder = BERTEncoder( - attention_cell=predefined_args["attention_cell"], - num_layers=predefined_args["num_layers"], - units=predefined_args["units"], - hidden_size=predefined_args["hidden_size"], - max_length=predefined_args["max_length"], - num_heads=predefined_args["num_heads"], - scaled=predefined_args["scaled"], - dropout=predefined_args["dropout"], - output_attention=False, - output_all_encodings=False, - use_residual=predefined_args["use_residual"], - activation=predefined_args.get("activation", "gelu"), - layer_norm_eps=predefined_args.get("layer_norm_eps", None), - ) - - # Vocab information needs to be fetched first - # It's the same as RoBERTa, so RobertaTokenizer can be used later - vocab_name = "openwebtext_ccnews_stories_books_cased" - - # Specify download folder to Gluonnlp's vocab - gluon_cache_dir = os.path.join(get_home_dir(), "models") - bort_vocab = _load_vocab(vocab_name, None, gluon_cache_dir, cls=Vocab) - - original_bort = nlp.model.BERTModel( - encoder, - len(bort_vocab), - units=predefined_args["units"], - embed_size=predefined_args["embed_size"], - embed_dropout=predefined_args["embed_dropout"], - word_embed=predefined_args["word_embed"], - use_pooler=False, - use_token_type_embed=False, - token_type_vocab_size=predefined_args["token_type_vocab_size"], - use_classifier=False, - use_decoder=False, - ) - - original_bort.load_parameters(bort_checkpoint_path, cast_dtype=True, ignore_extra=True) - params = original_bort._collect_params_with_prefix() - - # Build our config đŸ€— - hf_bort_config_json = { - "architectures": ["BertForMaskedLM"], - "attention_probs_dropout_prob": predefined_args["dropout"], - "hidden_act": "gelu", - "hidden_dropout_prob": predefined_args["dropout"], - "hidden_size": predefined_args["embed_size"], - "initializer_range": 0.02, - "intermediate_size": predefined_args["hidden_size"], - "layer_norm_eps": predefined_args["layer_norm_eps"], - "max_position_embeddings": predefined_args["max_length"], - "model_type": "bort", - "num_attention_heads": predefined_args["num_heads"], - "num_hidden_layers": predefined_args["num_layers"], - "pad_token_id": 1, # 2 = BERT, 1 = RoBERTa - "type_vocab_size": 1, # 2 = BERT, 1 = RoBERTa - "vocab_size": len(bort_vocab), - } - - hf_bort_config = BertConfig.from_dict(hf_bort_config_json) - hf_bort_model = BertForMaskedLM(hf_bort_config) - hf_bort_model.eval() - - # Parameter mapping table (Gluonnlp to Transformers) - # * denotes layer index - # - # | Gluon Parameter | Transformers Parameter - # | -------------------------------------------------------------- | ---------------------- - # | `encoder.layer_norm.beta` | `bert.embeddings.LayerNorm.bias` - # | `encoder.layer_norm.gamma` | `bert.embeddings.LayerNorm.weight` - # | 
`encoder.position_weight` | `bert.embeddings.position_embeddings.weight` - # | `word_embed.0.weight` | `bert.embeddings.word_embeddings.weight` - # | `encoder.transformer_cells.*.attention_cell.proj_key.bias` | `bert.encoder.layer.*.attention.self.key.bias` - # | `encoder.transformer_cells.*.attention_cell.proj_key.weight` | `bert.encoder.layer.*.attention.self.key.weight` - # | `encoder.transformer_cells.*.attention_cell.proj_query.bias` | `bert.encoder.layer.*.attention.self.query.bias` - # | `encoder.transformer_cells.*.attention_cell.proj_query.weight` | `bert.encoder.layer.*.attention.self.query.weight` - # | `encoder.transformer_cells.*.attention_cell.proj_value.bias` | `bert.encoder.layer.*.attention.self.value.bias` - # | `encoder.transformer_cells.*.attention_cell.proj_value.weight` | `bert.encoder.layer.*.attention.self.value.weight` - # | `encoder.transformer_cells.*.ffn.ffn_2.bias` | `bert.encoder.layer.*.attention.output.dense.bias` - # | `encoder.transformer_cells.*.ffn.ffn_2.weight` | `bert.encoder.layer.*.attention.output.dense.weight` - # | `encoder.transformer_cells.*.layer_norm.beta` | `bert.encoder.layer.*.attention.output.LayerNorm.bias` - # | `encoder.transformer_cells.*.layer_norm.gamma` | `bert.encoder.layer.*.attention.output.LayerNorm.weight` - # | `encoder.transformer_cells.*.ffn.ffn_1.bias` | `bert.encoder.layer.*.intermediate.dense.bias` - # | `encoder.transformer_cells.*.ffn.ffn_1.weight` | `bert.encoder.layer.*.intermediate.dense.weight` - # | `encoder.transformer_cells.*.ffn.layer_norm.beta` | `bert.encoder.layer.*.output.LayerNorm.bias` - # | `encoder.transformer_cells.*.ffn.layer_norm.gamma` | `bert.encoder.layer.*.output.LayerNorm.weight` - # | `encoder.transformer_cells.*.proj.bias` | `bert.encoder.layer.*.output.dense.bias` - # | `encoder.transformer_cells.*.proj.weight` | `bert.encoder.layer.*.output.dense.weight` - - # Helper function to convert MXNET Arrays to PyTorch - def to_torch(mx_array) -> nn.Parameter: - return nn.Parameter(torch.FloatTensor(mx_array.data().asnumpy())) - - # Check param shapes and map new HF param back - def check_and_map_params(hf_param, gluon_param): - shape_hf = hf_param.shape - - gluon_param = to_torch(params[gluon_param]) - shape_gluon = gluon_param.shape - - assert ( - shape_hf == shape_gluon - ), f"The gluon parameter {gluon_param} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers" - - return gluon_param - - hf_bort_model.bert.embeddings.word_embeddings.weight = check_and_map_params( - hf_bort_model.bert.embeddings.word_embeddings.weight, "word_embed.0.weight" - ) - hf_bort_model.bert.embeddings.position_embeddings.weight = check_and_map_params( - hf_bort_model.bert.embeddings.position_embeddings.weight, "encoder.position_weight" - ) - hf_bort_model.bert.embeddings.LayerNorm.bias = check_and_map_params( - hf_bort_model.bert.embeddings.LayerNorm.bias, "encoder.layer_norm.beta" - ) - hf_bort_model.bert.embeddings.LayerNorm.weight = check_and_map_params( - hf_bort_model.bert.embeddings.LayerNorm.weight, "encoder.layer_norm.gamma" - ) - - # Inspired by RoBERTa conversion script, we just zero them out (Bort does not use them) - hf_bort_model.bert.embeddings.token_type_embeddings.weight.data = torch.zeros_like( - hf_bort_model.bert.embeddings.token_type_embeddings.weight.data - ) - - for i in range(hf_bort_config.num_hidden_layers): - layer: BertLayer = hf_bort_model.bert.encoder.layer[i] - - # self attention - self_attn: BertSelfAttention = layer.attention.self - - self_attn.key.bias.data = 
check_and_map_params( - self_attn.key.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.bias" - ) - - self_attn.key.weight.data = check_and_map_params( - self_attn.key.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.weight" - ) - self_attn.query.bias.data = check_and_map_params( - self_attn.query.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.bias" - ) - self_attn.query.weight.data = check_and_map_params( - self_attn.query.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.weight" - ) - self_attn.value.bias.data = check_and_map_params( - self_attn.value.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.bias" - ) - self_attn.value.weight.data = check_and_map_params( - self_attn.value.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.weight" - ) - - # self attention output - self_output: BertSelfOutput = layer.attention.output - - self_output.dense.bias = check_and_map_params( - self_output.dense.bias, f"encoder.transformer_cells.{i}.proj.bias" - ) - self_output.dense.weight = check_and_map_params( - self_output.dense.weight, f"encoder.transformer_cells.{i}.proj.weight" - ) - self_output.LayerNorm.bias = check_and_map_params( - self_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.layer_norm.beta" - ) - self_output.LayerNorm.weight = check_and_map_params( - self_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.layer_norm.gamma" - ) - - # intermediate - intermediate: BertIntermediate = layer.intermediate - - intermediate.dense.bias = check_and_map_params( - intermediate.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_1.bias" - ) - intermediate.dense.weight = check_and_map_params( - intermediate.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_1.weight" - ) - - # output - bert_output: BertOutput = layer.output - - bert_output.dense.bias = check_and_map_params( - bert_output.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_2.bias" - ) - bert_output.dense.weight = check_and_map_params( - bert_output.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_2.weight" - ) - bert_output.LayerNorm.bias = check_and_map_params( - bert_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.ffn.layer_norm.beta" - ) - bert_output.LayerNorm.weight = check_and_map_params( - bert_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.ffn.layer_norm.gamma" - ) - - # Save space and energy 🎄 - hf_bort_model.half() - - # Compare output of both models - tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base") - - input_ids = tokenizer.encode_plus(SAMPLE_TEXT)["input_ids"] - - # Get gluon output - gluon_input_ids = mx.nd.array([input_ids]) - output_gluon = original_bort(inputs=gluon_input_ids, token_types=[]) - - # Get Transformer output (save and reload model again) - hf_bort_model.save_pretrained(pytorch_dump_folder_path) - hf_bort_model = BertModel.from_pretrained(pytorch_dump_folder_path) - hf_bort_model.eval() - - input_ids = tokenizer.encode_plus(SAMPLE_TEXT, return_tensors="pt") - output_hf = hf_bort_model(**input_ids)[0] - - gluon_layer = output_gluon[0].asnumpy() - hf_layer = output_hf[0].detach().numpy() - - max_absolute_diff = np.max(np.abs(hf_layer - gluon_layer)).item() - success = np.allclose(gluon_layer, hf_layer, atol=1e-3) - - if success: - print("✔ Both model do output the same tensors") - else: - print("❌ Both model do **NOT** output the same tensors") - print("Absolute difference is:", max_absolute_diff) - - -if __name__ == "__main__": 
- parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--bort_checkpoint_path", default=None, type=str, required=True, help="Path the official Bort params file." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_bort_checkpoint_to_pytorch(args.bort_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py b/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py deleted file mode 100644 index 60e93efe7c60..000000000000 --- a/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py +++ /dev/null @@ -1,319 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DETA checkpoints from the original repository. - -URL: https://github.com/jozhang97/DETA/tree/master""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_deta_config(): - config = DetaConfig( - num_queries=900, - encoder_ffn_dim=2048, - decoder_ffn_dim=2048, - num_feature_levels=5, - assign_first_stage=True, - with_box_refine=True, - two_stage=True, - ) - - # set labels - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - - # stem - # fmt: off - rename_keys.append(("backbone.0.body.conv1.weight", "model.backbone.model.embedder.embedder.convolution.weight")) - rename_keys.append(("backbone.0.body.bn1.weight", "model.backbone.model.embedder.embedder.normalization.weight")) - rename_keys.append(("backbone.0.body.bn1.bias", "model.backbone.model.embedder.embedder.normalization.bias")) - rename_keys.append(("backbone.0.body.bn1.running_mean", "model.backbone.model.embedder.embedder.normalization.running_mean")) - rename_keys.append(("backbone.0.body.bn1.running_var", "model.backbone.model.embedder.embedder.normalization.running_var")) - # stages - for stage_idx in range(len(config.backbone_config.depths)): - for layer_idx in range(config.backbone_config.depths[stage_idx]): - # shortcut - if layer_idx == 0: - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.0.weight", - 
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.weight", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.bias", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_mean", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_var", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_var", - ) - ) - # 3 convs - for i in range(3): - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.conv{i+1}.weight", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.weight", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.bias", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_mean", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_var", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_var", - ) - ) - # transformer encoder - for i in range(config.encoder_layers): - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", 
f"model.encoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias")) - - # transformer decoder - for i in range(config.decoder_layers): - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias")) - - # fmt: on - - return rename_keys - - -def 
rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def read_in_decoder_q_k_v(state_dict, config): - # transformer decoder self-attention layers - hidden_size = config.d_model - for i in range(config.decoder_layers): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): - """ - Copy/paste/tweak model's weights to our DETA structure. - """ - - # load config - config = get_deta_config() - - # load original state dict - if model_name == "deta-resnet-50": - filename = "adet_checkpoint0011.pth" - elif model_name == "deta-resnet-50-24-epochs": - filename = "adet_2x_checkpoint0023.pth" - else: - raise ValueError(f"Model name {model_name} not supported") - checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename=filename) - state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] - - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_decoder_q_k_v(state_dict, config) - - # fix some prefixes - for key in state_dict.copy().keys(): - if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key: - val = state_dict.pop(key) - state_dict[key.replace("transformer.decoder", "model.decoder")] = val - if "input_proj" in key: - val = state_dict.pop(key) - state_dict["model." 
+ key] = val - if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key: - val = state_dict.pop(key) - state_dict[key.replace("transformer", "model")] = val - - # finally, create HuggingFace model and load state dict - model = DetaForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - - # load image processor - processor = DetaImageProcessor(format="coco_detection") - - # verify our conversion on image - img = prepare_img() - encoding = processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - outputs = model(pixel_values.to(device)) - - # verify logits - if model_name == "deta-resnet-50": - expected_logits = torch.tensor( - [[-7.3978, -2.5406, -4.1668], [-8.2684, -3.9933, -3.8096], [-7.0515, -3.7973, -5.8516]] - ) - expected_boxes = torch.tensor([[0.5043, 0.4973, 0.9998], [0.2542, 0.5489, 0.4748], [0.5490, 0.2765, 0.0570]]) - elif model_name == "deta-resnet-50-24-epochs": - expected_logits = torch.tensor( - [[-7.1688, -2.4857, -4.8669], [-7.8630, -3.8154, -4.2674], [-7.2730, -4.1865, -5.5323]] - ) - expected_boxes = torch.tensor([[0.5021, 0.4971, 0.9994], [0.2546, 0.5486, 0.4731], [0.1686, 0.1986, 0.2142]]) - - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) - print("Everything ok!") - - if pytorch_dump_folder_path: - # Save model and processor - logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - # Push to hub - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(f"jozhang97/{model_name}") - processor.push_to_hub(f"jozhang97/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", - type=str, - default="deta-resnet-50", - choices=["deta-resnet-50", "deta-resnet-50-24-epochs"], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the folder to output PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - args = parser.parse_args() - convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py b/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py deleted file mode 100644 index 392750fa67a1..000000000000 --- a/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py +++ /dev/null @@ -1,326 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DETA checkpoints from the original repository. - -URL: https://github.com/jozhang97/DETA/tree/master""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor, SwinConfig -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_deta_config(model_name): - backbone_config = SwinConfig( - embed_dim=192, - depths=(2, 2, 18, 2), - num_heads=(6, 12, 24, 48), - window_size=12, - out_features=["stage2", "stage3", "stage4"], - ) - - config = DetaConfig( - backbone_config=backbone_config, - num_queries=900, - encoder_ffn_dim=2048, - decoder_ffn_dim=2048, - num_feature_levels=5, - assign_first_stage=True, - with_box_refine=True, - two_stage=True, - ) - - # set labels - repo_id = "huggingface/label-files" - if "o365" in model_name: - num_labels = 366 - filename = "object365-id2label.json" - else: - num_labels = 91 - filename = "coco-detection-id2label.json" - - config.num_labels = num_labels - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - - # stem - # fmt: off - rename_keys.append(("backbone.0.body.patch_embed.proj.weight", "model.backbone.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("backbone.0.body.patch_embed.proj.bias", "model.backbone.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("backbone.0.body.patch_embed.norm.weight", "model.backbone.model.embeddings.norm.weight")) - rename_keys.append(("backbone.0.body.patch_embed.norm.bias", "model.backbone.model.embeddings.norm.bias")) - # stages - for i in range(len(config.backbone_config.depths)): - for j in range(config.backbone_config.depths[i]): - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.bias")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_bias_table", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_bias_table")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_index", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_index")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.bias")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.weight")) - 
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.bias")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.bias")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.bias")) - - if i < 3: - rename_keys.append((f"backbone.0.body.layers.{i}.downsample.reduction.weight", f"model.backbone.model.encoder.layers.{i}.downsample.reduction.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.weight", f"model.backbone.model.encoder.layers.{i}.downsample.norm.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.bias", f"model.backbone.model.encoder.layers.{i}.downsample.norm.bias")) - - rename_keys.append(("backbone.0.body.norm1.weight", "model.backbone.model.hidden_states_norms.stage2.weight")) - rename_keys.append(("backbone.0.body.norm1.bias", "model.backbone.model.hidden_states_norms.stage2.bias")) - rename_keys.append(("backbone.0.body.norm2.weight", "model.backbone.model.hidden_states_norms.stage3.weight")) - rename_keys.append(("backbone.0.body.norm2.bias", "model.backbone.model.hidden_states_norms.stage3.bias")) - rename_keys.append(("backbone.0.body.norm3.weight", "model.backbone.model.hidden_states_norms.stage4.weight")) - rename_keys.append(("backbone.0.body.norm3.bias", "model.backbone.model.hidden_states_norms.stage4.bias")) - - # transformer encoder - for i in range(config.encoder_layers): - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"model.encoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight")) - 
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias")) - - # transformer decoder - for i in range(config.decoder_layers): - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias")) - - # fmt: on - - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def 
read_in_swin_q_k_v(state_dict, backbone_config): - num_features = [int(backbone_config.embed_dim * 2**i) for i in range(len(backbone_config.depths))] - for i in range(len(backbone_config.depths)): - dim = num_features[i] - for j in range(backbone_config.depths[i]): - # fmt: off - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.bias"] = in_proj_bias[: dim] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[ - dim : dim * 2, : - ] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.bias"] = in_proj_bias[ - dim : dim * 2 - ] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[ - -dim :, : - ] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.bias"] = in_proj_bias[-dim :] - # fmt: on - - -def read_in_decoder_q_k_v(state_dict, config): - # transformer decoder self-attention layers - hidden_size = config.d_model - for i in range(config.decoder_layers): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): - """ - Copy/paste/tweak model's weights to our DETA structure. 
- """ - - # load config - config = get_deta_config(model_name) - - # load original state dict - if model_name == "deta-swin-large": - checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename="adet_swin_ft.pth") - elif model_name == "deta-swin-large-o365": - checkpoint_path = hf_hub_download(repo_id="jozhang97/deta-swin-l-o365", filename="deta_swin_pt_o365.pth") - else: - raise ValueError(f"Model name {model_name} not supported") - - state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] - - # original state dict - for name, param in state_dict.items(): - print(name, param.shape) - - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_swin_q_k_v(state_dict, config.backbone_config) - read_in_decoder_q_k_v(state_dict, config) - - # fix some prefixes - for key in state_dict.copy().keys(): - if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key: - val = state_dict.pop(key) - state_dict[key.replace("transformer.decoder", "model.decoder")] = val - if "input_proj" in key: - val = state_dict.pop(key) - state_dict["model." + key] = val - if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key: - val = state_dict.pop(key) - state_dict[key.replace("transformer", "model")] = val - - # finally, create HuggingFace model and load state dict - model = DetaForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - - # load image processor - processor = DetaImageProcessor(format="coco_detection") - - # verify our conversion on image - img = prepare_img() - encoding = processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - outputs = model(pixel_values.to(device)) - - # verify logits - print("Logits:", outputs.logits[0, :3, :3]) - print("Boxes:", outputs.pred_boxes[0, :3, :3]) - if model_name == "deta-swin-large": - expected_logits = torch.tensor( - [[-7.6308, -2.8485, -5.3737], [-7.2037, -4.5505, -4.8027], [-7.2943, -4.2611, -4.6617]] - ) - expected_boxes = torch.tensor([[0.4987, 0.4969, 0.9999], [0.2549, 0.5498, 0.4805], [0.5498, 0.2757, 0.0569]]) - elif model_name == "deta-swin-large-o365": - expected_logits = torch.tensor( - [[-8.0122, -3.5720, -4.9717], [-8.1547, -3.6886, -4.6389], [-7.6610, -3.6194, -5.0134]] - ) - expected_boxes = torch.tensor([[0.2523, 0.5549, 0.4881], [0.7715, 0.4149, 0.4601], [0.5503, 0.2753, 0.0575]]) - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) - print("Everything ok!") - - if pytorch_dump_folder_path: - # Save model and processor - logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - # Push to hub - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(f"jozhang97/{model_name}") - processor.push_to_hub(f"jozhang97/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", - type=str, - default="deta-swin-large", - choices=["deta-swin-large", "deta-swin-large-o365"], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - 
"--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the folder to output PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - args = parser.parse_args() - convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 7431cd6136a5..000000000000 --- a/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,252 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert EfficientFormer checkpoints from the original repository. - -URL: https://github.com/snap-research/EfficientFormer -""" - -import argparse -import re -from pathlib import Path - -import requests -import torch -from PIL import Image -from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor - -from transformers import ( - EfficientFormerConfig, - EfficientFormerForImageClassificationWithTeacher, - EfficientFormerImageProcessor, -) -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling - - -def rename_key(old_name, num_meta4D_last_stage): - new_name = old_name - - if "patch_embed" in old_name: - _, layer, param = old_name.split(".") - - if layer == "0": - new_name = old_name.replace("0", "convolution1") - elif layer == "1": - new_name = old_name.replace("1", "batchnorm_before") - elif layer == "3": - new_name = old_name.replace("3", "convolution2") - else: - new_name = old_name.replace("4", "batchnorm_after") - - if "network" in old_name and re.search(r"\d\.\d", old_name): - two_digit_num = r"\b\d{2}\b" - if bool(re.search(two_digit_num, old_name)): - match = re.search(r"\d\.\d\d.", old_name).group() - else: - match = re.search(r"\d\.\d.", old_name).group() - if int(match[0]) < 6: - trimmed_name = old_name.replace(match, "") - trimmed_name = trimmed_name.replace("network", match[0] + ".meta4D_layers.blocks." + match[2:-1]) - new_name = "intermediate_stages." + trimmed_name - else: - trimmed_name = old_name.replace(match, "") - if int(match[2]) < num_meta4D_last_stage: - trimmed_name = trimmed_name.replace("network", "meta4D_layers.blocks." + match[2]) - else: - layer_index = str(int(match[2]) - num_meta4D_last_stage) - trimmed_name = trimmed_name.replace("network", "meta3D_layers.blocks." 
+ layer_index) - if "norm1" in old_name: - trimmed_name = trimmed_name.replace("norm1", "layernorm1") - elif "norm2" in old_name: - trimmed_name = trimmed_name.replace("norm2", "layernorm2") - elif "fc1" in old_name: - trimmed_name = trimmed_name.replace("fc1", "linear_in") - elif "fc2" in old_name: - trimmed_name = trimmed_name.replace("fc2", "linear_out") - - new_name = "last_stage." + trimmed_name - - elif "network" in old_name and re.search(r".\d.", old_name): - new_name = old_name.replace("network", "intermediate_stages") - - if "fc" in new_name: - new_name = new_name.replace("fc", "convolution") - elif ("norm1" in new_name) and ("layernorm1" not in new_name): - new_name = new_name.replace("norm1", "batchnorm_before") - elif ("norm2" in new_name) and ("layernorm2" not in new_name): - new_name = new_name.replace("norm2", "batchnorm_after") - if "proj" in new_name: - new_name = new_name.replace("proj", "projection") - if "dist_head" in new_name: - new_name = new_name.replace("dist_head", "distillation_classifier") - elif "head" in new_name: - new_name = new_name.replace("head", "classifier") - elif "patch_embed" in new_name: - new_name = "efficientformer." + new_name - elif new_name == "norm.weight" or new_name == "norm.bias": - new_name = new_name.replace("norm", "layernorm") - new_name = "efficientformer." + new_name - else: - new_name = "efficientformer.encoder." + new_name - - return new_name - - -def convert_torch_checkpoint(checkpoint, num_meta4D_last_stage): - for key in checkpoint.copy().keys(): - val = checkpoint.pop(key) - checkpoint[rename_key(key, num_meta4D_last_stage)] = val - - return checkpoint - - -# We will verify our results on a COCO image -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - return image - - -def convert_efficientformer_checkpoint( - checkpoint_path: Path, efficientformer_config_file: Path, pytorch_dump_path: Path, push_to_hub: bool -): - orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] - config = EfficientFormerConfig.from_json_file(efficientformer_config_file) - model = EfficientFormerForImageClassificationWithTeacher(config) - model_name = "_".join(checkpoint_path.split("/")[-1].split(".")[0].split("_")[:-1]) - - num_meta4D_last_stage = config.depths[-1] - config.num_meta3d_blocks + 1 - new_state_dict = convert_torch_checkpoint(orig_state_dict, num_meta4D_last_stage) - - model.load_state_dict(new_state_dict) - model.eval() - - pillow_resamplings = { - "bilinear": PILImageResampling.BILINEAR, - "bicubic": PILImageResampling.BICUBIC, - "nearest": PILImageResampling.NEAREST, - } - - # prepare image - image = prepare_img() - image_size = 256 - crop_size = 224 - processor = EfficientFormerImageProcessor( - size={"shortest_edge": image_size}, - crop_size={"height": crop_size, "width": crop_size}, - resample=pillow_resamplings["bicubic"], - ) - pixel_values = processor(images=image, return_tensors="pt").pixel_values - - # original processing pipeline - image_transforms = Compose( - [ - Resize(image_size, interpolation=pillow_resamplings["bicubic"]), - CenterCrop(crop_size), - ToTensor(), - Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ] - ) - original_pixel_values = image_transforms(image).unsqueeze(0) - - assert torch.allclose(original_pixel_values, pixel_values) - - outputs = model(pixel_values) - logits = outputs.logits - - expected_shape = (1, 1000) - - if "l1" in model_name: - expected_logits = torch.Tensor( - [-0.1312, 
0.4353, -1.0499, -0.5124, 0.4183, -0.6793, -1.3777, -0.0893, -0.7358, -2.4328] - ) - assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3) - assert logits.shape == expected_shape - elif "l3" in model_name: - expected_logits = torch.Tensor( - [-1.3150, -1.5456, -1.2556, -0.8496, -0.7127, -0.7897, -0.9728, -0.3052, 0.3751, -0.3127] - ) - assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3) - assert logits.shape == expected_shape - elif "l7" in model_name: - expected_logits = torch.Tensor( - [-1.0283, -1.4131, -0.5644, -1.3115, -0.5785, -1.2049, -0.7528, 0.1992, -0.3822, -0.0878] - ) - assert logits.shape == expected_shape - else: - raise ValueError( - f"Unknown model checkpoint: {checkpoint_path}. Supported version of efficientformer are l1, l3 and l7" - ) - - # Save Checkpoints - Path(pytorch_dump_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_path) - print(f"Checkpoint successfuly converted. Model saved at {pytorch_dump_path}") - processor.save_pretrained(pytorch_dump_path) - print(f"Processor successfuly saved at {pytorch_dump_path}") - - if push_to_hub: - print("Pushing model to the hub...") - - model.push_to_hub( - repo_id=f"Bearnardd/{pytorch_dump_path}", - commit_message="Add model", - use_temp_dir=True, - ) - processor.push_to_hub( - repo_id=f"Bearnardd/{pytorch_dump_path}", - commit_message="Add image processor", - use_temp_dir=True, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--pytorch_model_path", - default=None, - type=str, - required=True, - help="Path to EfficientFormer pytorch checkpoint.", - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help="The json file for EfficientFormer model config.", - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub") - parser.add_argument( - "--no-push_to_hub", - dest="push_to_hub", - action="store_false", - help="Do not push model and image processor to the hub", - ) - parser.set_defaults(push_to_hub=True) - - args = parser.parse_args() - convert_efficientformer_checkpoint( - checkpoint_path=args.pytorch_model_path, - efficientformer_config_file=args.config_file, - pytorch_dump_path=args.pytorch_dump_path, - push_to_hub=args.push_to_hub, - ) diff --git a/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py deleted file mode 100644 index a84d000d4439..000000000000 --- a/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,181 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
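The scripts above typically close with the same sanity check: run the converted model on a fixed input and compare a slice of its logits against reference values taken from the original implementation, using torch.allclose with a small tolerance. A generic sketch of that check, with a hypothetical model object and made-up reference numbers:

import torch

@torch.no_grad()
def verify_conversion(model, pixel_values, expected_slice, atol=1e-3):
    # expected_slice holds reference logits copied from the original implementation.
    logits = model(pixel_values).logits
    actual = logits[0, : expected_slice.numel()]
    if not torch.allclose(actual, expected_slice, atol=atol):
        raise ValueError(f"Converted logits differ from the reference by more than {atol}: {actual} vs {expected_slice}")
    print("Converted logits match the reference values.")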
- -"""Convert GPTSANJapanese checkpoints from the original repository to pytorch model.""" - -import argparse -import json -import os -from collections import OrderedDict - -import numpy as np -import tensorflow as tf -import torch - - -def convert_tf_gptsan_to_pt(args): - parameter_file = os.path.join(args.tf_model_dir, "parameters.json") - params = json.loads(open(parameter_file).read()) - if not params: - raise ValueError( - f"It seems that the json file at {parameter_file} is empty. Make sure you have a correct json file." - ) - if not args.output.endswith(".pt"): - args.output = args.output + ".pt" - new_state = OrderedDict() - with tf.device("/CPU:0"): - reader = tf.train.load_checkpoint(args.tf_model_dir) - shapes = reader.get_variable_to_shape_map() - for key_name in shapes.keys(): - vnp = reader.get_tensor(key_name).astype(np.float16) - if key_name.endswith("/adam_m") or key_name.endswith("/adam_v"): - continue - if key_name.startswith("pasts/"): - if key_name.startswith("pasts/mlp"): - player = int(key_name[9]) - elif key_name.startswith("pasts/out"): - player = 8 - name = "model.sqout.%d.weight" % (player * 2) # enter to nn.Sequencial with Tanh, so 2 at a time - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/moe"): - player = int(key_name[9:].split("/")[0]) - if key_name.endswith("/switch_gating/kernel"): - name = "model.blocks.%d.feed_forward.mlp.router.classifier.weight" % player - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.endswith("/softmlp/kernel"): - name = "model.blocks.%d.feed_forward.soft_bypass_mlp.weight" % player - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.endswith("/wo/kernel") or key_name.endswith("/wi/kernel"): - nlayer = key_name[-9:-7] - for i in range(16): - name = "model.blocks.%d.feed_forward.mlp.experts.expert_%d.%s.weight" % (player, i, nlayer) - state = ( - vnp[i].transpose([1, 0]).copy() - ) # In Mesh-Tensorflow, it is one array, so it is divided - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/mlp"): - player = int(key_name[9:].split("/")[0]) - if key_name.endswith("/p1/kernel"): - name = "model.blocks.%d.feed_forward.mlp.wi.weight" % player - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.endswith("/p1/bias"): - name = "model.blocks.%d.feed_forward.mlp.wi.bias" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.endswith("/p2/kernel"): - name = "model.blocks.%d.feed_forward.mlp.wo.weight" % player - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.endswith("/p2/bias"): - name = "model.blocks.%d.feed_forward.mlp.wo.bias" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/ln"): - player = int(key_name[8:].split("/")[0]) - if key_name.endswith("/b"): - name = "model.blocks.%d.feed_forward.norm.bias" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.endswith("/g"): - name = "model.blocks.%d.feed_forward.norm.weight" % player - state = vnp.copy() # same because 
it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/att"): - player = int(key_name[9:].split("/")[0]) - if key_name.endswith("/qkv/kernel"): - state = vnp.copy() # Compute same dimension as Mesh-tensorflow using einsum - state_q = state[:, 0, :, :] - state_k = state[:, 1, :, :] - state_v = state[:, 2, :, :] - state_q = ( - state_q.reshape([state_q.shape[0], state_q.shape[1] * state_q.shape[2]]) - .transpose([1, 0]) - .copy() - ) # Mesh-Tensorflow is a diagonal matrix - state_k = ( - state_k.reshape([state_k.shape[0], state_k.shape[1] * state_k.shape[2]]) - .transpose([1, 0]) - .copy() - ) # Mesh-Tensorflow is a diagonal matrix - state_v = ( - state_v.reshape([state_v.shape[0], state_v.shape[1] * state_v.shape[2]]) - .transpose([1, 0]) - .copy() - ) # Mesh-Tensorflow is a diagonal matrix - name = "model.blocks.%d.self_attn.self_attn.q_proj.weight" % player - new_state[name] = torch.tensor(state_q) - name = "model.blocks.%d.self_attn.self_attn.k_proj.weight" % player - new_state[name] = torch.tensor(state_k) - name = "model.blocks.%d.self_attn.self_attn.v_proj.weight" % player - new_state[name] = torch.tensor(state_v) - elif key_name.endswith("/o/kernel"): - name = "model.blocks.%d.self_attn.self_attn.out_proj.weight" % player - state = ( - vnp.reshape([vnp.shape[0] * vnp.shape[1], vnp.shape[2]]).transpose([1, 0]).copy() - ) # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/an"): - player = int(key_name[8:].split("/")[0]) - if key_name.endswith("/b"): - name = "model.blocks.%d.self_attn.norm.bias" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.endswith("/g"): - name = "model.blocks.%d.self_attn.norm.weight" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif ( - key_name.startswith("model/wte") - or key_name.startswith("model/wpe") - or key_name.startswith("model/ete") - ): - nlayer = {"wte": "embed_tokens", "wpe": "position_embeddings", "ete": "extra_position_embeddings"}[ - key_name[-3:] - ] - name = "model.%s.weight" % nlayer - state = vnp.copy() # same in embedded - new_state[name] = torch.tensor(state) - if key_name.startswith("model/wte"): - name = "lm_head.weight" - state = vnp.copy() # same in embedded - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/wob"): - name = "final_logits_bias" - state = vnp.copy() # same in embedded - state = state.reshape((1, -1)) - new_state[name] = torch.tensor(state) - elif key_name == "model/dense/kernel": - name = "model.last_project.weight" - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name == "model/dense_1/bias": - name = "model.last_project.bias" - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - torch.save(new_state, args.output) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="model converter.", formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--tf_model_dir", metavar="PATH", type=str, required=True, help="import model") - parser.add_argument("--output", metavar="PATH", type=str, required=True, help="output model") - args = parser.parse_args() - convert_tf_gptsan_to_pt(args) diff --git a/src/transformers/models/deprecated/jukebox/convert_jukebox.py 
b/src/transformers/models/deprecated/jukebox/convert_jukebox.py deleted file mode 100644 index b56a25c57c70..000000000000 --- a/src/transformers/models/deprecated/jukebox/convert_jukebox.py +++ /dev/null @@ -1,279 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Jukebox checkpoints""" - -import argparse -import json -import os -from pathlib import Path - -import requests -import torch - -from transformers import JukeboxConfig, JukeboxModel -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -PREFIX = "https://openaipublic.azureedge.net/jukebox/models/" -MODEL_MAPPING = { - "jukebox-1b-lyrics": [ - "5b/vqvae.pth.tar", - "5b/prior_level_0.pth.tar", - "5b/prior_level_1.pth.tar", - "1b_lyrics/prior_level_2.pth.tar", - ], - "jukebox-5b-lyrics": [ - "5b/vqvae.pth.tar", - "5b/prior_level_0.pth.tar", - "5b/prior_level_1.pth.tar", - "5b_lyrics/prior_level_2.pth.tar", - ], -} - - -def replace_key(key): - if key.endswith(".model.1.bias") and len(key.split(".")) > 10: - key = key.replace(".model.1.bias", ".conv1d_1.bias") - elif key.endswith(".model.1.weight") and len(key.split(".")) > 10: - key = key.replace(".model.1.weight", ".conv1d_1.weight") - elif key.endswith(".model.3.bias") and len(key.split(".")) > 10: - key = key.replace(".model.3.bias", ".conv1d_2.bias") - elif key.endswith(".model.3.weight") and len(key.split(".")) > 10: - key = key.replace(".model.3.weight", ".conv1d_2.weight") - - if "conditioner_blocks.0." in key: - key = key.replace("conditioner_blocks.0", "conditioner_blocks") - - if "prime_prior" in key: - key = key.replace("prime_prior", "encoder") - - if ".emb." in key and "total" not in key and "absolute" not in key and "relative" not in key: - key = key.replace(".emb.", ".") - - if key.endswith("k"): # replace vqvae.X.k with vqvae.X.codebook - return key.replace(".k", ".codebook") - if "y_emb." in key: - return key.replace("y_emb.", "metadata_embedding.") - - if "x_emb.emb." 
in key: - key = key.replace("0.x_emb.emb", "embed_tokens") - - if "prime_state_ln" in key: - return key.replace("prime_state_ln", "encoder.final_layer_norm") - if ".ln" in key: - return key.replace(".ln", ".layer_norm") - if "_ln" in key: - return key.replace("_ln", "_layer_norm") - - if "prime_state_proj" in key: - return key.replace("prime_state_proj", "encoder.proj_in") - if "prime_x_out" in key: - return key.replace("prime_x_out", "encoder.lm_head") - if "prior.x_out" in key: - return key.replace("x_out", "fc_proj_out") - if "x_emb" in key: - return key.replace("x_emb", "embed_tokens") - - return key - - -def fix_jukebox_keys(state_dict, model_state_dict, key_prefix, mapping): - new_dict = {} - import re - - re_encoder_block_conv_in = re.compile(r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)") - re_encoder_block_resnet = re.compile( - r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)" - ) - re_encoder_block_proj_out = re.compile(r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(bias|weight)") - - re_decoder_block_conv_out = re.compile(r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)") - re_decoder_block_resnet = re.compile( - r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)" - ) - re_decoder_block_proj_in = re.compile(r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(bias|weight)") - - re_prior_cond_conv_out = re.compile(r"conditioner_blocks.(\d*).cond.model.(\d*).(\d).(bias|weight)") - re_prior_cond_resnet = re.compile( - r"conditioner_blocks.(\d*).cond.model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)" - ) - re_prior_cond_proj_in = re.compile(r"conditioner_blocks.(\d*).cond.model.(\d*).(bias|weight)") - - for original_key, value in state_dict.items(): - # rename vqvae.encoder keys - if re_encoder_block_conv_in.fullmatch(original_key): - regex_match = re_encoder_block_conv_in.match(original_key) - groups = regex_match.groups() - block_index = int(groups[2]) * 2 + int(groups[3]) - re_new_key = f"encoders.{groups[0]}.level_blocks.{groups[1]}.downsample_block.{block_index}.{groups[-1]}" - key = re_encoder_block_conv_in.sub(re_new_key, original_key) - - elif re_encoder_block_resnet.fullmatch(original_key): - regex_match = re_encoder_block_resnet.match(original_key) - groups = regex_match.groups() - block_index = int(groups[2]) * 2 + int(groups[3]) - conv_index = {"1": 1, "3": 2}[groups[-2]] - prefix = f"encoders.{groups[0]}.level_blocks.{groups[1]}.downsample_block.{block_index}." 
- resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}" - re_new_key = prefix + resnet_block - key = re_encoder_block_resnet.sub(re_new_key, original_key) - - elif re_encoder_block_proj_out.fullmatch(original_key): - regex_match = re_encoder_block_proj_out.match(original_key) - groups = regex_match.groups() - re_new_key = f"encoders.{groups[0]}.level_blocks.{groups[1]}.proj_out.{groups[-1]}" - key = re_encoder_block_proj_out.sub(re_new_key, original_key) - - # rename vqvae.decoder keys - elif re_decoder_block_conv_out.fullmatch(original_key): - regex_match = re_decoder_block_conv_out.match(original_key) - groups = regex_match.groups() - block_index = int(groups[2]) * 2 + int(groups[3]) - 2 - re_new_key = f"decoders.{groups[0]}.level_blocks.{groups[1]}.upsample_block.{block_index}.{groups[-1]}" - key = re_decoder_block_conv_out.sub(re_new_key, original_key) - - elif re_decoder_block_resnet.fullmatch(original_key): - regex_match = re_decoder_block_resnet.match(original_key) - groups = regex_match.groups() - block_index = int(groups[2]) * 2 + int(groups[3]) - 2 - conv_index = {"1": 1, "3": 2}[groups[-2]] - prefix = f"decoders.{groups[0]}.level_blocks.{groups[1]}.upsample_block.{block_index}." - resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}" - re_new_key = prefix + resnet_block - key = re_decoder_block_resnet.sub(re_new_key, original_key) - - elif re_decoder_block_proj_in.fullmatch(original_key): - regex_match = re_decoder_block_proj_in.match(original_key) - groups = regex_match.groups() - re_new_key = f"decoders.{groups[0]}.level_blocks.{groups[1]}.proj_in.{groups[-1]}" - key = re_decoder_block_proj_in.sub(re_new_key, original_key) - - # rename prior cond.model to upsampler.upsample_block and resnet - elif re_prior_cond_conv_out.fullmatch(original_key): - regex_match = re_prior_cond_conv_out.match(original_key) - groups = regex_match.groups() - block_index = int(groups[1]) * 2 + int(groups[2]) - 2 - re_new_key = f"conditioner_blocks.upsampler.upsample_block.{block_index}.{groups[-1]}" - key = re_prior_cond_conv_out.sub(re_new_key, original_key) - - elif re_prior_cond_resnet.fullmatch(original_key): - regex_match = re_prior_cond_resnet.match(original_key) - groups = regex_match.groups() - block_index = int(groups[1]) * 2 + int(groups[2]) - 2 - conv_index = {"1": 1, "3": 2}[groups[-2]] - prefix = f"conditioner_blocks.upsampler.upsample_block.{block_index}." 
- resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}" - re_new_key = prefix + resnet_block - key = re_prior_cond_resnet.sub(re_new_key, original_key) - - elif re_prior_cond_proj_in.fullmatch(original_key): - regex_match = re_prior_cond_proj_in.match(original_key) - groups = regex_match.groups() - re_new_key = f"conditioner_blocks.upsampler.proj_in.{groups[-1]}" - key = re_prior_cond_proj_in.sub(re_new_key, original_key) - - # keep original key - else: - key = original_key - - key = replace_key(key) - - if f"{key_prefix}.{key}" not in model_state_dict or key is None: - print(f"failed converting {original_key} to {key}, does not match") - - # handle missmatched shape - elif value.shape != model_state_dict[f"{key_prefix}.{key}"].shape: - val = model_state_dict[f"{key_prefix}.{key}"] - print(f"{original_key}-> {key} : \nshape {val.shape} and { value.shape}, do not match") - key = original_key - - mapping[key] = original_key - new_dict[key] = value - - return new_dict - - -@torch.no_grad() -def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None): - """ - Copy/paste/tweak model's weights to our Jukebox structure. - """ - for file in MODEL_MAPPING[model_name]: - if not os.path.isfile(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}"): - r = requests.get(f"{PREFIX}{file}", allow_redirects=True) - os.makedirs(f"{pytorch_dump_folder_path}/", exist_ok=True) - open(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}", "wb").write(r.content) - - model_to_convert = MODEL_MAPPING[model_name.split("/")[-1]] - - config = JukeboxConfig.from_pretrained(model_name) - model = JukeboxModel(config) - - weight_dict = [] - mapping = {} - for i, dict_name in enumerate(model_to_convert): - old_dic = torch.load(f"{pytorch_dump_folder_path}/{dict_name.split('/')[-1]}")["model"] - - new_dic = {} - for k in old_dic.keys(): - if k.endswith(".b"): - new_dic[k.replace("b", "bias")] = old_dic[k] - elif k.endswith(".w"): - new_dic[k.replace("w", "weight")] = old_dic[k] - elif "level_2" not in dict_name and "cond.model." 
in k: - new_dic[k.replace(".blocks.", ".model.")] = old_dic[k] - else: - new_dic[k] = old_dic[k] - - key_prefix = "vqvae" if i == 0 else f"priors.{3 - i}" - new_dic = fix_jukebox_keys(new_dic, model.state_dict(), key_prefix, mapping) - weight_dict.append(new_dic) - - vqvae_state_dict = weight_dict.pop(0) - model.vqvae.load_state_dict(vqvae_state_dict) - for i in range(len(weight_dict)): - model.priors[i].load_state_dict(weight_dict[2 - i]) - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - with open(f"{pytorch_dump_folder_path}/mapping.json", "w") as txtfile: - json.dump(mapping, txtfile) - - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - - return weight_dict - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="jukebox-5b-lyrics", - type=str, - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="jukebox-5b-lyrics-converted", - type=str, - help="Path to the output PyTorch model directory.", - ) - args = parser.parse_args() - convert_openai_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 1f791dab2404..000000000000 --- a/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,292 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Convert Mega pretrained checkpoint. Built to convert the Masked LM checkpoint located at -https://huggingface.co/mnaylor/mega-wikitext-103 - -Requirements: - - clone the Mega repo and install fairseq from there - 1. git clone https://github.com/facebookresearch/mega.git - 2. cd mega && pip install -e - - clone the pretrained weights for the original implementation from the hugging face repo - * use this location as the path for pretrained weights -""" - -import argparse - -# utilities to import the model weights and config file -import os -import pickle as pkl - -# PyTorch + new model classes -import torch -from torch import nn - -from transformers import AutoTokenizer, MegaConfig, MegaForMaskedLM - - -# import the EncoderLayer class used to pretrain -# !! NOTE !! 
this requires the version of fairseq that is built when you install the Mega source -try: - from fairseq.modules.mega_layer import MegaEncoderLayer -except ImportError: - raise ImportError("You need to install the version of fairseq from the Mega repo!") - - -# define the wrapper classes used to train the MLM (see colab notebook below) -# https://colab.research.google.com/drive/1qfUO6o5HRdxBblWlw058HVyvaEPhPpH8?usp=sharing -# MegaLM outputs hidden states -class MegaLM(nn.Module): - "The base class for our Mega encoder - given input IDs, embed text and return encoder output" - - def __init__(self, mega_args, depth, vocab_size): - super().__init__() - self.mega_args = mega_args - self.embedding_layer = nn.Embedding(vocab_size, self.mega_args.encoder_embed_dim) - self.encoders = nn.ModuleList([MegaEncoderLayer(self.mega_args) for _ in range(depth)]) - self.depth = depth - - def forward(self, input_ids, attention_mask, batch_first=True, ignore_mask_value=0): - """ - Code for a forward pass - expects input_ids and attention_mask to come from a Hugging Face tokenizer as PyTorch - tensors, and returns a tensor of size (batch, n_classes) containing classification logits - - Other options: - - batch_first: boolean indicating whether the batch dimension is first in input_ids (default: True, which - aligns with the HF tokenizer behavior) - - ignore_mask_value: the value in attention_mask that identifies tokens that should be ignored (default: 0, - which aligns with HF tokenizer) - """ - - # Mega expects embeddings to be (time, batch, embedding size), but - # Hugging Face returns tokens as (batch, time) - if batch_first: - input_ids = input_ids.T - - # to make things more confusing, Mega expects the attention mask to - # be (batch, time), but with values of 0 (normal token) and 1 (ignore token) - # which is the opposite of what HF returns - if ignore_mask_value == 0: - attention_mask = 1 - attention_mask - - # get token embeddings from IDs - embeds = self.embedding_layer(input_ids) - - # pass through the Mega layers - # input is (time, batch, encoder dim) and output is the same - for encoder in self.encoders: - embeds = encoder(embeds, attention_mask) - - # return according to the shape specified - if batch_first: - # (T, B, H) --> (B, T, H) - return torch.transpose(embeds, 0, 1) - else: - return embeds - - -# renamed from MegaForMaskedLM to avoid confusion with new module -class OriginalMegaForMaskedLM(nn.Module): - "A wrapper class for doing masked language modeling with Mega" - - def __init__(self, mega_args, depth, vocab_size): - super().__init__() - self.mega = MegaLM(mega_args, depth, vocab_size) - self.mlm_head = nn.Linear(mega_args.encoder_embed_dim, vocab_size) - self.dropout = nn.Dropout(p=0.1) - - def forward(self, input_ids, attention_mask, batch_first=True, ignore_mask_value=0): - """ - Perform a forward pass through the Mega encoder and the masked LM head. Returns logits for each vocabulary - entry. 
- - If `batch_first` (default to align with Hugging Face tokenizer behavior), output will have the shape (Batch - size, Sequence length, Vocab size); otherwise (S, B, V) - """ - encoder_output = self.mega(input_ids, attention_mask, batch_first, ignore_mask_value) - return self.mlm_head(self.dropout(encoder_output)) - - -# code to convert the checkpoint located in the user-specified location -def convert_checkpoint_to_huggingface(pretrained_checkpoint_path, output_path, includes_tokenizer): - with open(os.path.join(pretrained_checkpoint_path, "model_args.pkl"), "rb") as f: - mega_original_args = pkl.load(f) - - # load the original encoder - original_mlm = OriginalMegaForMaskedLM(**mega_original_args).eval() - - # load its weights - print( - "Original Mega encoder:", - original_mlm.mega.load_state_dict( - torch.load(os.path.join(pretrained_checkpoint_path, "encoder_weights.pt"), map_location="cpu") - ), - ) - print( - "Original Mega MLM layer:", - original_mlm.mlm_head.load_state_dict( - torch.load(os.path.join(pretrained_checkpoint_path, "mlm_head_weights.pt"), map_location="cpu") - ), - ) - - # create a new config from the old one - hf_config = MegaConfig( - num_hidden_layers=mega_original_args["depth"], - vocab_size=mega_original_args["vocab_size"], - hidden_size=mega_original_args["mega_args"].encoder_embed_dim, - shared_representation_size=mega_original_args["mega_args"].encoder_z_dim, - intermediate_size=mega_original_args["mega_args"].encoder_hidden_dim, - ema_projection_size=mega_original_args["mega_args"].encoder_n_dim, - dropout_prob=mega_original_args["mega_args"].dropout, - attention_probs_dropout_prob=mega_original_args["mega_args"].attention_dropout, - hidden_dropout_prob=mega_original_args["mega_args"].hidden_dropout, - activation=mega_original_args["mega_args"].activation_fn, - attention_activation=mega_original_args["mega_args"].attention_activation_fn, - bidirectional=mega_original_args["mega_args"].bidirectional, - use_chunking=mega_original_args["mega_args"].encoder_chunk_size > 0, - chunk_size=mega_original_args["mega_args"].encoder_chunk_size, - truncation=mega_original_args["mega_args"].truncation_length, - normalization_type=mega_original_args["mega_args"].normalization_type, - normalize_before_mega=True, - norm_affine=True, - use_feature_dropout=mega_original_args["mega_args"].feature_dropout, - relative_positional_bias=mega_original_args["mega_args"].rel_pos_bias, - max_positions=mega_original_args["mega_args"].max_source_positions, - nffn_hidden_size=mega_original_args["mega_args"].encoder_ffn_embed_dim, - normalize_before_ffn=mega_original_args["mega_args"].normalize_before, - # new arguments added for HF implementation - nffn_activation_dropout_prob=0.0, - add_token_type_embeddings=False, - add_lm_hidden_dense_layer=False, - ) - - hf_mlm = MegaForMaskedLM(hf_config).eval() - - # the originl checkpoint just uses nn.Embedding for the word embeddings - # we use a wrapper module for embeddings to add support for positional embeddings - hf_mlm.mega.embedding_layer.word_embeddings.weight = original_mlm.mega.embedding_layer.weight - - # modify the state dictionary of the original checkpoint to account for naming issues in the Hugging Face - # ecosystem -- any names containing "beta" or "gamma" aren't safe to use and are renamed upon _load_pretrained, - # also renaming previously confusing parameter names - original_state_dict = original_mlm.mega.encoders.state_dict() - updated_keys = {} - for module_name in original_state_dict.keys(): - new_module_name = None - # have 
to handle gamma, beta, and alpha differently due to their use - # in multiple modules within the original repository; - # beta is used in EMA, MovingAverageGatedAttention, and RotaryRelativePositionalBias, and must be renamed due to flax/tf weights - # the EMA sublayer was renamed from "move" to "ema_gate" for readability, so that is also done here - if "beta" in module_name: - # EMA sub-layers were always called "move" in the original repo - if "move.beta" in module_name: - new_module_name = module_name.replace("move.beta", "ema_gate.ema_expansion_matrix") - elif "mega_layer.beta" in module_name: - new_module_name = module_name.replace("beta", "qk_bias") - else: - new_module_name = module_name.replace("beta", "b_param") - # beta is used in EMA and MovingAverageGatedAttention, and must be renamed due to flax/tf weights - elif "gamma" in module_name: - if "move.gamma" in module_name: - new_module_name = module_name.replace("move.gamma", "ema_gate.kernel_projection_matrix") - elif "mega_layer.gamma" in module_name: - new_module_name = module_name.replace("gamma", "qk_weight") - else: - new_module_name = module_name.replace("gamma", "g_param") - # alpha is used in EMA and positional bias; renaming to improve readability - elif "move.alpha" in module_name: - new_module_name = module_name.replace("move.alpha", "ema_gate.decay_factor") - # delta is only used in EMA; renaming to improve readability - elif "move.delta" in module_name: - new_module_name = module_name.replace("move.delta", "ema_gate.damping_factor") - # omega is only used in EMA; renaming to improve readability - elif "omega" in module_name: - new_module_name = module_name.replace("move.omega", "ema_gate.residual_weight") - - if new_module_name: - updated_keys[module_name] = new_module_name - - if len(updated_keys) != 0: - print(f"Renaming these keys: {updated_keys.keys()}") - else: - print("No need to rename state dict entries") - for old, new in updated_keys.items(): - original_state_dict[new] = original_state_dict.pop(old) - - # now attempt to load the state dictionary with updated names - # note that we now call it `mega.layers` instead of `mega.encoders` due to hugging face style - print("HF Mega encoder:", hf_mlm.mega.layers.load_state_dict(original_state_dict)) - - # load the MLM head weights directly - print( - "HF Mega MLM layer:", - hf_mlm.mlm_head.load_state_dict( - torch.load(os.path.join(pretrained_checkpoint_path, "mlm_head_weights.pt"), map_location="cpu") - ), - ) - - # test on a randomly generated input sequence - input_ids = torch.randint(0, hf_config.vocab_size, size=(4, 256)) - input_mask = torch.ones_like(input_ids) - # mask a few tokens to make sure masking is applied appropriately :) - input_mask[:, -10:] = 0 - - # run forward passes - original_output = original_mlm(input_ids, input_mask, batch_first=True, ignore_mask_value=0) - hf_output = hf_mlm(input_ids, input_mask)[0] - - # print shapes and diff - print(f"original output {original_output.shape}") - print(f"hf output {hf_output.shape}") - print(f"max diff: {(original_output - hf_output).max()}") # 0.0 - success = torch.allclose(original_output, hf_output, atol=1e-3) - - if success: - print("Yay!") - hf_mlm.save_pretrained(output_path) - else: - raise RuntimeError(f"Something's broken :(\nOriginal:\n{original_output}\n\nHF\n{hf_output}\n{hf_mlm}") - - if includes_tokenizer: - print("Transferring tokenizer") - tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint_path) - tokenizer.save_pretrained(output_path) - - -if __name__ == "__main__": - 
parser = argparse.ArgumentParser() - - parser.add_argument( - "--pretrained_checkpoint_path", - default=None, - type=str, - required=True, - help="Point to the directory containing your model weights using the official Mega repo", - ) - - parser.add_argument( - "--output_path", default=None, type=str, required=True, help="Location to save the Hugging Face version" - ) - - parser.add_argument( - "--includes_tokenizer", - action="store_true", - help="Use this flag if there is a Hugging Face tokenizer in the original checkpoint repo", - ) - - args = parser.parse_args() - - convert_checkpoint_to_huggingface(args.pretrained_checkpoint_path, args.output_path, args.includes_tokenizer) diff --git a/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index da7f7806671d..000000000000 --- a/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,70 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""TrajectoryTransformer pytorch checkpoint conversion""" - -import torch -import trajectory.utils as utils - -from transformers import TrajectoryTransformerModel - - -class Parser(utils.Parser): - dataset: str = "halfcheetah-medium-expert-v2" - config: str = "config.offline" - - -def convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch(logbase, dataset, loadpath, epoch, device): - """Converting Sequential blocks to ModuleList""" - - gpt, gpt_epoch = utils.load_model(logbase, dataset, loadpath, epoch=epoch, device=device) - trajectory_transformer = TrajectoryTransformerModel(gpt.config) - - trajectory_transformer.tok_emb.load_state_dict(gpt.tok_emb.state_dict()) - trajectory_transformer.pos_emb = gpt.pos_emb - trajectory_transformer.drop.load_state_dict(gpt.drop.state_dict()) - trajectory_transformer.ln_f.load_state_dict(gpt.ln_f.state_dict()) - trajectory_transformer.head.load_state_dict(gpt.head.state_dict()) - - for i, block in enumerate(gpt.blocks): - trajectory_transformer.blocks[i].ln1.load_state_dict(gpt.blocks[i].ln1.state_dict()) - trajectory_transformer.blocks[i].ln2.load_state_dict(gpt.blocks[i].ln2.state_dict()) - trajectory_transformer.blocks[i].attn.load_state_dict(gpt.blocks[i].attn.state_dict()) - - trajectory_transformer.blocks[i].l1.load_state_dict(gpt.blocks[i].mlp[0].state_dict()) - trajectory_transformer.blocks[i].act.load_state_dict(gpt.blocks[i].mlp[1].state_dict()) - trajectory_transformer.blocks[i].l2.load_state_dict(gpt.blocks[i].mlp[2].state_dict()) - trajectory_transformer.blocks[i].drop.load_state_dict(gpt.blocks[i].mlp[3].state_dict()) - - torch.save(trajectory_transformer.state_dict(), "pytorch_model.bin") - - -if __name__ == "__main__": - """ - To run this script you will need to install the original repository to run the original model. You can find it - here: https://github.com/jannerm/trajectory-transformer From this repository code you can also download the - original pytorch checkpoints. - - Run with the command: - - ```sh - >>> python convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py --dataset - ... --gpt_loadpath - ``` - """ - - args = Parser().parse_args("plan") - convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch( - args.logbase, args.dataset, args.gpt_loadpath, args.gpt_epoch, args.device - ) diff --git a/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index 2c7b687c4d98..000000000000 --- a/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,121 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert Transformer XL checkpoint and datasets.""" - -import argparse -import os -import pickle -import sys - -import torch - -from transformers import TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl -from transformers.models.deprecated.transfo_xl import tokenization_transfo_xl as data_utils -from transformers.models.deprecated.transfo_xl.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES -from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging - - -logging.set_verbosity_info() - -# We do this to be able to load python 2 datasets pickles -# See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 -data_utils.Vocab = data_utils.TransfoXLTokenizer -data_utils.Corpus = data_utils.TransfoXLCorpus -sys.modules["data_utils"] = data_utils -sys.modules["vocabulary"] = data_utils - - -def convert_transfo_xl_checkpoint_to_pytorch( - tf_checkpoint_path, transfo_xl_config_file, pytorch_dump_folder_path, transfo_xl_dataset_file -): - if transfo_xl_dataset_file: - # Convert a pre-processed corpus (see original TensorFlow repo) - with open(transfo_xl_dataset_file, "rb") as fp: - corpus = pickle.load(fp, encoding="latin1") - # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) - pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"] - print(f"Save vocabulary to {pytorch_vocab_dump_path}") - corpus_vocab_dict = corpus.vocab.__dict__ - torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) - - corpus_dict_no_vocab = corpus.__dict__ - corpus_dict_no_vocab.pop("vocab", None) - pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME - print(f"Save dataset to {pytorch_dataset_dump_path}") - torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) - - if tf_checkpoint_path: - # Convert a pre-trained TensorFlow model - config_path = os.path.abspath(transfo_xl_config_file) - tf_path = os.path.abspath(tf_checkpoint_path) - - print(f"Converting Transformer XL checkpoint from {tf_path} with config at {config_path}.") - # Initialise PyTorch model - if transfo_xl_config_file == "": - config = TransfoXLConfig() - else: - config = TransfoXLConfig.from_json_file(transfo_xl_config_file) - print(f"Building PyTorch model from configuration: {config}") - model = TransfoXLLMHeadModel(config) - - model = load_tf_weights_in_transfo_xl(model, config, tf_path) - # Save pytorch-model - pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) - pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) - print(f"Save PyTorch model to {os.path.abspath(pytorch_weights_dump_path)}") - torch.save(model.state_dict(), pytorch_weights_dump_path) - print(f"Save configuration file to {os.path.abspath(pytorch_config_dump_path)}") - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the folder to store the PyTorch model or dataset/vocab.", - ) - parser.add_argument( - "--tf_checkpoint_path", - default="", - type=str, - help="An optional path to a TensorFlow checkpoint path to be converted.", - ) - parser.add_argument( - "--transfo_xl_config_file", - default="", - type=str, - help=( - "An optional config json file corresponding to the pre-trained BERT 
model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--transfo_xl_dataset_file", - default="", - type=str, - help="An optional dataset file to be converted in a vocabulary.\n" - "Given the files are in the pickle format, please be wary of passing it files you trust.", - ) - args = parser.parse_args() - convert_transfo_xl_checkpoint_to_pytorch( - args.tf_checkpoint_path, - args.transfo_xl_config_file, - args.pytorch_dump_folder_path, - args.transfo_xl_dataset_file, - ) diff --git a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py b/src/transformers/models/deprecated/van/convert_van_to_pytorch.py deleted file mode 100644 index 51466e77bae0..000000000000 --- a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py +++ /dev/null @@ -1,290 +0,0 @@ -# coding=utf-8 -# Copyright 2022 BNRist (Tsinghua University), TKLNDST (Nankai University) and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert VAN checkpoints from the original repository. - -URL: https://github.com/Visual-Attention-Network/VAN-Classification""" - -import argparse -import json -import sys -from dataclasses import dataclass, field -from functools import partial -from pathlib import Path -from typing import List - -import torch -import torch.nn as nn -from huggingface_hub import cached_download, hf_hub_download -from torch import Tensor - -from transformers import AutoImageProcessor, VanConfig, VanForImageClassification -from transformers.models.deprecated.van.modeling_van import VanLayerScaling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -@dataclass -class Tracker: - module: nn.Module - traced: List[nn.Module] = field(default_factory=list) - handles: list = field(default_factory=list) - - def _forward_hook(self, m, inputs: Tensor, outputs: Tensor): - has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, nn.Conv2d) or isinstance(m, nn.BatchNorm2d) - if has_not_submodules: - if not isinstance(m, VanLayerScaling): - self.traced.append(m) - - def __call__(self, x: Tensor): - for m in self.module.modules(): - self.handles.append(m.register_forward_hook(self._forward_hook)) - self.module(x) - [x.remove() for x in self.handles] - return self - - @property - def parametrized(self): - # check the len of the state_dict keys to see if we have learnable params - return list(filter(lambda x: len(list(x.state_dict().keys())) > 0, self.traced)) - - -@dataclass -class ModuleTransfer: - src: nn.Module - dest: nn.Module - verbose: int = 0 - src_skip: List = field(default_factory=list) - dest_skip: List = field(default_factory=list) - - def __call__(self, x: Tensor): - """ - Transfer the weights of `self.src` to `self.dest` by performing a forward pass using `x` as input. Under the - hood we tracked all the operations in both modules. 
- """ - dest_traced = Tracker(self.dest)(x).parametrized - src_traced = Tracker(self.src)(x).parametrized - - src_traced = list(filter(lambda x: type(x) not in self.src_skip, src_traced)) - dest_traced = list(filter(lambda x: type(x) not in self.dest_skip, dest_traced)) - - if len(dest_traced) != len(src_traced): - raise Exception( - f"Numbers of operations are different. Source module has {len(src_traced)} operations while" - f" destination module has {len(dest_traced)}." - ) - - for dest_m, src_m in zip(dest_traced, src_traced): - dest_m.load_state_dict(src_m.state_dict()) - if self.verbose == 1: - print(f"Transfered from={src_m} to={dest_m}") - - -def copy_parameters(from_model: nn.Module, our_model: nn.Module) -> nn.Module: - # nn.Parameter cannot be tracked by the Tracker, thus we need to manually convert them - from_state_dict = from_model.state_dict() - our_state_dict = our_model.state_dict() - config = our_model.config - all_keys = [] - for stage_idx in range(len(config.hidden_sizes)): - for block_id in range(config.depths[stage_idx]): - from_key = f"block{stage_idx + 1}.{block_id}.layer_scale_1" - to_key = f"van.encoder.stages.{stage_idx}.layers.{block_id}.attention_scaling.weight" - - all_keys.append((from_key, to_key)) - from_key = f"block{stage_idx + 1}.{block_id}.layer_scale_2" - to_key = f"van.encoder.stages.{stage_idx}.layers.{block_id}.mlp_scaling.weight" - - all_keys.append((from_key, to_key)) - - for from_key, to_key in all_keys: - our_state_dict[to_key] = from_state_dict.pop(from_key) - - our_model.load_state_dict(our_state_dict) - return our_model - - -def convert_weight_and_push( - name: str, - config: VanConfig, - checkpoint: str, - from_model: nn.Module, - save_directory: Path, - push_to_hub: bool = True, -): - print(f"Downloading weights for {name}...") - checkpoint_path = cached_download(checkpoint) - print(f"Converting {name}...") - from_state_dict = torch.load(checkpoint_path)["state_dict"] - from_model.load_state_dict(from_state_dict) - from_model.eval() - with torch.no_grad(): - our_model = VanForImageClassification(config).eval() - module_transfer = ModuleTransfer(src=from_model, dest=our_model) - x = torch.randn((1, 3, 224, 224)) - module_transfer(x) - our_model = copy_parameters(from_model, our_model) - - if not torch.allclose(from_model(x), our_model(x).logits): - raise ValueError("The model logits don't match the original one.") - - checkpoint_name = name - print(checkpoint_name) - - if push_to_hub: - our_model.push_to_hub( - repo_path_or_name=save_directory / checkpoint_name, - commit_message="Add model", - use_temp_dir=True, - ) - - # we can use the convnext one - image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k") - image_processor.push_to_hub( - repo_path_or_name=save_directory / checkpoint_name, - commit_message="Add image processor", - use_temp_dir=True, - ) - - print(f"Pushed {checkpoint_name}") - - -def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True): - filename = "imagenet-1k-id2label.json" - num_labels = 1000 - - repo_id = "huggingface/label-files" - num_labels = num_labels - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - - id2label = id2label - label2id = {v: k for k, v in id2label.items()} - - ImageNetPreTrainedConfig = partial(VanConfig, num_labels=num_labels, id2label=id2label, label2id=label2id) - - names_to_config = { - "van-tiny": ImageNetPreTrainedConfig( - 
hidden_sizes=[32, 64, 160, 256], - depths=[3, 3, 5, 2], - mlp_ratios=[8, 8, 4, 4], - ), - "van-small": ImageNetPreTrainedConfig( - hidden_sizes=[64, 128, 320, 512], - depths=[2, 2, 4, 2], - mlp_ratios=[8, 8, 4, 4], - ), - "van-base": ImageNetPreTrainedConfig( - hidden_sizes=[64, 128, 320, 512], - depths=[3, 3, 12, 3], - mlp_ratios=[8, 8, 4, 4], - ), - "van-large": ImageNetPreTrainedConfig( - hidden_sizes=[64, 128, 320, 512], - depths=[3, 5, 27, 3], - mlp_ratios=[8, 8, 4, 4], - ), - } - - names_to_original_models = { - "van-tiny": van_tiny, - "van-small": van_small, - "van-base": van_base, - "van-large": van_large, - } - - names_to_original_checkpoints = { - "van-tiny": ( - "https://huggingface.co/Visual-Attention-Network/VAN-Tiny-original/resolve/main/van_tiny_754.pth.tar" - ), - "van-small": ( - "https://huggingface.co/Visual-Attention-Network/VAN-Small-original/resolve/main/van_small_811.pth.tar" - ), - "van-base": ( - "https://huggingface.co/Visual-Attention-Network/VAN-Base-original/resolve/main/van_base_828.pth.tar" - ), - "van-large": ( - "https://huggingface.co/Visual-Attention-Network/VAN-Large-original/resolve/main/van_large_839.pth.tar" - ), - } - - if model_name: - convert_weight_and_push( - model_name, - names_to_config[model_name], - checkpoint=names_to_original_checkpoints[model_name], - from_model=names_to_original_models[model_name](), - save_directory=save_directory, - push_to_hub=push_to_hub, - ) - else: - for model_name, config in names_to_config.items(): - convert_weight_and_push( - model_name, - config, - checkpoint=names_to_original_checkpoints[model_name], - from_model=names_to_original_models[model_name](), - save_directory=save_directory, - push_to_hub=push_to_hub, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model-name", - default=None, - type=str, - help=( - "The name of the model you wish to convert, it must be one of the supported resnet* architecture," - " currently: van-tiny/small/base/large. If `None`, all of them will the converted." - ), - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=Path, - required=True, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--van_dir", - required=True, - type=Path, - help=( - "A path to VAN's original implementation directory. You can download from here:" - " https://github.com/Visual-Attention-Network/VAN-Classification" - ), - ) - parser.add_argument( - "--push_to_hub", - default=True, - type=bool, - required=False, - help="If True, push model and image processor to the hub.", - ) - - args = parser.parse_args() - pytorch_dump_folder_path: Path = args.pytorch_dump_folder_path - pytorch_dump_folder_path.mkdir(exist_ok=True, parents=True) - van_dir = args.van_dir - # append the path to the parents to maskformer dir - sys.path.append(str(van_dir.parent)) - from van.models.van import van_base, van_large, van_small, van_tiny - - convert_weights_and_push(pytorch_dump_folder_path, args.model_name, args.push_to_hub) diff --git a/src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py deleted file mode 100644 index 1d717d74c961..000000000000 --- a/src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ /dev/null @@ -1,282 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ViT hybrid checkpoints from the timm library.""" - -import argparse -import json -from pathlib import Path - -import requests -import timm -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from timm.data import resolve_data_config -from timm.data.transforms_factory import create_transform - -from transformers import ( - BitConfig, - ViTHybridConfig, - ViTHybridForImageClassification, - ViTHybridImageProcessor, - ViTHybridModel, -) -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, base_model=False): - rename_keys = [] - - # fmt: off - # stem: - rename_keys.append(("cls_token", "vit.embeddings.cls_token")) - rename_keys.append(("pos_embed", "vit.embeddings.position_embeddings")) - - rename_keys.append(("patch_embed.proj.weight", "vit.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "vit.embeddings.patch_embeddings.projection.bias")) - - # backbone - rename_keys.append(("patch_embed.backbone.stem.conv.weight", "vit.embeddings.patch_embeddings.backbone.bit.embedder.convolution.weight")) - rename_keys.append(("patch_embed.backbone.stem.norm.weight", "vit.embeddings.patch_embeddings.backbone.bit.embedder.norm.weight")) - rename_keys.append(("patch_embed.backbone.stem.norm.bias", "vit.embeddings.patch_embeddings.backbone.bit.embedder.norm.bias")) - - for stage_idx in range(len(config.backbone_config.depths)): - for layer_idx in range(config.backbone_config.depths[stage_idx]): - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv1.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.conv1.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm1.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm1.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm1.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm1.bias")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv2.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.conv2.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm2.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm2.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm2.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm2.bias")) - 
rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv3.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.conv3.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm3.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm3.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm3.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm3.bias")) - - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.conv.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.0.downsample.conv.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.norm.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.0.downsample.norm.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.norm.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.0.downsample.norm.bias")) - - # transformer encoder - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"vit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"vit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"vit.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"vit.encoder.layer.{i}.attention.output.dense.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"vit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"vit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"vit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"vit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"vit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"vit.encoder.layer.{i}.output.dense.bias")) - - if base_model: - # layernorm + pooler - rename_keys.extend( - [ - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ("pre_logits.fc.weight", "pooler.dense.weight"), - ("pre_logits.fc.bias", "pooler.dense.bias"), - ] - ) - - # if just the base model, we should remove "vit" from all keys that start with "vit" - rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("vit") else pair for pair in rename_keys] - else: - # layernorm + classification head - rename_keys.extend( - [ - ("norm.weight", "vit.layernorm.weight"), - ("norm.bias", "vit.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - # fmt: on - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, base_model=False): - for i in range(config.num_hidden_layers): - if base_model: - prefix = "" - else: - prefix = "vit." 
- # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -def remove_classification_head_(state_dict): - ignore_keys = ["head.weight", "head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our ViT structure. - """ - - # define default ViT hybrid configuration - backbone_config = BitConfig( - global_padding="same", - layer_type="bottleneck", - depths=(3, 4, 9), - out_features=["stage3"], - embedding_dynamic_padding=True, - ) - config = ViTHybridConfig(backbone_config=backbone_config, image_size=384, num_labels=1000) - base_model = False - - # load original model from timm - timm_model = timm.create_model(vit_name, pretrained=True) - timm_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = timm_model.state_dict() - if base_model: - remove_classification_head_(state_dict) - rename_keys = create_rename_keys(config, base_model) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, base_model) - - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load HuggingFace model - if vit_name[-5:] == "in21k": - model = ViTHybridModel(config).eval() - else: - model = ViTHybridForImageClassification(config).eval() - model.load_state_dict(state_dict) - - # create image processor - transform = create_transform(**resolve_data_config({}, model=timm_model)) - timm_transforms = transform.transforms - - pillow_resamplings = { - "bilinear": PILImageResampling.BILINEAR, - "bicubic": PILImageResampling.BICUBIC, - "nearest": PILImageResampling.NEAREST, - } - - processor = ViTHybridImageProcessor( - do_resize=True, - size={"shortest_edge": timm_transforms[0].size}, - resample=pillow_resamplings[timm_transforms[0].interpolation.value], - do_center_crop=True, - crop_size={"height": timm_transforms[1].size[0], "width": timm_transforms[1].size[1]}, - 
do_normalize=True, - image_mean=timm_transforms[-1].mean.tolist(), - image_std=timm_transforms[-1].std.tolist(), - ) - - image = prepare_img() - timm_pixel_values = transform(image).unsqueeze(0) - pixel_values = processor(image, return_tensors="pt").pixel_values - - # verify pixel values - assert torch.allclose(timm_pixel_values, pixel_values) - - # verify logits - with torch.no_grad(): - outputs = model(pixel_values) - logits = outputs.logits - - print("Predicted class:", logits.argmax(-1).item()) - if base_model: - timm_pooled_output = timm_model.forward_features(pixel_values) - assert timm_pooled_output.shape == outputs.pooler_output.shape - assert torch.allclose(timm_pooled_output, outputs.pooler_output, atol=1e-3) - else: - timm_logits = timm_model(pixel_values) - assert timm_logits.shape == outputs.logits.shape - assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {vit_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and processor to the hub {vit_name}") - model.push_to_hub(f"ybelkada/{vit_name}") - processor.push_to_hub(f"ybelkada/{vit_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--vit_name", - default="vit_base_r50_s16_384", - type=str, - help="Name of the hybrid ViT timm model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether to upload the model to the HuggingFace hub." - ) - - args = parser.parse_args() - convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py b/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py deleted file mode 100644 index 5c6da13ae885..000000000000 --- a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py +++ /dev/null @@ -1,368 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Depth Anything checkpoints from the original repository. 
URL: -https://github.com/LiheYoung/Depth-Anything""" - -import argparse -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation, Dinov2Config, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(model_name): - if "small" in model_name: - out_indices = [3, 6, 9, 12] if "v2" in model_name else [9, 10, 11, 12] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-small", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 64 - neck_hidden_sizes = [48, 96, 192, 384] - elif "base" in model_name: - out_indices = [3, 6, 9, 12] if "v2" in model_name else [9, 10, 11, 12] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-base", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 128 - neck_hidden_sizes = [96, 192, 384, 768] - elif "large" in model_name: - out_indices = [5, 12, 18, 24] if "v2" in model_name else [21, 22, 23, 24] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-large", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 256 - neck_hidden_sizes = [256, 512, 1024, 1024] - else: - raise NotImplementedError(f"Model not supported: {model_name}") - - if "metric" in model_name: - depth_estimation_type = "metric" - max_depth = 20 if "indoor" in model_name else 80 - else: - depth_estimation_type = "relative" - max_depth = None - - config = DepthAnythingConfig( - reassemble_hidden_size=backbone_config.hidden_size, - patch_size=backbone_config.patch_size, - backbone_config=backbone_config, - fusion_hidden_size=fusion_hidden_size, - neck_hidden_sizes=neck_hidden_sizes, - depth_estimation_type=depth_estimation_type, - max_depth=max_depth, - ) - - return config - - -def create_rename_keys(config): - rename_keys = [] - - # fmt: off - # stem - rename_keys.append(("pretrained.cls_token", "backbone.embeddings.cls_token")) - rename_keys.append(("pretrained.mask_token", "backbone.embeddings.mask_token")) - rename_keys.append(("pretrained.pos_embed", "backbone.embeddings.position_embeddings")) - rename_keys.append(("pretrained.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("pretrained.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - - # Transfomer encoder - for i in range(config.backbone_config.num_hidden_layers): - rename_keys.append((f"pretrained.blocks.{i}.ls1.gamma", f"backbone.encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"pretrained.blocks.{i}.ls2.gamma", f"backbone.encoder.layer.{i}.layer_scale2.lambda1")) - rename_keys.append((f"pretrained.blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"pretrained.blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"pretrained.blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"pretrained.blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.norm2.bias")) - rename_keys.append((f"pretrained.blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"pretrained.blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.mlp.fc1.bias")) - 
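The (src, dest) pairs being accumulated here are applied later by popping each old key and re-inserting its tensor under the new name. A minimal, self-contained sketch of that pattern, using made-up keys and shapes rather than a real checkpoint:

# Sketch of the rename-pair pattern used by these conversion scripts.
# The state dict and key names below are illustrative placeholders only.
import torch

state_dict = {
    "pretrained.cls_token": torch.zeros(1, 1, 384),
    "pretrained.norm.weight": torch.ones(384),
}
rename_pairs = [
    ("pretrained.cls_token", "backbone.embeddings.cls_token"),
    ("pretrained.norm.weight", "backbone.layernorm.weight"),
]

for old, new in rename_pairs:
    state_dict[new] = state_dict.pop(old)  # move the tensor under its new name

print(sorted(state_dict.keys()))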
rename_keys.append((f"pretrained.blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"pretrained.blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.mlp.fc2.bias")) - rename_keys.append((f"pretrained.blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"pretrained.blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias")) - - # Head - rename_keys.append(("pretrained.norm.weight", "backbone.layernorm.weight")) - rename_keys.append(("pretrained.norm.bias", "backbone.layernorm.bias")) - - # activation postprocessing (readout projections + resize blocks) - # Depth Anything does not use CLS token => readout_projects not required - - for i in range(4): - rename_keys.append((f"depth_head.projects.{i}.weight", f"neck.reassemble_stage.layers.{i}.projection.weight")) - rename_keys.append((f"depth_head.projects.{i}.bias", f"neck.reassemble_stage.layers.{i}.projection.bias")) - - if i != 2: - rename_keys.append((f"depth_head.resize_layers.{i}.weight", f"neck.reassemble_stage.layers.{i}.resize.weight")) - rename_keys.append((f"depth_head.resize_layers.{i}.bias", f"neck.reassemble_stage.layers.{i}.resize.bias")) - - # refinenet (tricky here) - mapping = {1:3, 2:2, 3:1, 4:0} - - for i in range(1, 5): - j = mapping[i] - rename_keys.append((f"depth_head.scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias")) - - # scratch convolutions - for i in range(4): - rename_keys.append((f"depth_head.scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight")) - - # head - rename_keys.append(("depth_head.scratch.output_conv1.weight", "head.conv1.weight")) - rename_keys.append(("depth_head.scratch.output_conv1.bias", "head.conv1.bias")) - rename_keys.append(("depth_head.scratch.output_conv2.0.weight", "head.conv2.weight")) - rename_keys.append(("depth_head.scratch.output_conv2.0.bias", "head.conv2.bias")) - rename_keys.append(("depth_head.scratch.output_conv2.2.weight", "head.conv3.weight")) - rename_keys.append(("depth_head.scratch.output_conv2.2.bias", "head.conv3.bias")) - - return rename_keys - - -# we split up the matrix of each 
encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - hidden_size = config.backbone_config.hidden_size - for i in range(config.backbone_config.num_hidden_layers): - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"pretrained.blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"pretrained.blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[:hidden_size] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - hidden_size : hidden_size * 2 - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-hidden_size:] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -name_to_checkpoint = { - "depth-anything-small": "pytorch_model.bin", - "depth-anything-base": "pytorch_model.bin", - "depth-anything-large": "pytorch_model.bin", - "depth-anything-v2-small": "depth_anything_v2_vits.pth", - "depth-anything-v2-base": "depth_anything_v2_vitb.pth", - "depth-anything-v2-large": "depth_anything_v2_vitl.pth", - "depth-anything-v2-metric-indoor-small": "depth_anything_v2_metric_hypersim_vits.pth", - "depth-anything-v2-metric-indoor-base": "depth_anything_v2_metric_hypersim_vitb.pth", - "depth-anything-v2-metric-indoor-large": "depth_anything_v2_metric_hypersim_vitl.pth", - "depth-anything-v2-metric-outdoor-small": "depth_anything_v2_metric_vkitti_vits.pth", - "depth-anything-v2-metric-outdoor-base": "depth_anything_v2_metric_vkitti_vitb.pth", - "depth-anything-v2-metric-outdoor-large": "depth_anything_v2_metric_vkitti_vitl.pth", - # v2-giant pending -} - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits): - """ - Copy/paste/tweak model's weights to our DPT structure. 
- """ - - # define DPT configuration - config = get_dpt_config(model_name) - - model_name_to_repo = { - "depth-anything-small": "LiheYoung/depth_anything_vits14", - "depth-anything-base": "LiheYoung/depth_anything_vitb14", - "depth-anything-large": "LiheYoung/depth_anything_vitl14", - "depth-anything-v2-small": "depth-anything/Depth-Anything-V2-Small", - "depth-anything-v2-base": "depth-anything/Depth-Anything-V2-Base", - "depth-anything-v2-large": "depth-anything/Depth-Anything-V2-Large", - "depth-anything-v2-metric-indoor-small": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Small", - "depth-anything-v2-metric-indoor-base": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Base", - "depth-anything-v2-metric-indoor-large": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Large", - "depth-anything-v2-metric-outdoor-small": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Small", - "depth-anything-v2-metric-outdoor-base": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Base", - "depth-anything-v2-metric-outdoor-large": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Large", - } - - # load original state_dict - repo_id = model_name_to_repo[model_name] - filename = name_to_checkpoint[model_name] - filepath = hf_hub_download( - repo_id=repo_id, - filename=f"{filename}", - ) - - state_dict = torch.load(filepath, map_location="cpu") - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - # read in qkv matrices - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = DepthAnythingForDepthEstimation(config) - model.load_state_dict(state_dict) - model.eval() - - processor = DPTImageProcessor( - do_resize=True, - size={"height": 518, "width": 518}, - ensure_multiple_of=14, - keep_aspect_ratio=True, - do_rescale=True, - do_normalize=True, - image_mean=[0.485, 0.456, 0.406], - image_std=[0.229, 0.224, 0.225], - ) - - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - pixel_values = processor(image, return_tensors="pt").pixel_values - - # Verify forward pass - with torch.no_grad(): - outputs = model(pixel_values) - predicted_depth = outputs.predicted_depth - - print("Shape of predicted depth:", predicted_depth.shape) - print("First values:", predicted_depth[0, :3, :3]) - - # assert logits - if verify_logits: - expected_shape = torch.Size([1, 518, 686]) - if model_name == "depth-anything-small": - expected_slice = torch.tensor( - [[8.8204, 8.6468, 8.6195], [8.3313, 8.6027, 8.7526], [8.6526, 8.6866, 8.7453]], - ) - elif model_name == "depth-anything-base": - expected_slice = torch.tensor( - [[26.3997, 26.3004, 26.3928], [26.2260, 26.2092, 26.3427], [26.0719, 26.0483, 26.1254]], - ) - elif model_name == "depth-anything-large": - expected_slice = torch.tensor( - [[87.9968, 87.7493, 88.2704], [87.1927, 87.6611, 87.3640], [86.7789, 86.9469, 86.7991]] - ) - elif model_name == "depth-anything-v2-small": - expected_slice = torch.tensor( - [[2.6751, 2.6211, 2.6571], [2.5820, 2.6138, 2.6271], [2.6160, 2.6141, 2.6306]] - ) - elif model_name == "depth-anything-v2-base": - expected_slice = torch.tensor( - [[4.3576, 4.3723, 4.3908], [4.3231, 4.3146, 4.3611], [4.3016, 4.3170, 4.3121]] - ) - elif model_name == "depth-anything-v2-large": - expected_slice = torch.tensor( - [[162.2751, 161.8504, 162.8788], [160.3138, 160.8050, 161.9835], [159.3812, 159.9884, 160.0768]] - ) - elif model_name == "depth-anything-v2-metric-indoor-small": - expected_slice = 
torch.tensor( - [[1.3349, 1.2946, 1.2801], [1.2793, 1.2337, 1.2899], [1.2629, 1.2218, 1.2476]] - ) - elif model_name == "depth-anything-v2-metric-indoor-base": - expected_slice = torch.tensor( - [[1.4601, 1.3824, 1.4904], [1.5031, 1.4349, 1.4274], [1.4570, 1.4578, 1.4200]] - ) - elif model_name == "depth-anything-v2-metric-indoor-large": - expected_slice = torch.tensor( - [[1.5040, 1.5019, 1.5218], [1.5087, 1.5195, 1.5149], [1.5437, 1.5128, 1.5252]] - ) - elif model_name == "depth-anything-v2-metric-outdoor-small": - expected_slice = torch.tensor( - [[9.5804, 8.0339, 7.7386], [7.9890, 7.2464, 7.7149], [7.7021, 7.2330, 7.3304]] - ) - elif model_name == "depth-anything-v2-metric-outdoor-base": - expected_slice = torch.tensor( - [[10.2916, 9.0933, 8.8622], [9.1964, 9.3393, 9.0644], [8.9618, 9.4201, 9.2262]] - ) - elif model_name == "depth-anything-v2-metric-outdoor-large": - expected_slice = torch.tensor( - [[14.0137, 13.3627, 13.1080], [13.2522, 13.3943, 13.3705], [13.0581, 13.4505, 13.3925]] - ) - else: - raise ValueError("Not supported") - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"{model_name.title()}-hf") - processor.push_to_hub(repo_id=f"{model_name.title()}-hf") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="depth-anything-small", - type=str, - choices=name_to_checkpoint.keys(), - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - parser.add_argument( - "--verify_logits", - action="store_false", - required=False, - help="Whether to verify the logits after conversion.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits) diff --git a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index ba985145014c..000000000000 --- a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,277 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
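The read_in_q_k_v helpers above all follow the same recipe: the original checkpoints store one fused qkv projection per layer, while the converted model expects separate query, key and value weights, so the fused matrix is sliced row-wise into three equal blocks. A standalone sketch with random tensors (hidden_size=384 is just an example value):

# Standalone sketch of splitting a fused qkv projection into q/k/v, in the
# spirit of the read_in_q_k_v helpers above. Shapes are illustrative only.
import torch

hidden_size = 384
in_proj_weight = torch.randn(3 * hidden_size, hidden_size)  # fused [q; k; v] rows
in_proj_bias = torch.randn(3 * hidden_size)

query_weight = in_proj_weight[:hidden_size, :]
key_weight = in_proj_weight[hidden_size : hidden_size * 2, :]
value_weight = in_proj_weight[-hidden_size:, :]

query_bias = in_proj_bias[:hidden_size]
key_bias = in_proj_bias[hidden_size : hidden_size * 2]
value_bias = in_proj_bias[-hidden_size:]

# the three row slices together reconstruct the fused matrix
assert torch.equal(torch.cat([query_weight, key_weight, value_weight], dim=0), in_proj_weight)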
-"""Convert DETR checkpoints with timm backbone.""" - -import argparse -import json -from collections import OrderedDict -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -# here we list all keys to be renamed (original name on the left, our name on the right) -rename_keys = [] -for i in range(6): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight") - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) - # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight", - f"decoder.layers.{i}.encoder_attn.out_proj.weight", - ) - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias", - f"decoder.layers.{i}.encoder_attn.out_proj.bias", - ) - ) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", 
f"decoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) - -# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads -rename_keys.extend( - [ - ("input_proj.weight", "input_projection.weight"), - ("input_proj.bias", "input_projection.bias"), - ("query_embed.weight", "query_position_embeddings.weight"), - ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), - ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), - ("class_embed.weight", "class_labels_classifier.weight"), - ("class_embed.bias", "class_labels_classifier.bias"), - ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), - ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), - ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), - ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), - ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), - ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), - ] -) - - -def rename_key(state_dict, old, new): - val = state_dict.pop(old) - state_dict[new] = val - - -def rename_backbone_keys(state_dict): - new_state_dict = OrderedDict() - for key, value in state_dict.items(): - if "backbone.0.body" in key: - new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model") - new_state_dict[new_key] = value - else: - new_state_dict[key] = value - - return new_state_dict - - -def read_in_q_k_v(state_dict, is_panoptic=False): - prefix = "" - if is_panoptic: - prefix = "detr." - - # first: transformer encoder - for i in range(6): - # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # next: transformer decoder (which is a bit more complex because it also includes cross-attention) - for i in range(6): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # read in weights 
+ bias of input projection layer of cross-attention - in_proj_weight_cross_attn = state_dict.pop( - f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight" - ) - in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias") - # next, add query, keys and values (in that order) of cross-attention to the state dict - state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :] - state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256] - state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :] - state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512] - state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :] - state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_detr_checkpoint(model_name, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our DETR structure. - """ - - # load default config - config = DetrConfig() - # set backbone and dilation attributes - if "resnet101" in model_name: - config.backbone = "resnet101" - if "dc5" in model_name: - config.dilation = True - is_panoptic = "panoptic" in model_name - if is_panoptic: - config.num_labels = 250 - else: - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load image processor - format = "coco_panoptic" if is_panoptic else "coco_detection" - image_processor = DetrImageProcessor(format=format) - - # prepare image - img = prepare_img() - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info(f"Converting model {model_name}...") - - # load original model from torch hub - detr = torch.hub.load("facebookresearch/detr", model_name, pretrained=True).eval() - state_dict = detr.state_dict() - # rename keys - for src, dest in rename_keys: - if is_panoptic: - src = "detr." + src - rename_key(state_dict, src, dest) - state_dict = rename_backbone_keys(state_dict) - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict, is_panoptic=is_panoptic) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "detr.model." if is_panoptic else "model." - for key in state_dict.copy().keys(): - if is_panoptic: - if ( - key.startswith("detr") - and not key.startswith("class_labels_classifier") - and not key.startswith("bbox_predictor") - ): - val = state_dict.pop(key) - state_dict["detr.model" + key[4:]] = val - elif "class_labels_classifier" in key or "bbox_predictor" in key: - val = state_dict.pop(key) - state_dict["detr." 
+ key] = val - elif key.startswith("bbox_attention") or key.startswith("mask_head"): - continue - else: - val = state_dict.pop(key) - state_dict[prefix + key] = val - else: - if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - # verify our conversion - original_outputs = detr(pixel_values) - outputs = model(pixel_values) - assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4) - assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4) - if is_panoptic: - assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) - - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", default="detr_resnet50", type=str, help="Name of the DETR model you'd like to convert." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - args = parser.parse_args() - convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/detr/convert_detr_to_pytorch.py b/src/transformers/models/detr/convert_detr_to_pytorch.py deleted file mode 100644 index 6ba6a0e2920a..000000000000 --- a/src/transformers/models/detr/convert_detr_to_pytorch.py +++ /dev/null @@ -1,385 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
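The re-prefixing loop just above exists because, as its comment notes, the head models keep the base model under a different attribute: the detection head classes hold the base DETR model under a `model` (or `detr.model`) attribute while the classification and box heads sit at the top level. A reduced sketch of that step, with placeholder keys and tensors rather than a real DETR checkpoint:

# Reduced sketch of the prefixing step above: base-model weights gain a
# "model." prefix while the detection heads keep their top-level names.
# Keys and tensors are placeholders, not a real DETR checkpoint.
import torch

state_dict = {
    "encoder.layers.0.fc1.weight": torch.zeros(2048, 256),
    "class_labels_classifier.weight": torch.zeros(92, 256),
    "bbox_predictor.layers.0.weight": torch.zeros(256, 256),
}

prefix = "model."
for key in list(state_dict.keys()):
    if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
        state_dict[prefix + key] = state_dict.pop(key)

print(sorted(state_dict.keys()))
# ['bbox_predictor.layers.0.weight', 'class_labels_classifier.weight',
#  'model.encoder.layers.0.fc1.weight']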
-"""Convert DETR checkpoints with native (Transformers) backbone.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor, ResNetConfig -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_detr_config(model_name): - # initialize config - if "resnet-50" in model_name: - backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-50") - elif "resnet-101" in model_name: - backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-101") - else: - raise ValueError("Model name should include either resnet50 or resnet101") - - config = DetrConfig(use_timm_backbone=False, backbone_config=backbone_config) - - # set label attributes - is_panoptic = "panoptic" in model_name - if is_panoptic: - config.num_labels = 250 - else: - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config, is_panoptic - - -def create_rename_keys(config): - # here we list all keys to be renamed (original name on the left, our name on the right) - rename_keys = [] - - # stem - # fmt: off - rename_keys.append(("backbone.0.body.conv1.weight", "backbone.conv_encoder.model.embedder.embedder.convolution.weight")) - rename_keys.append(("backbone.0.body.bn1.weight", "backbone.conv_encoder.model.embedder.embedder.normalization.weight")) - rename_keys.append(("backbone.0.body.bn1.bias", "backbone.conv_encoder.model.embedder.embedder.normalization.bias")) - rename_keys.append(("backbone.0.body.bn1.running_mean", "backbone.conv_encoder.model.embedder.embedder.normalization.running_mean")) - rename_keys.append(("backbone.0.body.bn1.running_var", "backbone.conv_encoder.model.embedder.embedder.normalization.running_var")) - # stages - for stage_idx in range(len(config.backbone_config.depths)): - for layer_idx in range(config.backbone_config.depths[stage_idx]): - # shortcut - if layer_idx == 0: - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.0.weight", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.weight", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.bias", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_mean", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_var", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_var", - ) - ) - # 3 convs - for i in range(3): - rename_keys.append( - ( - 
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.conv{i+1}.weight", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.weight", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.bias", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_mean", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_var", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_var", - ) - ) - # fmt: on - - for i in range(config.encoder_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append( - ( - f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", - f"encoder.layers.{i}.self_attn.out_proj.weight", - ) - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias") - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) - # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", - f"decoder.layers.{i}.self_attn.out_proj.weight", - ) - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight", - f"decoder.layers.{i}.encoder_attn.out_proj.weight", - ) - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias", - f"decoder.layers.{i}.encoder_attn.out_proj.bias", - ) - ) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) - rename_keys.append( - 
(f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) - - # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads - rename_keys.extend( - [ - ("input_proj.weight", "input_projection.weight"), - ("input_proj.bias", "input_projection.bias"), - ("query_embed.weight", "query_position_embeddings.weight"), - ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), - ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), - ("class_embed.weight", "class_labels_classifier.weight"), - ("class_embed.bias", "class_labels_classifier.bias"), - ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), - ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), - ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), - ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), - ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), - ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), - ] - ) - - return rename_keys - - -def rename_key(state_dict, old, new): - val = state_dict.pop(old) - state_dict[new] = val - - -def read_in_q_k_v(state_dict, is_panoptic=False): - prefix = "" - if is_panoptic: - prefix = "detr." 
- - # first: transformer encoder - for i in range(6): - # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # next: transformer decoder (which is a bit more complex because it also includes cross-attention) - for i in range(6): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # read in weights + bias of input projection layer of cross-attention - in_proj_weight_cross_attn = state_dict.pop( - f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight" - ) - in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias") - # next, add query, keys and values (in that order) of cross-attention to the state dict - state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :] - state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256] - state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :] - state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512] - state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :] - state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_detr_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our DETR structure. 
- """ - - # load default config - config, is_panoptic = get_detr_config(model_name) - - # load original model from torch hub - model_name_to_original_name = { - "detr-resnet-50": "detr_resnet50", - "detr-resnet-101": "detr_resnet101", - } - logger.info(f"Converting model {model_name}...") - detr = torch.hub.load("facebookresearch/detr", model_name_to_original_name[model_name], pretrained=True).eval() - state_dict = detr.state_dict() - # rename keys - for src, dest in create_rename_keys(config): - if is_panoptic: - src = "detr." + src - rename_key(state_dict, src, dest) - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict, is_panoptic=is_panoptic) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "detr.model." if is_panoptic else "model." - for key in state_dict.copy().keys(): - if is_panoptic: - if ( - key.startswith("detr") - and not key.startswith("class_labels_classifier") - and not key.startswith("bbox_predictor") - ): - val = state_dict.pop(key) - state_dict["detr.model" + key[4:]] = val - elif "class_labels_classifier" in key or "bbox_predictor" in key: - val = state_dict.pop(key) - state_dict["detr." + key] = val - elif key.startswith("bbox_attention") or key.startswith("mask_head"): - continue - else: - val = state_dict.pop(key) - state_dict[prefix + key] = val - else: - if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - - # finally, create HuggingFace model and load state dict - model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - # verify our conversion on an image - format = "coco_panoptic" if is_panoptic else "coco_detection" - processor = DetrImageProcessor(format=format) - - encoding = processor(images=prepare_img(), return_tensors="pt") - pixel_values = encoding["pixel_values"] - - original_outputs = detr(pixel_values) - outputs = model(pixel_values) - - assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-3) - assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-3) - if is_panoptic: - assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - # Upload model and image processor to the hub - logger.info("Uploading PyTorch model and image processor to the hub...") - model.push_to_hub(f"nielsr/{model_name}") - processor.push_to_hub(f"nielsr/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", - default="detr-resnet-50", - type=str, - choices=["detr-resnet-50", "detr-resnet-101"], - help="Name of the DETR model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." 
- ) - parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to the hub or not.") - args = parser.parse_args() - convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index fbf34012924b..000000000000 --- a/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -import torch - -from transformers.utils import WEIGHTS_NAME - - -DIALOGPT_MODELS = ["small", "medium", "large"] - -OLD_KEY = "lm_head.decoder.weight" -NEW_KEY = "lm_head.weight" - - -def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str): - d = torch.load(checkpoint_path) - d[NEW_KEY] = d.pop(OLD_KEY) - os.makedirs(pytorch_dump_folder_path, exist_ok=True) - torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--dialogpt_path", default=".", type=str) - args = parser.parse_args() - for MODEL in DIALOGPT_MODELS: - checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl") - pytorch_dump_folder_path = f"./DialoGPT-{MODEL}" - convert_dialogpt_checkpoint( - checkpoint_path, - pytorch_dump_folder_path, - ) diff --git a/src/transformers/models/dinov2/convert_dinov2_to_hf.py b/src/transformers/models/dinov2/convert_dinov2_to_hf.py deleted file mode 100644 index d716191b2fcb..000000000000 --- a/src/transformers/models/dinov2/convert_dinov2_to_hf.py +++ /dev/null @@ -1,285 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DINOv2 checkpoints from the original repository. 
- -URL: https://github.com/facebookresearch/dinov2/tree/main -""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -import torch.nn as nn -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import BitImageProcessor, Dinov2Config, Dinov2ForImageClassification, Dinov2Model -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dinov2_config(model_name, image_classifier=False): - config = Dinov2Config(image_size=518, patch_size=14) - - # size of the architecture - if "vits" in model_name: - config.hidden_size = 384 - config.num_attention_heads = 6 - elif "vitb" in model_name: - pass - elif "vitl" in model_name: - config.hidden_size = 1024 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - elif "vitg" in model_name: - config.use_swiglu_ffn = True - config.hidden_size = 1536 - config.num_hidden_layers = 40 - config.num_attention_heads = 24 - else: - raise ValueError("Model not supported") - - if image_classifier: - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - config.num_labels = 1000 - config.id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - config.id2label = {int(k): v for k, v in config.id2label.items()} - - return config - - -def create_rename_keys(config): - rename_keys = [] - # fmt: off - - # patch embedding layer - rename_keys.append(("cls_token", "embeddings.cls_token")) - rename_keys.append(("mask_token", "embeddings.mask_token")) - rename_keys.append(("pos_embed", "embeddings.position_embeddings")) - rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) - - for i in range(config.num_hidden_layers): - # layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) - # MLP - if config.use_swiglu_ffn: - rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) - rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) - rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) - rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) - else: - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) - # layerscale - rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) - # attention projection layer - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", 
f"encoder.layer.{i}.attention.output.dense.bias")) - - # final layernorm - rename_keys.append(("norm.weight", "layernorm.weight")) - rename_keys.append(("norm.bias", "layernorm.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :] - state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image - - -@torch.no_grad() -def convert_dinov2_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our DINOv2 structure. 
- """ - - # define default Dinov2 configuration - image_classifier = "1layer" in model_name - config = get_dinov2_config(model_name, image_classifier=image_classifier) - - # load original model from torch hub - original_model = torch.hub.load("facebookresearch/dinov2", model_name.replace("_1layer", "")) - original_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config) - - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if "w12" in key: - key = key.replace("w12", "weights_in") - if "w3" in key: - key = key.replace("w3", "weights_out") - state_dict[key] = val - - # load HuggingFace model - if image_classifier: - model = Dinov2ForImageClassification(config).eval() - model.dinov2.load_state_dict(state_dict) - model_name_to_classifier_dict_url = { - "dinov2_vits14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_linear_head.pth", - "dinov2_vitb14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_linear_head.pth", - "dinov2_vitl14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_linear_head.pth", - "dinov2_vitg14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_linear_head.pth", - } - url = model_name_to_classifier_dict_url[model_name] - classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") - model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) - model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) - else: - model = Dinov2Model(config).eval() - model.load_state_dict(state_dict) - - # load image - image = prepare_img() - - # preprocess image - transformations = transforms.Compose( - [ - transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values - std=IMAGENET_DEFAULT_STD, # across a large photo dataset. 
- ), - ] - ) - - original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension - - processor = BitImageProcessor( - size={"shortest_edge": 256}, - resample=PILImageResampling.BICUBIC, - image_mean=IMAGENET_DEFAULT_MEAN, - image_std=IMAGENET_DEFAULT_STD, - ) - pixel_values = processor(image, return_tensors="pt").pixel_values - - assert torch.allclose(original_pixel_values, pixel_values) - - with torch.no_grad(): - outputs = model(pixel_values, output_hidden_states=True) - original_outputs = original_model(pixel_values) - - # assert values - if image_classifier: - print("Predicted class:") - class_idx = outputs.logits.argmax(-1).item() - print(model.config.id2label[class_idx]) - else: - assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape - assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model_name_to_hf_name = { - "dinov2_vits14": "dinov2-small", - "dinov2_vitb14": "dinov2-base", - "dinov2_vitl14": "dinov2-large", - "dinov2_vitg14": "dinov2-giant", - "dinov2_vits14_1layer": "dinov2-small-imagenet1k-1-layer", - "dinov2_vitb14_1layer": "dinov2-base-imagenet1k-1-layer", - "dinov2_vitl14_1layer": "dinov2-large-imagenet1k-1-layer", - "dinov2_vitg14_1layer": "dinov2-giant-imagenet1k-1-layer", - } - - name = model_name_to_hf_name[model_name] - model.push_to_hub(f"facebook/{name}") - processor.push_to_hub(f"facebook/{name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dinov2_vitb14", - type=str, - choices=[ - "dinov2_vits14", - "dinov2_vitb14", - "dinov2_vitl14", - "dinov2_vitg14", - "dinov2_vits14_1layer", - "dinov2_vitb14_1layer", - "dinov2_vitl14_1layer", - "dinov2_vitg14_1layer", - ], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - - args = parser.parse_args() - convert_dinov2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py b/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py deleted file mode 100644 index 0ff2697f7466..000000000000 --- a/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py +++ /dev/null @@ -1,291 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
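One DINOv2-specific detail in the converter above: for the SwiGLU (vitg) variants, the original `w12`/`w3` MLP weight names are rewritten to the `weights_in`/`weights_out` names used by the Transformers implementation. A tiny sketch of that substring-based rewrite, on placeholder keys and shapes:

# Sketch of the w12/w3 -> weights_in/weights_out rename performed for the
# SwiGLU (vitg) variants above; keys and shapes are placeholders only.
import torch

state_dict = {
    "encoder.layer.0.mlp.w12.weight": torch.zeros(8192, 1536),
    "encoder.layer.0.mlp.w3.weight": torch.zeros(1536, 4096),
}

for key in list(state_dict.keys()):
    val = state_dict.pop(key)
    new_key = key.replace("w12", "weights_in").replace("w3", "weights_out")
    state_dict[new_key] = val

print(sorted(state_dict.keys()))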
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DINOv2 with Registers checkpoints from the original repository. - -URL: https://github.com/facebookresearch/dinov2/tree/main -""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -import torch.nn as nn -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import ( - BitImageProcessor, - Dinov2WithRegistersConfig, - Dinov2WithRegistersForImageClassification, - Dinov2WithRegistersModel, -) -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dinov2_with_registers_config(model_name, image_classifier=False): - config = Dinov2WithRegistersConfig(image_size=518, patch_size=14) - - # size of the architecture - if "vits" in model_name: - config.hidden_size = 384 - config.num_attention_heads = 6 - elif "vitb" in model_name: - pass - elif "vitl" in model_name: - config.hidden_size = 1024 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - elif "vitg" in model_name: - config.use_swiglu_ffn = True - config.hidden_size = 1536 - config.num_hidden_layers = 40 - config.num_attention_heads = 24 - else: - raise ValueError("Model not supported") - - if image_classifier: - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - config.num_labels = 1000 - config.id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - config.id2label = {int(k): v for k, v in config.id2label.items()} - - return config - - -def create_rename_keys(config): - rename_keys = [] - # fmt: off - - # patch embedding layer - rename_keys.append(("cls_token", "embeddings.cls_token")) - rename_keys.append(("mask_token", "embeddings.mask_token")) - rename_keys.append(("pos_embed", "embeddings.position_embeddings")) - rename_keys.append(("register_tokens", "embeddings.register_tokens")) - rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) - - for i in range(config.num_hidden_layers): - # layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) - # MLP - if config.use_swiglu_ffn: - rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) - rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) - rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) - rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) - else: - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) - # layerscale - rename_keys.append((f"blocks.{i}.ls1.gamma", 
f"encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) - # attention projection layer - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias")) - - # final layernorm - rename_keys.append(("norm.weight", "layernorm.weight")) - rename_keys.append(("norm.bias", "layernorm.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :] - state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image - - -@torch.no_grad() -def convert_dinov2_with_registers_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our Dinov2WithRegisters structure. 
- """ - - # define default Dinov2WithRegisters configuration - image_classifier = "1layer" in model_name - config = get_dinov2_with_registers_config(model_name, image_classifier=image_classifier) - - # load original model from torch hub - original_model = torch.hub.load("facebookresearch/dinov2", model_name.replace("_1layer", "")) - original_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config) - - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if "w12" in key: - key = key.replace("w12", "weights_in") - if "w3" in key: - key = key.replace("w3", "weights_out") - state_dict[key] = val - - # load HuggingFace model - if image_classifier: - model = Dinov2WithRegistersForImageClassification(config).eval() - model.dinov2_with_registers.load_state_dict(state_dict) - model_name_to_classifier_dict_url = { - "dinov2_vits14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_reg4_linear_head.pth", - "dinov2_vitb14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_reg4_linear_head.pth", - "dinov2_vitl14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_reg4_linear_head.pth", - "dinov2_vitg14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_reg4_linear_head.pth", - } - url = model_name_to_classifier_dict_url[model_name] - classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") - model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) - model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) - else: - model = Dinov2WithRegistersModel(config).eval() - model.load_state_dict(state_dict) - - # load image - image = prepare_img() - - # preprocess image - transformations = transforms.Compose( - [ - transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values - std=IMAGENET_DEFAULT_STD, # across a large photo dataset. 
- ), - ] - ) - - original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension - - processor = BitImageProcessor( - size={"shortest_edge": 256}, - resample=PILImageResampling.BICUBIC, - image_mean=IMAGENET_DEFAULT_MEAN, - image_std=IMAGENET_DEFAULT_STD, - ) - pixel_values = processor(image, return_tensors="pt").pixel_values - - assert torch.allclose(original_pixel_values, pixel_values) - - with torch.no_grad(): - outputs = model(pixel_values, output_hidden_states=True) - original_outputs = original_model(pixel_values) - - # assert values - if image_classifier: - print("Predicted class:") - class_idx = outputs.logits.argmax(-1).item() - print(model.config.id2label[class_idx]) - else: - assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape - assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model_name_to_hf_name = { - "dinov2_vits14_reg": "dinov2-with-registers-small", - "dinov2_vitb14_reg": "dinov2-with-registers-base", - "dinov2_vitl14_reg": "dinov2-with-registers-large", - "dinov2_vitg14_reg": "dinov2-with-registers-giant", - "dinov2_vits14_reg_1layer": "dinov2-with-registers-small-imagenet1k-1-layer", - "dinov2_vitb14_reg_1layer": "dinov2-with-registers-base-imagenet1k-1-layer", - "dinov2_vitl14_reg_1layer": "dinov2-with-registers-large-imagenet1k-1-layer", - "dinov2_vitg14_reg_1layer": "dinov2-with-registers-giant-imagenet1k-1-layer", - } - - name = model_name_to_hf_name[model_name] - model.push_to_hub(f"nielsr/{name}") - processor.push_to_hub(f"nielsr/{name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dinov2_vits14_reg", - type=str, - choices=[ - "dinov2_vits14_reg", - "dinov2_vitb14_reg", - "dinov2_vitl14_reg", - "dinov2_vitg14_reg", - "dinov2_vits14_reg_1layer", - "dinov2_vitb14_reg_1layer", - "dinov2_vitl14_reg_1layer", - "dinov2_vitg14_reg_1layer", - ], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - - args = parser.parse_args() - convert_dinov2_with_registers_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py b/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py deleted file mode 100644 index 40c5b22e3b9a..000000000000 --- a/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py +++ /dev/null @@ -1,230 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DiT checkpoints from the unilm repository.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import BeitConfig, BeitForImageClassification, BeitForMaskedImageModeling, BeitImageProcessor -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, has_lm_head=False, is_semantic=False): - prefix = "backbone." if is_semantic else "" - - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"{prefix}blocks.{i}.norm1.weight", f"beit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"beit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.weight", f"beit.encoder.layer.{i}.attention.output.dense.weight") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.bias", f"beit.encoder.layer.{i}.attention.output.dense.bias") - ) - rename_keys.append((f"{prefix}blocks.{i}.norm2.weight", f"beit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"beit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.weight", f"beit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.bias", f"beit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"beit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"beit.encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - (f"{prefix}cls_token", "beit.embeddings.cls_token"), - (f"{prefix}patch_embed.proj.weight", "beit.embeddings.patch_embeddings.projection.weight"), - (f"{prefix}patch_embed.proj.bias", "beit.embeddings.patch_embeddings.projection.bias"), - (f"{prefix}pos_embed", "beit.embeddings.position_embeddings"), - ] - ) - - if has_lm_head: - # mask token + layernorm - rename_keys.extend( - [ - ("mask_token", "beit.embeddings.mask_token"), - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ] - ) - else: - # layernorm + classification head - rename_keys.extend( - [ - ("fc_norm.weight", "beit.pooler.layernorm.weight"), - ("fc_norm.bias", "beit.pooler.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False): - for i in range(config.num_hidden_layers): - prefix = "backbone." 
if is_semantic else "" - # queries, keys and values - in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight") - q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias") - - state_dict[f"beit.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.query.bias"] = q_bias - state_dict[f"beit.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.value.bias"] = v_bias - - # gamma_1 and gamma_2 - # we call them lambda because otherwise they are renamed when using .from_pretrained - gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1") - gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2") - - state_dict[f"beit.encoder.layer.{i}.lambda_1"] = gamma_1 - state_dict[f"beit.encoder.layer.{i}.lambda_2"] = gamma_2 - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dit_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our BEiT structure. - """ - - # define default BEiT configuration - has_lm_head = False if "rvlcdip" in checkpoint_url else True - config = BeitConfig(use_absolute_position_embeddings=True, use_mask_token=has_lm_head) - - # size of the architecture - if "large" in checkpoint_url or "dit-l" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - - # labels - if "rvlcdip" in checkpoint_url: - config.num_labels = 16 - repo_id = "huggingface/label-files" - filename = "rvlcdip-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load state_dict of original model, remove and rename some keys - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] - - rename_keys = create_rename_keys(config, has_lm_head=has_lm_head) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, has_lm_head=has_lm_head) - - # load HuggingFace model - model = BeitForMaskedImageModeling(config) if has_lm_head else BeitForImageClassification(config) - model.eval() - model.load_state_dict(state_dict) - - # Check outputs on an image - image_processor = BeitImageProcessor( - size=config.image_size, resample=PILImageResampling.BILINEAR, do_center_crop=False - ) - image = prepare_img() - - encoding = image_processor(images=image, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - outputs = model(pixel_values) - logits = outputs.logits - - # verify logits - expected_shape = [1, 16] if "rvlcdip" in checkpoint_url else [1, 196, 8192] - assert logits.shape == torch.Size(expected_shape), "Shape of logits not as expected" - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving 
model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - if has_lm_head: - model_name = "dit-base" if "base" in checkpoint_url else "dit-large" - else: - model_name = "dit-base-finetuned-rvlcdip" if "dit-b" in checkpoint_url else "dit-large-finetuned-rvlcdip" - image_processor.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add image processor", - use_temp_dir=True, - ) - model.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add model", - use_temp_dir=True, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_url", - default="https://layoutlm.blob.core.windows.net/dit/dit-pts/dit-base-224-p16-500k-62d53a.pth", - type=str, - help="URL to the original PyTorch checkpoint (.pth file).", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - ) - args = parser.parse_args() - convert_dit_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py deleted file mode 100644 index f6f14f6d08e3..000000000000 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ /dev/null @@ -1,234 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Donut checkpoints using the original `donut-python` library. 
URL: https://github.com/clovaai/donut""" - -import argparse - -import torch -from datasets import load_dataset -from donut import DonutModel - -from transformers import ( - DonutImageProcessor, - DonutProcessor, - DonutSwinConfig, - DonutSwinModel, - MBartConfig, - MBartForCausalLM, - VisionEncoderDecoderModel, - XLMRobertaTokenizerFast, -) - - -def get_configs(model): - original_config = model.config - - encoder_config = DonutSwinConfig( - image_size=original_config.input_size, - patch_size=4, - depths=original_config.encoder_layer, - num_heads=[4, 8, 16, 32], - window_size=original_config.window_size, - embed_dim=128, - ) - decoder_config = MBartConfig( - is_decoder=True, - is_encoder_decoder=False, - add_cross_attention=True, - decoder_layers=original_config.decoder_layer, - max_position_embeddings=original_config.max_position_embeddings, - vocab_size=len( - model.decoder.tokenizer - ), # several special tokens are added to the vocab of XLMRobertaTokenizer, see repo on the hub (added_tokens.json) - scale_embedding=True, - add_final_layer_norm=True, - ) - - return encoder_config, decoder_config - - -def rename_key(name): - if "encoder.model" in name: - name = name.replace("encoder.model", "encoder") - if "decoder.model" in name: - name = name.replace("decoder.model", "decoder") - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") - if "patch_embed.norm" in name: - name = name.replace("patch_embed.norm", "embeddings.norm") - if name.startswith("encoder"): - if "layers" in name: - name = "encoder." + name - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if "attn" in name and "mask" not in name: - name = name.replace("attn", "attention.self") - if "norm1" in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name: - name = name.replace("norm2", "layernorm_after") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") - - if name == "encoder.norm.weight": - name = "encoder.layernorm.weight" - if name == "encoder.norm.bias": - name = "encoder.layernorm.bias" - - return name - - -def convert_state_dict(orig_state_dict, model): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - key_split = key.split(".") - layer_num = int(key_split[3]) - block_num = int(key_split[5]) - dim = model.encoder.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size - - if "weight" in key: - orig_state_dict[ - f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight" - ] = val[:dim, :] - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"] = ( - val[dim : dim * 2, :] - ) - orig_state_dict[ - f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight" - ] = val[-dim:, :] - else: - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"] = ( - val[:dim] - ) - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"] = ( - val[dim : dim * 2] - ) - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"] = ( - val[-dim:] - ) - elif "attn_mask" in key or key in ["encoder.model.norm.weight", "encoder.model.norm.bias"]: - # HuggingFace implementation doesn't use attn_mask buffer - # and model doesn't use final 
LayerNorms for the encoder - pass - else: - orig_state_dict[rename_key(key)] = val - - return orig_state_dict - - -def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - # load original model - original_model = DonutModel.from_pretrained(model_name).eval() - - # load HuggingFace model - encoder_config, decoder_config = get_configs(original_model) - encoder = DonutSwinModel(encoder_config) - decoder = MBartForCausalLM(decoder_config) - model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder) - model.eval() - - state_dict = original_model.state_dict() - new_state_dict = convert_state_dict(state_dict, model) - model.load_state_dict(new_state_dict) - - # verify results on scanned document - dataset = load_dataset("hf-internal-testing/example-documents") # no-script - image = dataset["test"][0]["image"].convert("RGB") - - tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True) - image_processor = DonutImageProcessor( - do_align_long_axis=original_model.config.align_long_axis, size=original_model.config.input_size[::-1] - ) - processor = DonutProcessor(image_processor, tokenizer) - pixel_values = processor(image, return_tensors="pt").pixel_values - - if model_name == "naver-clova-ix/donut-base-finetuned-docvqa": - task_prompt = "{user_input}" - question = "When is the coffee break?" - task_prompt = task_prompt.replace("{user_input}", question) - elif model_name == "naver-clova-ix/donut-base-finetuned-rvlcdip": - task_prompt = "" - elif model_name in [ - "naver-clova-ix/donut-base-finetuned-cord-v1", - "naver-clova-ix/donut-base-finetuned-cord-v1-2560", - ]: - task_prompt = "" - elif model_name == "naver-clova-ix/donut-base-finetuned-cord-v2": - task_prompt = "s_cord-v2>" - elif model_name == "naver-clova-ix/donut-base-finetuned-zhtrainticket": - task_prompt = "" - elif model_name in ["naver-clova-ix/donut-proto", "naver-clova-ix/donut-base"]: - # use a random prompt - task_prompt = "hello world" - else: - raise ValueError("Model name not supported") - prompt_tensors = original_model.decoder.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")[ - "input_ids" - ] - - original_patch_embed = original_model.encoder.model.patch_embed(pixel_values) - patch_embeddings, _ = model.encoder.embeddings(pixel_values) - assert torch.allclose(original_patch_embed, patch_embeddings, atol=1e-3) - - # verify encoder hidden states - original_last_hidden_state = original_model.encoder(pixel_values) - last_hidden_state = model.encoder(pixel_values).last_hidden_state - assert torch.allclose(original_last_hidden_state, last_hidden_state, atol=1e-2) - - # verify decoder hidden states - original_logits = original_model(pixel_values, prompt_tensors, None).logits - logits = model(pixel_values, decoder_input_ids=prompt_tensors).logits - assert torch.allclose(original_logits, logits, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model") - processor.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="naver-clova-ix/donut-base-finetuned-docvqa", - required=False, - 
type=str, - help="Name of the original model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - required=False, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether or not to push the converted model and processor to the đŸ€— hub.", - ) - - args = parser.parse_args() - convert_donut_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py b/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py deleted file mode 100644 index c11345d1eb4e..000000000000 --- a/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import collections -from pathlib import Path - -import torch -from torch.serialization import default_restore_location - -from transformers import BertConfig, DPRConfig, DPRContextEncoder, DPRQuestionEncoder, DPRReader - - -CheckpointState = collections.namedtuple( - "CheckpointState", ["model_dict", "optimizer_dict", "scheduler_dict", "offset", "epoch", "encoder_params"] -) - - -def load_states_from_checkpoint(model_file: str) -> CheckpointState: - print(f"Reading saved model from {model_file}") - state_dict = torch.load(model_file, map_location=lambda s, l: default_restore_location(s, "cpu")) - return CheckpointState(**state_dict) - - -class DPRState: - def __init__(self, src_file: Path): - self.src_file = src_file - - def load_dpr_model(self): - raise NotImplementedError - - @staticmethod - def from_type(comp_type: str, *args, **kwargs) -> "DPRState": - if comp_type.startswith("c"): - return DPRContextEncoderState(*args, **kwargs) - if comp_type.startswith("q"): - return DPRQuestionEncoderState(*args, **kwargs) - if comp_type.startswith("r"): - return DPRReaderState(*args, **kwargs) - else: - raise ValueError("Component type must be either 'ctx_encoder', 'question_encoder' or 'reader'.") - - -class DPRContextEncoderState(DPRState): - def load_dpr_model(self): - model = DPRContextEncoder(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0])) - print(f"Loading DPR biencoder from {self.src_file}") - saved_state = load_states_from_checkpoint(self.src_file) - encoder, prefix = model.ctx_encoder, "ctx_model." - # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3 - state_dict = {"bert_model.embeddings.position_ids": model.ctx_encoder.bert_model.embeddings.position_ids} - for key, value in saved_state.model_dict.items(): - if key.startswith(prefix): - key = key[len(prefix) :] - if not key.startswith("encode_proj."): - key = "bert_model." 
+ key - state_dict[key] = value - encoder.load_state_dict(state_dict) - return model - - -class DPRQuestionEncoderState(DPRState): - def load_dpr_model(self): - model = DPRQuestionEncoder(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0])) - print(f"Loading DPR biencoder from {self.src_file}") - saved_state = load_states_from_checkpoint(self.src_file) - encoder, prefix = model.question_encoder, "question_model." - # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3 - state_dict = {"bert_model.embeddings.position_ids": model.question_encoder.bert_model.embeddings.position_ids} - for key, value in saved_state.model_dict.items(): - if key.startswith(prefix): - key = key[len(prefix) :] - if not key.startswith("encode_proj."): - key = "bert_model." + key - state_dict[key] = value - encoder.load_state_dict(state_dict) - return model - - -class DPRReaderState(DPRState): - def load_dpr_model(self): - model = DPRReader(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0])) - print(f"Loading DPR reader from {self.src_file}") - saved_state = load_states_from_checkpoint(self.src_file) - # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3 - state_dict = { - "encoder.bert_model.embeddings.position_ids": model.span_predictor.encoder.bert_model.embeddings.position_ids - } - for key, value in saved_state.model_dict.items(): - if key.startswith("encoder.") and not key.startswith("encoder.encode_proj"): - key = "encoder.bert_model." + key[len("encoder.") :] - state_dict[key] = value - model.span_predictor.load_state_dict(state_dict) - return model - - -def convert(comp_type: str, src_file: Path, dest_dir: Path): - dest_dir = Path(dest_dir) - dest_dir.mkdir(exist_ok=True) - - dpr_state = DPRState.from_type(comp_type, src_file=src_file) - model = dpr_state.load_dpr_model() - model.save_pretrained(dest_dir) - model.from_pretrained(dest_dir) # sanity check - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--type", type=str, help="Type of the component to convert: 'ctx_encoder', 'question_encoder' or 'reader'." - ) - parser.add_argument( - "--src", - type=str, - help=( - "Path to the dpr checkpoint file. They can be downloaded from the official DPR repo" - " https://github.com/facebookresearch/DPR. Note that in the official repo, both encoders are stored in the" - " 'retriever' checkpoints." - ), - ) - parser.add_argument("--dest", type=str, default=None, help="Path to the output PyTorch model directory.") - args = parser.parse_args() - - src_file = Path(args.src) - dest_dir = f"converted-{src_file.name}" if args.dest is None else args.dest - dest_dir = Path(dest_dir) - assert src_file.exists() - assert ( - args.type is not None - ), "Please specify the component type of the DPR model to convert: 'ctx_encoder', 'question_encoder' or 'reader'." - convert(args.type, src_file, dest_dir) diff --git a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py deleted file mode 100644 index 367aff7f90e1..000000000000 --- a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py +++ /dev/null @@ -1,383 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DINOv2 + DPT checkpoints from the original repository. URL: -https://github.com/facebookresearch/dinov2/tree/main""" - -import argparse -import itertools -import math -from pathlib import Path - -import requests -import torch -from PIL import Image -from torchvision import transforms - -from transformers import Dinov2Config, DPTConfig, DPTForDepthEstimation, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(model_name): - if "small" in model_name: - # equivalent to stage 3, stage 6, stage 9, stage 12 - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-small", out_indices=[3, 6, 9, 12], apply_layernorm=False, reshape_hidden_states=False - ) - neck_hidden_sizes = [48, 96, 192, 384] - elif "base" in model_name: - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-base", out_indices=[3, 6, 9, 12], apply_layernorm=False, reshape_hidden_states=False - ) - neck_hidden_sizes = [96, 192, 384, 768] - elif "large" in model_name: - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-large", out_indices=[5, 12, 18, 24], apply_layernorm=False, reshape_hidden_states=False - ) - neck_hidden_sizes = [128, 256, 512, 1024] - elif "giant" in model_name: - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-giant", out_indices=[10, 20, 30, 40], apply_layernorm=False, reshape_hidden_states=False - ) - neck_hidden_sizes = [192, 384, 768, 1536] - else: - raise NotImplementedError("To do") - - config = DPTConfig( - backbone_config=backbone_config, - neck_hidden_sizes=neck_hidden_sizes, - use_bias_in_fusion_residual=False, - add_projection=True, - ) - - return config - - -# here we list all DPT keys to be renamed (original name on the left, our name on the right) -def create_rename_keys_dpt(config): - rename_keys = [] - - # fmt: off - # activation postprocessing (projections, readout projections + resize blocks) - for i in range(4): - rename_keys.append((f"decode_head.reassemble_blocks.projects.{i}.conv.weight", f"neck.reassemble_stage.layers.{i}.projection.weight")) - rename_keys.append((f"decode_head.reassemble_blocks.projects.{i}.conv.bias", f"neck.reassemble_stage.layers.{i}.projection.bias")) - - rename_keys.append((f"decode_head.reassemble_blocks.readout_projects.{i}.0.weight", f"neck.reassemble_stage.readout_projects.{i}.0.weight")) - rename_keys.append((f"decode_head.reassemble_blocks.readout_projects.{i}.0.bias", f"neck.reassemble_stage.readout_projects.{i}.0.bias")) - - if i != 2: - rename_keys.append((f"decode_head.reassemble_blocks.resize_layers.{i}.weight", f"neck.reassemble_stage.layers.{i}.resize.weight")) - rename_keys.append((f"decode_head.reassemble_blocks.resize_layers.{i}.bias", f"neck.reassemble_stage.layers.{i}.resize.bias")) - - # fusion layers - for i in range(4): - rename_keys.append((f"decode_head.fusion_blocks.{i}.project.conv.weight", f"neck.fusion_stage.layers.{i}.projection.weight")) - rename_keys.append((f"decode_head.fusion_blocks.{i}.project.conv.bias", 
f"neck.fusion_stage.layers.{i}.projection.bias")) - if i != 0: - rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit1.conv1.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer1.convolution1.weight")) - rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit1.conv2.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer1.convolution2.weight")) - rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit2.conv1.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer2.convolution1.weight")) - rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit2.conv2.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer2.convolution2.weight")) - - # neck convolutions - for i in range(4): - rename_keys.append((f"decode_head.convs.{i}.conv.weight", f"neck.convs.{i}.weight")) - - # head - rename_keys.append(("decode_head.project.conv.weight", "head.projection.weight")) - rename_keys.append(("decode_head.project.conv.bias", "head.projection.bias")) - - for i in range(0, 5, 2): - rename_keys.append((f"decode_head.conv_depth.head.{i}.weight", f"head.head.{i}.weight")) - rename_keys.append((f"decode_head.conv_depth.head.{i}.bias", f"head.head.{i}.bias")) - # fmt: on - - return rename_keys - - -# here we list all backbone keys to be renamed (original name on the left, our name on the right) -def create_rename_keys_backbone(config): - rename_keys = [] - - # fmt: off - # patch embedding layer - rename_keys.append(("cls_token", "backbone.embeddings.cls_token")) - rename_keys.append(("mask_token", "backbone.embeddings.mask_token")) - rename_keys.append(("pos_embed", "backbone.embeddings.position_embeddings")) - rename_keys.append(("patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - - # Transfomer encoder - for i in range(config.backbone_config.num_hidden_layers): - # layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.norm2.bias")) - # MLP - if config.backbone_config.use_swiglu_ffn: - rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"backbone.encoder.layer.{i}.mlp.w12.weight")) - rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"backbone.encoder.layer.{i}.mlp.w12.bias")) - rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"backbone.encoder.layer.{i}.mlp.w3.weight")) - rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"backbone.encoder.layer.{i}.mlp.w3.bias")) - else: - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.mlp.fc2.bias")) - # layerscale - rename_keys.append((f"blocks.{i}.ls1.gamma", f"backbone.encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"blocks.{i}.ls2.gamma", f"backbone.encoder.layer.{i}.layer_scale2.lambda1")) - # attention projection layer - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight")) - 
rename_keys.append((f"blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias")) - # fmt: on - - rename_keys.append(("norm.weight", "backbone.layernorm.weight")) - rename_keys.append(("norm.bias", "backbone.layernorm.bias")) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.backbone_config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - hidden_size = config.backbone_config.hidden_size - # next, add query, keys and values (in that order) to the state dict - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[:hidden_size] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - hidden_size : hidden_size * 2 - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-hidden_size:] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "https://dl.fbaipublicfiles.com/dinov2/images/example.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -name_to_url = { - "dpt-dinov2-small-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_nyu_dpt_head.pth", - "dpt-dinov2-small-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_kitti_dpt_head.pth", - "dpt-dinov2-base-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_nyu_dpt_head.pth", - "dpt-dinov2-base-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_kitti_dpt_head.pth", - "dpt-dinov2-large-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_nyu_dpt_head.pth", - "dpt-dinov2-large-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_kitti_dpt_head.pth", - "dpt-dinov2-giant-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_nyu_dpt_head.pth", - "dpt-dinov2-giant-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_kitti_dpt_head.pth", -} - - -def get_original_pixel_values(image): - class CenterPadding: - def __init__(self, multiple): - super().__init__() - self.multiple = multiple - - def _get_pad(self, size): - new_size = math.ceil(size / self.multiple) * self.multiple - pad_size = new_size - size - pad_size_left = pad_size // 2 - pad_size_right = pad_size - pad_size_left - return pad_size_left, pad_size_right - - def __call__(self, img): - pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in img.shape[-2:][::-1])) - output = torch.nn.functional.pad(img, pads) - return output - - def __repr__(self): - return self.__class__.__name__ + "()" - - def make_depth_transform() -> transforms.Compose: - return transforms.Compose( - [ - transforms.ToTensor(), - lambda x: 255.0 * x[:3], # Discard alpha component and scale by 255 - transforms.Normalize( - 
mean=(123.675, 116.28, 103.53), - std=(58.395, 57.12, 57.375), - ), - CenterPadding(multiple=14), - ] - ) - - transform = make_depth_transform() - original_pixel_values = transform(image).unsqueeze(0) - - return original_pixel_values - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits): - """ - Copy/paste/tweak model's weights to our DPT structure. - """ - - # define DPT configuration based on URL - checkpoint_url = name_to_url[model_name] - config = get_dpt_config(model_name) - - # load original DPT state_dict from URL - print("URL:", checkpoint_url) - dpt_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["state_dict"] - # rename keys - rename_keys = create_rename_keys_dpt(config) - for src, dest in rename_keys: - rename_key(dpt_state_dict, src, dest) - - # load original backbone state_dict from URL - if "small" in model_name: - original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14") - elif "base" in model_name: - original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14") - elif "large" in model_name: - original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitl14") - elif "giant" in model_name: - original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitg14") - else: - raise NotImplementedError("To do") - original_model.eval() - backbone_state_dict = original_model.state_dict() - - # rename keys - rename_keys = create_rename_keys_backbone(config) - for src, dest in rename_keys: - rename_key(backbone_state_dict, src, dest) - - # read in qkv matrices - read_in_q_k_v(backbone_state_dict, config) - - for key, val in backbone_state_dict.copy().items(): - val = backbone_state_dict.pop(key) - if "w12" in key: - key = key.replace("w12", "weights_in") - if "w3" in key: - key = key.replace("w3", "weights_out") - backbone_state_dict[key] = val - - # merge state_dicts - state_dict = {**backbone_state_dict, **dpt_state_dict} - - # load HuggingFace model - model = DPTForDepthEstimation(config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - assert missing_keys == [ - "neck.fusion_stage.layers.0.residual_layer1.convolution1.weight", - "neck.fusion_stage.layers.0.residual_layer1.convolution2.weight", - ] - model.eval() - - # Verify image processor - processor = DPTImageProcessor( - do_resize=False, - do_rescale=False, - do_pad=True, - size_divisor=14, - do_normalize=True, - image_mean=(123.675, 116.28, 103.53), - image_std=(58.395, 57.12, 57.375), - ) - - image = prepare_img() - pixel_values = processor(image, return_tensors="pt").pixel_values.float() - original_pixel_values = get_original_pixel_values(image) - - assert torch.allclose(pixel_values, original_pixel_values) - - # Verify forward pass - with torch.no_grad(): - outputs = model(pixel_values) - - predicted_depth = outputs.predicted_depth - - print("Shape of predicted depth:", predicted_depth.shape) - print("First values of predicted depth:", predicted_depth[0, :3, :3]) - - # assert logits - if verify_logits: - if model_name == "dpt-dinov2-small-nyu": - expected_shape = torch.Size([1, 576, 736]) - expected_slice = torch.tensor( - [[3.3576, 3.4741, 3.4345], [3.4324, 3.5012, 3.2775], [3.2560, 3.3563, 3.2354]] - ) - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-5) - print("Looks ok!") - 
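
For reference, the converters deleted in this patch all rely on the same verification idiom seen just above: run the converted model once and compare a small slice of its output against hard-coded reference values with an explicit tolerance, since the original and ported implementations can differ slightly in op ordering. The following is a minimal, self-contained sketch of that check; the helper name check_against_reference and every tensor value in it are illustrative assumptions, not taken from any checkpoint.

import torch

def check_against_reference(predicted: torch.Tensor, expected_shape: torch.Size,
                            expected_slice: torch.Tensor, atol: float = 1e-5) -> None:
    # Shape must match exactly; values only need to agree within `atol`.
    assert predicted.shape == expected_shape, f"unexpected shape {predicted.shape}"
    assert torch.allclose(predicted[0, :3, :3], expected_slice, atol=atol), "values drifted"
    print("Looks ok!")

# Toy usage with fabricated numbers; a real run would pass the converted model's output
# and the reference slice recorded from the original implementation.
pred = torch.zeros(1, 576, 736)
pred[0, :3, :3] = torch.tensor([[3.0, 3.1, 3.2], [3.3, 3.4, 3.5], [3.6, 3.7, 3.8]])
check_against_reference(pred, torch.Size([1, 576, 736]), pred[0, :3, :3].clone())
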
- if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"facebook/{model_name}") - processor.push_to_hub(repo_id=f"facebook/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dpt-dinov2-small-nyu", - type=str, - choices=name_to_url.keys(), - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - parser.add_argument( - "--verify_logits", - action="store_true", - required=False, - help="Path to the output PyTorch model directory.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits) diff --git a/src/transformers/models/dpt/convert_dpt_beit_to_hf.py b/src/transformers/models/dpt/convert_dpt_beit_to_hf.py deleted file mode 100644 index 3a576d772f57..000000000000 --- a/src/transformers/models/dpt/convert_dpt_beit_to_hf.py +++ /dev/null @@ -1,305 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DPT 3.1 checkpoints from the MiDaS repository. 
URL: https://github.com/isl-org/MiDaS""" - -import argparse -from pathlib import Path - -import requests -import torch -from PIL import Image - -from transformers import BeitConfig, DPTConfig, DPTForDepthEstimation, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(model_name): - hidden_size = 768 - num_hidden_layers = 12 - num_attention_heads = 12 - intermediate_size = 3072 - out_features = ["stage3", "stage6", "stage9", "stage12"] # beit-base-384 uses [2, 5, 8, 11] - - if "large" in model_name: - hidden_size = 1024 - num_hidden_layers = 24 - num_attention_heads = 16 - intermediate_size = 4096 - out_features = ["stage6", "stage12", "stage18", "stage24"] # beit-large-512 uses [5, 11, 17, 23] - - if "512" in model_name: - image_size = 512 - elif "384" in model_name: - image_size = 384 - else: - raise ValueError("Model not supported") - - backbone_config = BeitConfig( - image_size=image_size, - num_hidden_layers=num_hidden_layers, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_attention_heads=num_attention_heads, - use_relative_position_bias=True, - reshape_hidden_states=False, - out_features=out_features, - ) - - neck_hidden_sizes = [256, 512, 1024, 1024] if "large" in model_name else [96, 192, 384, 768] - config = DPTConfig(backbone_config=backbone_config, neck_hidden_sizes=neck_hidden_sizes) - - return config, image_size - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - - # fmt: off - # stem - rename_keys.append(("pretrained.model.cls_token", "backbone.embeddings.cls_token")) - rename_keys.append(("pretrained.model.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("pretrained.model.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - - # Transfomer encoder - for i in range(config.backbone_config.num_hidden_layers): - rename_keys.append((f"pretrained.model.blocks.{i}.gamma_1", f"backbone.encoder.layer.{i}.lambda_1")) - rename_keys.append((f"pretrained.model.blocks.{i}.gamma_2", f"backbone.encoder.layer.{i}.lambda_2")) - rename_keys.append((f"pretrained.model.blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.output.dense.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias")) - 
rename_keys.append((f"pretrained.model.blocks.{i}.attn.relative_position_bias_table", f"backbone.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table")) - rename_keys.append((f"pretrained.model.blocks.{i}.attn.relative_position_index", f"backbone.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index")) - - # activation postprocessing (readout projections + resize blocks) - for i in range(4): - rename_keys.append((f"pretrained.act_postprocess{i+1}.0.project.0.weight", f"neck.reassemble_stage.readout_projects.{i}.0.weight")) - rename_keys.append((f"pretrained.act_postprocess{i+1}.0.project.0.bias", f"neck.reassemble_stage.readout_projects.{i}.0.bias")) - - rename_keys.append((f"pretrained.act_postprocess{i+1}.3.weight", f"neck.reassemble_stage.layers.{i}.projection.weight")) - rename_keys.append((f"pretrained.act_postprocess{i+1}.3.bias", f"neck.reassemble_stage.layers.{i}.projection.bias")) - - if i != 2: - rename_keys.append((f"pretrained.act_postprocess{i+1}.4.weight", f"neck.reassemble_stage.layers.{i}.resize.weight")) - rename_keys.append((f"pretrained.act_postprocess{i+1}.4.bias", f"neck.reassemble_stage.layers.{i}.resize.bias")) - - # refinenet (tricky here) - mapping = {1:3, 2:2, 3:1, 4:0} - - for i in range(1, 5): - j = mapping[i] - rename_keys.append((f"scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight")) - rename_keys.append((f"scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias")) - - # scratch convolutions - for i in range(4): - rename_keys.append((f"scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight")) - - # head - for i in range(0, 5, 2): - rename_keys.append((f"scratch.output_conv.{i}.weight", f"head.head.{i}.weight")) - rename_keys.append((f"scratch.output_conv.{i}.bias", f"head.head.{i}.bias")) - - return rename_keys - - -def remove_ignore_keys_(state_dict): - ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - hidden_size = config.backbone_config.hidden_size - for i in range(config.backbone_config.num_hidden_layers): - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - 
in_proj_weight = state_dict.pop(f"pretrained.model.blocks.{i}.attn.qkv.weight") - q_bias = state_dict.pop(f"pretrained.model.blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"pretrained.model.blocks.{i}.attn.v_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = q_bias - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = v_bias - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): - """ - Copy/paste/tweak model's weights to our DPT structure. - """ - - name_to_url = { - "dpt-beit-large-512": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt", - "dpt-beit-large-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt", - "dpt-beit-base-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt", - } - - # define DPT configuration based on URL - checkpoint_url = name_to_url[model_name] - config, image_size = get_dpt_config(model_name) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - # remove certain keys - remove_ignore_keys_(state_dict) - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - # read in qkv matrices - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = DPTForDepthEstimation(config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - assert missing_keys == [] - # assert unexpected_keys == ["pretrained.model.fc_norm.weight", "pretrained.model.fc_norm.bias"] - model.eval() - - # Check outputs on an image - # We set `keep_aspect_ratio=False` as our current BEiT does not support arbitrary window sizes - processor = DPTImageProcessor( - size={"height": image_size, "width": image_size}, keep_aspect_ratio=False, ensure_multiple_of=32 - ) - - image = prepare_img() - pixel_values = processor(image, return_tensors="pt").pixel_values - - print("First values of pixel values:", pixel_values[0, 0, :3, :3]) - print("Mean of pixel values:", pixel_values.mean().item()) - print("Shape of pixel values:", pixel_values.shape) - - import requests - from PIL import Image - from torchvision import transforms - - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - transforms = transforms.Compose( - [ - transforms.Resize((image_size, image_size)), - transforms.ToTensor(), - ] - ) - pixel_values = transforms(image).unsqueeze(0) - - # forward pass - with torch.no_grad(): - outputs = model(pixel_values) - - predicted_depth = outputs.predicted_depth - - print("Shape of 
predicted depth:", predicted_depth.shape) - print("First values of predicted depth:", predicted_depth[0, :3, :3]) - - # assert logits - # TODO there's still a small difference with the original logits - if model_name == "dpt-beit-large-512": - # OK, checked - expected_shape = torch.Size([1, 512, 512]) - expected_slice = torch.tensor( - [[2804.6260, 2792.5708, 2812.9263], [2772.0288, 2780.1118, 2796.2529], [2748.1094, 2766.6558, 2766.9834]] - ) - elif model_name == "dpt-beit-large-384": - # OK, checked - expected_shape = torch.Size([1, 384, 384]) - expected_slice = torch.tensor( - [[1783.2273, 1780.5729, 1792.6453], [1759.9817, 1765.5359, 1778.5002], [1739.1633, 1754.7903, 1757.1990]], - ) - elif model_name == "dpt-beit-base-384": - # OK, checked - expected_shape = torch.Size([1, 384, 384]) - expected_slice = torch.tensor( - [[2898.4482, 2891.3750, 2904.8079], [2858.6685, 2877.2615, 2894.4507], [2842.1235, 2854.1023, 2861.6328]], - ) - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"nielsr/{model_name}") - processor.push_to_hub(repo_id=f"nielsr/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dpt-beit-large-512", - type=str, - choices=["dpt-beit-large-512", "dpt-beit-large-384", "dpt-beit-base-384"], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py deleted file mode 100644 index 16e4d71212b5..000000000000 --- a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py +++ /dev/null @@ -1,315 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DPT checkpoints from the original repository. 
URL: https://github.com/isl-org/DPT""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(checkpoint_url): - config = DPTConfig(embedding_type="hybrid") - - if "large" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - config.backbone_out_indices = [5, 11, 17, 23] - config.neck_hidden_sizes = [256, 512, 1024, 1024] - expected_shape = (1, 384, 384) - - if "nyu" in checkpoint_url or "midas" in checkpoint_url: - config.hidden_size = 768 - config.reassemble_factors = [1, 1, 1, 0.5] - config.neck_hidden_sizes = [256, 512, 768, 768] - config.num_labels = 150 - config.patch_size = 16 - expected_shape = (1, 384, 384) - config.use_batch_norm_in_fusion_residual = False - config.readout_type = "project" - - if "ade" in checkpoint_url: - config.use_batch_norm_in_fusion_residual = True - config.hidden_size = 768 - config.reassemble_stage = [1, 1, 1, 0.5] - config.num_labels = 150 - config.patch_size = 16 - repo_id = "huggingface/label-files" - filename = "ade20k-id2label.json" - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - expected_shape = [1, 150, 480, 480] - - return config, expected_shape - - -def remove_ignore_keys_(state_dict): - ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(name): - if ( - "pretrained.model" in name - and "cls_token" not in name - and "pos_embed" not in name - and "patch_embed" not in name - ): - name = name.replace("pretrained.model", "dpt.encoder") - if "pretrained.model" in name: - name = name.replace("pretrained.model", "dpt.embeddings") - if "patch_embed" in name: - name = name.replace("patch_embed", "") - if "pos_embed" in name: - name = name.replace("pos_embed", "position_embeddings") - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if "proj" in name and "project" not in name: - name = name.replace("proj", "projection") - if "blocks" in name: - name = name.replace("blocks", "layer") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") - if "norm1" in name and "backbone" not in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name and "backbone" not in name: - name = name.replace("norm2", "layernorm_after") - if "scratch.output_conv" in name: - name = name.replace("scratch.output_conv", "head") - if "scratch" in name: - name = name.replace("scratch", "neck") - if "layer1_rn" in name: - name = name.replace("layer1_rn", "convs.0") - if "layer2_rn" in name: - name = name.replace("layer2_rn", "convs.1") - if "layer3_rn" in name: - name = name.replace("layer3_rn", "convs.2") - if "layer4_rn" in name: - name = name.replace("layer4_rn", "convs.3") - if "refinenet" in name: - layer_idx = int(name[len("neck.refinenet") : len("neck.refinenet") + 1]) - # tricky here: we need to map 4 to 0, 3 to 
1, 2 to 2 and 1 to 3 - name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx-4)}") - if "out_conv" in name: - name = name.replace("out_conv", "projection") - if "resConfUnit1" in name: - name = name.replace("resConfUnit1", "residual_layer1") - if "resConfUnit2" in name: - name = name.replace("resConfUnit2", "residual_layer2") - if "conv1" in name: - name = name.replace("conv1", "convolution1") - if "conv2" in name: - name = name.replace("conv2", "convolution2") - # readout blocks - if "pretrained.act_postprocess1.0.project.0" in name: - name = name.replace("pretrained.act_postprocess1.0.project.0", "neck.reassemble_stage.readout_projects.0.0") - if "pretrained.act_postprocess2.0.project.0" in name: - name = name.replace("pretrained.act_postprocess2.0.project.0", "neck.reassemble_stage.readout_projects.1.0") - if "pretrained.act_postprocess3.0.project.0" in name: - name = name.replace("pretrained.act_postprocess3.0.project.0", "neck.reassemble_stage.readout_projects.2.0") - if "pretrained.act_postprocess4.0.project.0" in name: - name = name.replace("pretrained.act_postprocess4.0.project.0", "neck.reassemble_stage.readout_projects.3.0") - - # resize blocks - if "pretrained.act_postprocess1.3" in name: - name = name.replace("pretrained.act_postprocess1.3", "neck.reassemble_stage.layers.0.projection") - if "pretrained.act_postprocess1.4" in name: - name = name.replace("pretrained.act_postprocess1.4", "neck.reassemble_stage.layers.0.resize") - if "pretrained.act_postprocess2.3" in name: - name = name.replace("pretrained.act_postprocess2.3", "neck.reassemble_stage.layers.1.projection") - if "pretrained.act_postprocess2.4" in name: - name = name.replace("pretrained.act_postprocess2.4", "neck.reassemble_stage.layers.1.resize") - if "pretrained.act_postprocess3.3" in name: - name = name.replace("pretrained.act_postprocess3.3", "neck.reassemble_stage.layers.2.projection") - if "pretrained.act_postprocess4.3" in name: - name = name.replace("pretrained.act_postprocess4.3", "neck.reassemble_stage.layers.3.projection") - if "pretrained.act_postprocess4.4" in name: - name = name.replace("pretrained.act_postprocess4.4", "neck.reassemble_stage.layers.3.resize") - if "pretrained" in name: - name = name.replace("pretrained", "dpt") - if "bn" in name: - name = name.replace("bn", "batch_norm") - if "head" in name: - name = name.replace("head", "head.head") - if "encoder.norm" in name: - name = name.replace("encoder.norm", "layernorm") - if "auxlayer" in name: - name = name.replace("auxlayer", "auxiliary_head.head") - if "backbone" in name: - name = name.replace("backbone", "backbone.bit.encoder") - - if ".." 
in name: - name = name.replace("..", ".") - - if "stem.conv" in name: - name = name.replace("stem.conv", "bit.embedder.convolution") - if "blocks" in name: - name = name.replace("blocks", "layers") - if "convolution" in name and "backbone" in name: - name = name.replace("convolution", "conv") - if "layer" in name and "backbone" in name: - name = name.replace("layer", "layers") - if "backbone.bit.encoder.bit" in name: - name = name.replace("backbone.bit.encoder.bit", "backbone.bit") - if "embedder.conv" in name: - name = name.replace("embedder.conv", "embedder.convolution") - if "backbone.bit.encoder.stem.norm" in name: - name = name.replace("backbone.bit.encoder.stem.norm", "backbone.bit.embedder.norm") - return name - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub, model_name, show_prediction): - """ - Copy/paste/tweak model's weights to our DPT structure. 
- """ - - # define DPT configuration based on URL - config, expected_shape = get_dpt_config(checkpoint_url) - # load original state_dict from URL - # state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - state_dict = torch.load(checkpoint_url, map_location="cpu") - # remove certain keys - remove_ignore_keys_(state_dict) - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # read in qkv matrices - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = DPTForSemanticSegmentation(config) if "ade" in checkpoint_url else DPTForDepthEstimation(config) - model.load_state_dict(state_dict) - model.eval() - - # Check outputs on an image - size = 480 if "ade" in checkpoint_url else 384 - image_processor = DPTImageProcessor(size=size) - - image = prepare_img() - encoding = image_processor(image, return_tensors="pt") - - # forward pass - outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth - - if show_prediction: - prediction = ( - torch.nn.functional.interpolate( - outputs.unsqueeze(1), - size=(image.size[1], image.size[0]), - mode="bicubic", - align_corners=False, - ) - .squeeze() - .cpu() - .numpy() - ) - - Image.fromarray((prediction / prediction.max()) * 255).show() - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model.push_to_hub("ybelkada/dpt-hybrid-midas") - image_processor.push_to_hub("ybelkada/dpt-hybrid-midas") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt", - type=str, - help="URL of the original DPT checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=False, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - ) - parser.add_argument( - "--model_name", - default="dpt-large", - type=str, - help="Name of the model, in case you're pushing to the hub.", - ) - parser.add_argument( - "--show_prediction", - action="store_true", - ) - - args = parser.parse_args() - convert_dpt_checkpoint( - args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name, args.show_prediction - ) diff --git a/src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py b/src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py deleted file mode 100644 index 0feebe72d474..000000000000 --- a/src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py +++ /dev/null @@ -1,321 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DPT 3.1 checkpoints from the MiDaS repository. URL: https://github.com/isl-org/MiDaS""" - -import argparse -from pathlib import Path - -import requests -import torch -from PIL import Image - -from transformers import DPTConfig, DPTForDepthEstimation, DPTImageProcessor, Swinv2Config -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(model_name): - if "tiny" in model_name: - embed_dim = 96 - depths = (2, 2, 6, 2) - num_heads = (3, 6, 12, 24) - window_size = 16 - # note: for Swinv2-tiny authors used the window_size = 16 variant - # as seen here: https://github.com/isl-org/MiDaS/blob/bdc4ed64c095e026dc0a2f17cabb14d58263decb/midas/backbones/swin2.py#L26 - pretrained_window_sizes = (0, 0, 0, 0) - elif "base" in model_name: - embed_dim = 128 - depths = (2, 2, 18, 2) - num_heads = (4, 8, 16, 32) - window_size = 24 - pretrained_window_sizes = (12, 12, 12, 6) - elif "large" in model_name: - embed_dim = 192 - depths = (2, 2, 18, 2) - num_heads = (6, 12, 24, 48) - window_size = 24 - pretrained_window_sizes = (12, 12, 12, 6) - - if "384" in model_name: - image_size = 384 - elif "256" in model_name: - image_size = 256 - else: - raise ValueError("Model not supported, to do") - - backbone_config = Swinv2Config( - image_size=image_size, - embed_dim=embed_dim, - depths=depths, - window_size=window_size, - pretrained_window_sizes=pretrained_window_sizes, - num_heads=num_heads, - out_features=["stage1", "stage2", "stage3", "stage4"], - ) - - if model_name == "dpt-swinv2-tiny-256": - neck_hidden_sizes = [96, 192, 384, 768] - elif model_name == "dpt-swinv2-base-384": - neck_hidden_sizes = [128, 256, 512, 1024] - elif model_name == "dpt-swinv2-large-384": - neck_hidden_sizes = [192, 384, 768, 1536] - - config = DPTConfig(backbone_config=backbone_config, neck_hidden_sizes=neck_hidden_sizes) - - return config, image_size - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - - # fmt: off - # stem - rename_keys.append(("pretrained.model.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("pretrained.model.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("pretrained.model.patch_embed.norm.weight", "backbone.embeddings.norm.weight")) - rename_keys.append(("pretrained.model.patch_embed.norm.bias", "backbone.embeddings.norm.bias")) - - # transformer encoder - for i in range(len(config.backbone_config.depths)): - for j in range(config.backbone_config.depths[i]): - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.logit_scale", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.logit_scale")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.cpb_mlp.0.weight", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.continuous_position_bias_mlp.0.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.cpb_mlp.0.bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.continuous_position_bias_mlp.0.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.cpb_mlp.2.weight", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.continuous_position_bias_mlp.2.weight")) - 
rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.q_bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.query.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.v_bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.value.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.proj.weight", f"backbone.encoder.layers.{i}.blocks.{j}.attention.output.dense.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.proj.bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.output.dense.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.norm1.weight", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_before.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.norm1.bias", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_before.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.mlp.fc1.weight", f"backbone.encoder.layers.{i}.blocks.{j}.intermediate.dense.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.mlp.fc1.bias", f"backbone.encoder.layers.{i}.blocks.{j}.intermediate.dense.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.mlp.fc2.weight", f"backbone.encoder.layers.{i}.blocks.{j}.output.dense.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.mlp.fc2.bias", f"backbone.encoder.layers.{i}.blocks.{j}.output.dense.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.norm2.weight", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_after.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.norm2.bias", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_after.bias")) - - # downsample parameters - if i in [0,1,2]: - rename_keys.append((f"pretrained.model.layers.{i}.downsample.reduction.weight", f"backbone.encoder.layers.{i}.downsample.reduction.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.downsample.norm.weight", f"backbone.encoder.layers.{i}.downsample.norm.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.downsample.norm.bias", f"backbone.encoder.layers.{i}.downsample.norm.bias")) - - # note: non-Transformer backbones like Swinv2, LeViT et al don't require activation postprocessing (readout projections + resize blocks) - - # refinenet (tricky here) - mapping = {1:3, 2:2, 3:1, 4:0} - - for i in range(1, 5): - j = mapping[i] - rename_keys.append((f"scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight")) - rename_keys.append((f"scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias")) - 
rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias")) - - # scratch convolutions - for i in range(4): - rename_keys.append((f"scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight")) - - # head - for i in range(0, 5, 2): - rename_keys.append((f"scratch.output_conv.{i}.weight", f"head.head.{i}.weight")) - rename_keys.append((f"scratch.output_conv.{i}.bias", f"head.head.{i}.bias")) - - return rename_keys - - -def remove_ignore_keys_(state_dict): - ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, model): - for i in range(len(config.backbone_config.depths)): - for j in range(config.backbone_config.depths[i]): - dim = model.backbone.encoder.layers[i].blocks[j].attention.self.all_head_size - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"pretrained.model.layers.{i}.blocks.{j}.attn.qkv.weight") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :] - state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[ - dim : dim * 2, : - ] - state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[ - -dim:, : - ] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, verify_logits, push_to_hub): - """ - Copy/paste/tweak model's weights to our DPT structure. 
- """ - - name_to_url = { - "dpt-swinv2-tiny-256": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt", - "dpt-swinv2-base-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_base_384.pt", - "dpt-swinv2-large-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt", - } - - # define DPT configuration based on URL - checkpoint_url = name_to_url[model_name] - config, image_size = get_dpt_config(model_name) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - - # load HuggingFace model - model = DPTForDepthEstimation(config) - - # remove certain keys - remove_ignore_keys_(state_dict) - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - # read in qkv matrices - read_in_q_k_v(state_dict, config, model) - - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - model.eval() - - # Check outputs on an image - processor = DPTImageProcessor(size={"height": image_size, "width": image_size}) - - image = prepare_img() - processor(image, return_tensors="pt") - - if verify_logits: - from torchvision import transforms - - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - transforms = transforms.Compose( - [ - transforms.Resize((image_size, image_size)), - transforms.ToTensor(), - ] - ) - pixel_values = transforms(image).unsqueeze(0) - - # forward pass - with torch.no_grad(): - outputs = model(pixel_values) - - predicted_depth = outputs.predicted_depth - - print("Shape of predicted depth:", predicted_depth.shape) - print("First values of predicted depth:", predicted_depth[0, :3, :3]) - - # assert logits - if model_name == "dpt-swinv2-base-384": - # OK, checked - expected_shape = torch.Size([1, 384, 384]) - expected_slice = torch.tensor( - [ - [1998.5575, 1997.3887, 2009.2981], - [1952.8607, 1979.6488, 2001.0854], - [1953.7697, 1961.7711, 1968.8904], - ], - ) - elif model_name == "dpt-swinv2-tiny-256": - # OK, checked - expected_shape = torch.Size([1, 256, 256]) - expected_slice = torch.tensor( - [[978.9163, 976.5215, 978.5349], [974.1859, 971.7249, 975.8046], [971.3419, 970.3118, 971.6830]], - ) - elif model_name == "dpt-swinv2-large-384": - # OK, checked - expected_shape = torch.Size([1, 384, 384]) - expected_slice = torch.tensor( - [ - [1203.7206, 1200.1495, 1197.8234], - [1196.2484, 1183.5033, 1186.4640], - [1178.8131, 1182.3260, 1174.3975], - ], - ) - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"Intel/{model_name}") - processor.push_to_hub(repo_id=f"Intel/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dpt-swinv2-base-384", - type=str, - choices=["dpt-swinv2-tiny-256", "dpt-swinv2-base-384", "dpt-swinv2-large-384"], - 
help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--verify_logits", - action="store_true", - help="Whether to verify logits after conversion.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.verify_logits, args.push_to_hub) diff --git a/src/transformers/models/dpt/convert_dpt_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_to_pytorch.py deleted file mode 100644 index 489da9acd19c..000000000000 --- a/src/transformers/models/dpt/convert_dpt_to_pytorch.py +++ /dev/null @@ -1,285 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DPT checkpoints from the original repository. URL: https://github.com/isl-org/DPT""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(checkpoint_url): - config = DPTConfig() - - if "large" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - config.backbone_out_indices = [5, 11, 17, 23] - config.neck_hidden_sizes = [256, 512, 1024, 1024] - expected_shape = (1, 384, 384) - - if "ade" in checkpoint_url: - config.use_batch_norm_in_fusion_residual = True - - config.num_labels = 150 - repo_id = "huggingface/label-files" - filename = "ade20k-id2label.json" - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - expected_shape = [1, 150, 480, 480] - - return config, expected_shape - - -def remove_ignore_keys_(state_dict): - ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(name): - if ( - "pretrained.model" in name - and "cls_token" not in name - and "pos_embed" not in name - and "patch_embed" not in name - ): - name = name.replace("pretrained.model", "dpt.encoder") - if "pretrained.model" in name: - name = name.replace("pretrained.model", "dpt.embeddings") - if "patch_embed" in name: - name = name.replace("patch_embed", "patch_embeddings") - if "pos_embed" in name: - name = name.replace("pos_embed", "position_embeddings") - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if 
"proj" in name and "project" not in name: - name = name.replace("proj", "projection") - if "blocks" in name: - name = name.replace("blocks", "layer") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") - if "norm1" in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name: - name = name.replace("norm2", "layernorm_after") - if "scratch.output_conv" in name: - name = name.replace("scratch.output_conv", "head") - if "scratch" in name: - name = name.replace("scratch", "neck") - if "layer1_rn" in name: - name = name.replace("layer1_rn", "convs.0") - if "layer2_rn" in name: - name = name.replace("layer2_rn", "convs.1") - if "layer3_rn" in name: - name = name.replace("layer3_rn", "convs.2") - if "layer4_rn" in name: - name = name.replace("layer4_rn", "convs.3") - if "refinenet" in name: - layer_idx = int(name[len("neck.refinenet") : len("neck.refinenet") + 1]) - # tricky here: we need to map 4 to 0, 3 to 1, 2 to 2 and 1 to 3 - name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx-4)}") - if "out_conv" in name: - name = name.replace("out_conv", "projection") - if "resConfUnit1" in name: - name = name.replace("resConfUnit1", "residual_layer1") - if "resConfUnit2" in name: - name = name.replace("resConfUnit2", "residual_layer2") - if "conv1" in name: - name = name.replace("conv1", "convolution1") - if "conv2" in name: - name = name.replace("conv2", "convolution2") - # readout blocks - if "pretrained.act_postprocess1.0.project.0" in name: - name = name.replace("pretrained.act_postprocess1.0.project.0", "neck.reassemble_stage.readout_projects.0.0") - if "pretrained.act_postprocess2.0.project.0" in name: - name = name.replace("pretrained.act_postprocess2.0.project.0", "neck.reassemble_stage.readout_projects.1.0") - if "pretrained.act_postprocess3.0.project.0" in name: - name = name.replace("pretrained.act_postprocess3.0.project.0", "neck.reassemble_stage.readout_projects.2.0") - if "pretrained.act_postprocess4.0.project.0" in name: - name = name.replace("pretrained.act_postprocess4.0.project.0", "neck.reassemble_stage.readout_projects.3.0") - # resize blocks - if "pretrained.act_postprocess1.3" in name: - name = name.replace("pretrained.act_postprocess1.3", "neck.reassemble_stage.layers.0.projection") - if "pretrained.act_postprocess1.4" in name: - name = name.replace("pretrained.act_postprocess1.4", "neck.reassemble_stage.layers.0.resize") - if "pretrained.act_postprocess2.3" in name: - name = name.replace("pretrained.act_postprocess2.3", "neck.reassemble_stage.layers.1.projection") - if "pretrained.act_postprocess2.4" in name: - name = name.replace("pretrained.act_postprocess2.4", "neck.reassemble_stage.layers.1.resize") - if "pretrained.act_postprocess3.3" in name: - name = name.replace("pretrained.act_postprocess3.3", "neck.reassemble_stage.layers.2.projection") - if "pretrained.act_postprocess4.3" in name: - name = name.replace("pretrained.act_postprocess4.3", "neck.reassemble_stage.layers.3.projection") - if "pretrained.act_postprocess4.4" in name: - name = name.replace("pretrained.act_postprocess4.4", "neck.reassemble_stage.layers.3.resize") - if "pretrained" in name: - name = name.replace("pretrained", "dpt") - if "bn" in name: - name = name.replace("bn", "batch_norm") - if "head" in name: - name = name.replace("head", "head.head") - if "encoder.norm" in name: - name = name.replace("encoder.norm", "layernorm") - if "auxlayer" in name: - name = 
name.replace("auxlayer", "auxiliary_head.head") - - return name - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub, model_name): - """ - Copy/paste/tweak model's weights to our DPT structure. - """ - - # define DPT configuration based on URL - config, expected_shape = get_dpt_config(checkpoint_url) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - # remove certain keys - remove_ignore_keys_(state_dict) - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # read in qkv matrices - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = DPTForSemanticSegmentation(config) if "ade" in checkpoint_url else DPTForDepthEstimation(config) - model.load_state_dict(state_dict) - model.eval() - - # Check outputs on an image - size = 480 if "ade" in checkpoint_url else 384 - image_processor = DPTImageProcessor(size=size) - - image = prepare_img() - encoding = image_processor(image, return_tensors="pt") - - # forward pass - outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth - - # Assert logits - expected_slice = torch.tensor([[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]]) - if "ade" in checkpoint_url: - expected_slice = torch.tensor([[4.0480, 4.2420, 4.4360], [4.3124, 4.5693, 4.8261], [4.5768, 4.8965, 5.2163]]) - assert outputs.shape == torch.Size(expected_shape) - assert ( - torch.allclose(outputs[0, 0, :3, :3], expected_slice, atol=1e-4) - if "ade" in checkpoint_url - else torch.allclose(outputs[0, :3, :3], expected_slice) - ) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model to hub...") - 
model.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add model", - use_temp_dir=True, - ) - image_processor.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add image processor", - use_temp_dir=True, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt", - type=str, - help="URL of the original DPT checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=False, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - ) - parser.add_argument( - "--model_name", - default="dpt-large", - type=str, - required=False, - help="Name of the model, in case you're pushing to the hub.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name) diff --git a/src/transformers/models/efficientnet/convert_efficientnet_to_pytorch.py b/src/transformers/models/efficientnet/convert_efficientnet_to_pytorch.py deleted file mode 100644 index e9988524aca0..000000000000 --- a/src/transformers/models/efficientnet/convert_efficientnet_to_pytorch.py +++ /dev/null @@ -1,339 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert EfficientNet checkpoints from the original repository. 
- -URL: https://github.com/keras-team/keras/blob/v2.11.0/keras/applications/efficientnet.py""" - -import argparse -import json -import os - -import numpy as np -import PIL -import requests -import tensorflow.keras.applications.efficientnet as efficientnet -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from tensorflow.keras.preprocessing import image - -from transformers import ( - EfficientNetConfig, - EfficientNetForImageClassification, - EfficientNetImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -model_classes = { - "b0": efficientnet.EfficientNetB0, - "b1": efficientnet.EfficientNetB1, - "b2": efficientnet.EfficientNetB2, - "b3": efficientnet.EfficientNetB3, - "b4": efficientnet.EfficientNetB4, - "b5": efficientnet.EfficientNetB5, - "b6": efficientnet.EfficientNetB6, - "b7": efficientnet.EfficientNetB7, -} - -CONFIG_MAP = { - "b0": { - "hidden_dim": 1280, - "width_coef": 1.0, - "depth_coef": 1.0, - "image_size": 224, - "dropout_rate": 0.2, - "dw_padding": [], - }, - "b1": { - "hidden_dim": 1280, - "width_coef": 1.0, - "depth_coef": 1.1, - "image_size": 240, - "dropout_rate": 0.2, - "dw_padding": [16], - }, - "b2": { - "hidden_dim": 1408, - "width_coef": 1.1, - "depth_coef": 1.2, - "image_size": 260, - "dropout_rate": 0.3, - "dw_padding": [5, 8, 16], - }, - "b3": { - "hidden_dim": 1536, - "width_coef": 1.2, - "depth_coef": 1.4, - "image_size": 300, - "dropout_rate": 0.3, - "dw_padding": [5, 18], - }, - "b4": { - "hidden_dim": 1792, - "width_coef": 1.4, - "depth_coef": 1.8, - "image_size": 380, - "dropout_rate": 0.4, - "dw_padding": [6], - }, - "b5": { - "hidden_dim": 2048, - "width_coef": 1.6, - "depth_coef": 2.2, - "image_size": 456, - "dropout_rate": 0.4, - "dw_padding": [13, 27], - }, - "b6": { - "hidden_dim": 2304, - "width_coef": 1.8, - "depth_coef": 2.6, - "image_size": 528, - "dropout_rate": 0.5, - "dw_padding": [31], - }, - "b7": { - "hidden_dim": 2560, - "width_coef": 2.0, - "depth_coef": 3.1, - "image_size": 600, - "dropout_rate": 0.5, - "dw_padding": [18], - }, -} - - -def get_efficientnet_config(model_name): - config = EfficientNetConfig() - config.hidden_dim = CONFIG_MAP[model_name]["hidden_dim"] - config.width_coefficient = CONFIG_MAP[model_name]["width_coef"] - config.depth_coefficient = CONFIG_MAP[model_name]["depth_coef"] - config.image_size = CONFIG_MAP[model_name]["image_size"] - config.dropout_rate = CONFIG_MAP[model_name]["dropout_rate"] - config.depthwise_padding = CONFIG_MAP[model_name]["dw_padding"] - - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - config.num_labels = 1000 - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - return config - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def convert_image_processor(model_name): - size = CONFIG_MAP[model_name]["image_size"] - preprocessor = EfficientNetImageProcessor( - size={"height": size, "width": size}, - image_mean=[0.485, 0.456, 0.406], - image_std=[0.47853944, 0.4732864, 0.47434163], - do_center_crop=False, - ) - return preprocessor - - -# here we list all keys to be renamed (original name on the left, our name on 
the right) -def rename_keys(original_param_names): - block_names = [v.split("_")[0].split("block")[1] for v in original_param_names if v.startswith("block")] - block_names = sorted(set(block_names)) - num_blocks = len(block_names) - block_name_mapping = {b: str(i) for b, i in zip(block_names, range(num_blocks))} - - rename_keys = [] - rename_keys.append(("stem_conv/kernel:0", "embeddings.convolution.weight")) - rename_keys.append(("stem_bn/gamma:0", "embeddings.batchnorm.weight")) - rename_keys.append(("stem_bn/beta:0", "embeddings.batchnorm.bias")) - rename_keys.append(("stem_bn/moving_mean:0", "embeddings.batchnorm.running_mean")) - rename_keys.append(("stem_bn/moving_variance:0", "embeddings.batchnorm.running_var")) - - for b in block_names: - hf_b = block_name_mapping[b] - rename_keys.append((f"block{b}_expand_conv/kernel:0", f"encoder.blocks.{hf_b}.expansion.expand_conv.weight")) - rename_keys.append((f"block{b}_expand_bn/gamma:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.weight")) - rename_keys.append((f"block{b}_expand_bn/beta:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.bias")) - rename_keys.append( - (f"block{b}_expand_bn/moving_mean:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_mean") - ) - rename_keys.append( - (f"block{b}_expand_bn/moving_variance:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_var") - ) - rename_keys.append( - (f"block{b}_dwconv/depthwise_kernel:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_conv.weight") - ) - rename_keys.append((f"block{b}_bn/gamma:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.weight")) - rename_keys.append((f"block{b}_bn/beta:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.bias")) - rename_keys.append( - (f"block{b}_bn/moving_mean:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_mean") - ) - rename_keys.append( - (f"block{b}_bn/moving_variance:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_var") - ) - - rename_keys.append((f"block{b}_se_reduce/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.weight")) - rename_keys.append((f"block{b}_se_reduce/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.bias")) - rename_keys.append((f"block{b}_se_expand/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.weight")) - rename_keys.append((f"block{b}_se_expand/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.bias")) - rename_keys.append( - (f"block{b}_project_conv/kernel:0", f"encoder.blocks.{hf_b}.projection.project_conv.weight") - ) - rename_keys.append((f"block{b}_project_bn/gamma:0", f"encoder.blocks.{hf_b}.projection.project_bn.weight")) - rename_keys.append((f"block{b}_project_bn/beta:0", f"encoder.blocks.{hf_b}.projection.project_bn.bias")) - rename_keys.append( - (f"block{b}_project_bn/moving_mean:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_mean") - ) - rename_keys.append( - (f"block{b}_project_bn/moving_variance:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_var") - ) - - rename_keys.append(("top_conv/kernel:0", "encoder.top_conv.weight")) - rename_keys.append(("top_bn/gamma:0", "encoder.top_bn.weight")) - rename_keys.append(("top_bn/beta:0", "encoder.top_bn.bias")) - rename_keys.append(("top_bn/moving_mean:0", "encoder.top_bn.running_mean")) - rename_keys.append(("top_bn/moving_variance:0", "encoder.top_bn.running_var")) - - key_mapping = {} - for item in rename_keys: - if item[0] in original_param_names: - key_mapping[item[0]] = "efficientnet." 
+ item[1] - - key_mapping["predictions/kernel:0"] = "classifier.weight" - key_mapping["predictions/bias:0"] = "classifier.bias" - return key_mapping - - -def replace_params(hf_params, tf_params, key_mapping): - for key, value in tf_params.items(): - if "normalization" in key: - continue - - hf_key = key_mapping[key] - if "_conv" in key and "kernel" in key: - new_hf_value = torch.from_numpy(value).permute(3, 2, 0, 1) - elif "depthwise_kernel" in key: - new_hf_value = torch.from_numpy(value).permute(2, 3, 0, 1) - elif "kernel" in key: - new_hf_value = torch.from_numpy(np.transpose(value)) - else: - new_hf_value = torch.from_numpy(value) - - # Replace HF parameters with original TF model parameters - assert hf_params[hf_key].shape == new_hf_value.shape - hf_params[hf_key].copy_(new_hf_value) - - -@torch.no_grad() -def convert_efficientnet_checkpoint(model_name, pytorch_dump_folder_path, save_model, push_to_hub): - """ - Copy/paste/tweak model's weights to our EfficientNet structure. - """ - # Load original model - original_model = model_classes[model_name]( - include_top=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax", - ) - - tf_params = original_model.trainable_variables - tf_non_train_params = original_model.non_trainable_variables - tf_params = {param.name: param.numpy() for param in tf_params} - for param in tf_non_train_params: - tf_params[param.name] = param.numpy() - tf_param_names = list(tf_params.keys()) - - # Load HuggingFace model - config = get_efficientnet_config(model_name) - hf_model = EfficientNetForImageClassification(config).eval() - hf_params = hf_model.state_dict() - - # Create src-to-dst parameter name mapping dictionary - print("Converting parameters...") - key_mapping = rename_keys(tf_param_names) - replace_params(hf_params, tf_params, key_mapping) - - # Initialize preprocessor and preprocess input image - preprocessor = convert_image_processor(model_name) - inputs = preprocessor(images=prepare_img(), return_tensors="pt") - - # HF model inference - hf_model.eval() - with torch.no_grad(): - outputs = hf_model(**inputs) - hf_logits = outputs.logits.detach().numpy() - - # Original model inference - original_model.trainable = False - image_size = CONFIG_MAP[model_name]["image_size"] - img = prepare_img().resize((image_size, image_size), resample=PIL.Image.NEAREST) - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - original_logits = original_model.predict(x) - - # Check whether original and HF model outputs match -> np.allclose - assert np.allclose(original_logits, hf_logits, atol=1e-3), "The predicted logits are not the same." 
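The transpositions in replace_params above follow from the different kernel layouts: Keras stores Conv2D kernels as (height, width, in_channels, out_channels) and depthwise kernels as (height, width, channels, depth_multiplier), while PyTorch convolutions expect (out_channels, in_channels, height, width). A small standalone check of the regular convolution case, using made-up shapes purely for illustration:

import numpy as np
import torch

# hypothetical 3x3 kernel with 4 input and 8 output channels, in the Keras HWIO layout
tf_kernel = np.random.randn(3, 3, 4, 8).astype(np.float32)

# same permutation replace_params applies to "_conv"/"kernel" entries: HWIO -> OIHW
pt_kernel = torch.from_numpy(tf_kernel).permute(3, 2, 0, 1)

assert pt_kernel.shape == (8, 4, 3, 3)
# each (output channel, input channel) pair carries the same 3x3 filter in both layouts
assert torch.allclose(pt_kernel[5, 2], torch.from_numpy(tf_kernel[:, :, 2, 5]))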
- print("Model outputs match!") - - if save_model: - # Create folder to save model - if not os.path.isdir(pytorch_dump_folder_path): - os.mkdir(pytorch_dump_folder_path) - # Save converted model and image processor - hf_model.save_pretrained(pytorch_dump_folder_path) - preprocessor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - # Push model and image processor to hub - print(f"Pushing converted {model_name} to the hub...") - model_name = f"efficientnet-{model_name}" - preprocessor.push_to_hub(model_name) - hf_model.push_to_hub(model_name) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="b0", - type=str, - help="Version name of the EfficientNet model you want to convert, select from [b0, b1, b2, b3, b4, b5, b6, b7].", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="hf_model", - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--save_model", action="store_true", help="Save model to local") - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub") - - args = parser.parse_args() - convert_efficientnet_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub) diff --git a/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index b0abc30cd758..000000000000 --- a/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,79 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert ELECTRA checkpoint.""" - -import argparse - -import torch - -from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator): - # Initialise PyTorch model - config = ElectraConfig.from_json_file(config_file) - print(f"Building PyTorch model from configuration: {config}") - - if discriminator_or_generator == "discriminator": - model = ElectraForPreTraining(config) - elif discriminator_or_generator == "generator": - model = ElectraForMaskedLM(config) - else: - raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'") - - # Load weights from tf checkpoint - load_tf_weights_in_electra( - model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator - ) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help="The config json file corresponding to the pre-trained model. \nThis specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--discriminator_or_generator", - default=None, - type=str, - required=True, - help=( - "Whether to export the generator or the discriminator. Should be a string, either 'discriminator' or " - "'generator'." - ), - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch( - args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.discriminator_or_generator - ) diff --git a/src/transformers/models/encodec/convert_encodec_checkpoint_to_pytorch.py b/src/transformers/models/encodec/convert_encodec_checkpoint_to_pytorch.py deleted file mode 100644 index 4db97bd68836..000000000000 --- a/src/transformers/models/encodec/convert_encodec_checkpoint_to_pytorch.py +++ /dev/null @@ -1,365 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert EnCodec checkpoints.""" - -import argparse - -import torch - -from transformers import ( - EncodecConfig, - EncodecFeatureExtractor, - EncodecModel, - logging, -) - - -# checkpoints downloaded from: -# https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th -# https://huggingface.co/facebook/musicgen-small/resolve/main/compression_state_dict.bin -# https://dl.fbaipublicfiles.com/encodec/v0/encodec_48khz-7e698e3e.th - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.encodec") - -MAPPING_QUANTIZER = { - "quantizer.vq.layers.*._codebook.inited": "quantizer.layers.*.codebook.inited", - "quantizer.vq.layers.*._codebook.cluster_size": "quantizer.layers.*.codebook.cluster_size", - "quantizer.vq.layers.*._codebook.embed": "quantizer.layers.*.codebook.embed", - "quantizer.vq.layers.*._codebook.embed_avg": "quantizer.layers.*.codebook.embed_avg", -} -MAPPING_ENCODER = { - "encoder.model.0.conv.conv": "encoder.layers.0.conv", - "encoder.model.1.block.1.conv.conv": "encoder.layers.1.block.1.conv", - "encoder.model.1.block.3.conv.conv": "encoder.layers.1.block.3.conv", - "encoder.model.1.shortcut.conv.conv": "encoder.layers.1.shortcut.conv", - "encoder.model.3.conv.conv": "encoder.layers.3.conv", - "encoder.model.4.block.1.conv.conv": "encoder.layers.4.block.1.conv", - "encoder.model.4.block.3.conv.conv": "encoder.layers.4.block.3.conv", - "encoder.model.4.shortcut.conv.conv": "encoder.layers.4.shortcut.conv", - "encoder.model.6.conv.conv": "encoder.layers.6.conv", - "encoder.model.7.block.1.conv.conv": "encoder.layers.7.block.1.conv", - "encoder.model.7.block.3.conv.conv": "encoder.layers.7.block.3.conv", - "encoder.model.7.shortcut.conv.conv": "encoder.layers.7.shortcut.conv", - "encoder.model.9.conv.conv": "encoder.layers.9.conv", - "encoder.model.10.block.1.conv.conv": "encoder.layers.10.block.1.conv", - "encoder.model.10.block.3.conv.conv": "encoder.layers.10.block.3.conv", - "encoder.model.10.shortcut.conv.conv": "encoder.layers.10.shortcut.conv", - "encoder.model.12.conv.conv": "encoder.layers.12.conv", - "encoder.model.13.lstm": "encoder.layers.13.lstm", - "encoder.model.15.conv.conv": "encoder.layers.15.conv", -} -MAPPING_ENCODER_48K = { - "encoder.model.0.conv.norm": "encoder.layers.0.norm", - "encoder.model.1.block.1.conv.norm": "encoder.layers.1.block.1.norm", - "encoder.model.1.block.3.conv.norm": "encoder.layers.1.block.3.norm", - "encoder.model.1.shortcut.conv.norm": "encoder.layers.1.shortcut.norm", - "encoder.model.3.conv.norm": "encoder.layers.3.norm", - "encoder.model.4.block.1.conv.norm": "encoder.layers.4.block.1.norm", - "encoder.model.4.block.3.conv.norm": "encoder.layers.4.block.3.norm", - "encoder.model.4.shortcut.conv.norm": "encoder.layers.4.shortcut.norm", - "encoder.model.6.conv.norm": "encoder.layers.6.norm", - "encoder.model.7.block.1.conv.norm": "encoder.layers.7.block.1.norm", - "encoder.model.7.block.3.conv.norm": "encoder.layers.7.block.3.norm", - "encoder.model.7.shortcut.conv.norm": "encoder.layers.7.shortcut.norm", - "encoder.model.9.conv.norm": "encoder.layers.9.norm", - "encoder.model.10.block.1.conv.norm": "encoder.layers.10.block.1.norm", - "encoder.model.10.block.3.conv.norm": "encoder.layers.10.block.3.norm", - "encoder.model.10.shortcut.conv.norm": "encoder.layers.10.shortcut.norm", - "encoder.model.12.conv.norm": "encoder.layers.12.norm", - "encoder.model.15.conv.norm": "encoder.layers.15.norm", -} -MAPPING_DECODER = { - "decoder.model.0.conv.conv": "decoder.layers.0.conv", - "decoder.model.1.lstm": 
"decoder.layers.1.lstm", - "decoder.model.3.convtr.convtr": "decoder.layers.3.conv", - "decoder.model.4.block.1.conv.conv": "decoder.layers.4.block.1.conv", - "decoder.model.4.block.3.conv.conv": "decoder.layers.4.block.3.conv", - "decoder.model.4.shortcut.conv.conv": "decoder.layers.4.shortcut.conv", - "decoder.model.6.convtr.convtr": "decoder.layers.6.conv", - "decoder.model.7.block.1.conv.conv": "decoder.layers.7.block.1.conv", - "decoder.model.7.block.3.conv.conv": "decoder.layers.7.block.3.conv", - "decoder.model.7.shortcut.conv.conv": "decoder.layers.7.shortcut.conv", - "decoder.model.9.convtr.convtr": "decoder.layers.9.conv", - "decoder.model.10.block.1.conv.conv": "decoder.layers.10.block.1.conv", - "decoder.model.10.block.3.conv.conv": "decoder.layers.10.block.3.conv", - "decoder.model.10.shortcut.conv.conv": "decoder.layers.10.shortcut.conv", - "decoder.model.12.convtr.convtr": "decoder.layers.12.conv", - "decoder.model.13.block.1.conv.conv": "decoder.layers.13.block.1.conv", - "decoder.model.13.block.3.conv.conv": "decoder.layers.13.block.3.conv", - "decoder.model.13.shortcut.conv.conv": "decoder.layers.13.shortcut.conv", - "decoder.model.15.conv.conv": "decoder.layers.15.conv", -} -MAPPING_DECODER_48K = { - "decoder.model.0.conv.norm": "decoder.layers.0.norm", - "decoder.model.3.convtr.norm": "decoder.layers.3.norm", - "decoder.model.4.block.1.conv.norm": "decoder.layers.4.block.1.norm", - "decoder.model.4.block.3.conv.norm": "decoder.layers.4.block.3.norm", - "decoder.model.4.shortcut.conv.norm": "decoder.layers.4.shortcut.norm", - "decoder.model.6.convtr.norm": "decoder.layers.6.norm", - "decoder.model.7.block.1.conv.norm": "decoder.layers.7.block.1.norm", - "decoder.model.7.block.3.conv.norm": "decoder.layers.7.block.3.norm", - "decoder.model.7.shortcut.conv.norm": "decoder.layers.7.shortcut.norm", - "decoder.model.9.convtr.norm": "decoder.layers.9.norm", - "decoder.model.10.block.1.conv.norm": "decoder.layers.10.block.1.norm", - "decoder.model.10.block.3.conv.norm": "decoder.layers.10.block.3.norm", - "decoder.model.10.shortcut.conv.norm": "decoder.layers.10.shortcut.norm", - "decoder.model.12.convtr.norm": "decoder.layers.12.norm", - "decoder.model.13.block.1.conv.norm": "decoder.layers.13.block.1.norm", - "decoder.model.13.block.3.conv.norm": "decoder.layers.13.block.3.norm", - "decoder.model.13.shortcut.conv.norm": "decoder.layers.13.shortcut.norm", - "decoder.model.15.conv.norm": "decoder.layers.15.norm", -} -MAPPING_24K = { - **MAPPING_QUANTIZER, - **MAPPING_ENCODER, - **MAPPING_DECODER, -} -MAPPING_48K = { - **MAPPING_QUANTIZER, - **MAPPING_ENCODER, - **MAPPING_ENCODER_48K, - **MAPPING_DECODER, - **MAPPING_DECODER_48K, -} -TOP_LEVEL_KEYS = [] -IGNORE_KEYS = [] - - -def set_recursively(hf_pointer, key, value, full_name, weight_type): - for attribute in key.split("."): - hf_pointer = getattr(hf_pointer, attribute) - - if weight_type is not None: - hf_shape = getattr(hf_pointer, weight_type).shape - else: - hf_shape = hf_pointer.shape - - if hf_shape != value.shape: - raise ValueError( - f"Shape of hf {key + '.' 
+ weight_type if weight_type is not None else ''} is {hf_shape}, but should be" - f" {value.shape} for {full_name}" - ) - - if weight_type == "weight": - hf_pointer.weight.data = value - elif weight_type == "weight_g": - hf_pointer.weight_g.data = value - elif weight_type == "weight_v": - hf_pointer.weight_v.data = value - elif weight_type == "bias": - hf_pointer.bias.data = value - elif weight_type == "running_mean": - hf_pointer.running_mean.data = value - elif weight_type == "running_var": - hf_pointer.running_var.data = value - elif weight_type == "num_batches_tracked": - hf_pointer.num_batches_tracked.data = value - elif weight_type == "weight_ih_l0": - hf_pointer.weight_ih_l0.data = value - elif weight_type == "weight_hh_l0": - hf_pointer.weight_hh_l0.data = value - elif weight_type == "bias_ih_l0": - hf_pointer.bias_ih_l0.data = value - elif weight_type == "bias_hh_l0": - hf_pointer.bias_hh_l0.data = value - elif weight_type == "weight_ih_l1": - hf_pointer.weight_ih_l1.data = value - elif weight_type == "weight_hh_l1": - hf_pointer.weight_hh_l1.data = value - elif weight_type == "bias_ih_l1": - hf_pointer.bias_ih_l1.data = value - elif weight_type == "bias_hh_l1": - hf_pointer.bias_hh_l1.data = value - else: - hf_pointer.data = value - - logger.info(f"{key + ('.' + weight_type if weight_type is not None else '')} was initialized from {full_name}.") - - -def should_ignore(name, ignore_keys): - for key in ignore_keys: - if key.endswith(".*"): - if name.startswith(key[:-1]): - return True - elif ".*." in key: - prefix, suffix = key.split(".*.") - if prefix in name and suffix in name: - return True - elif key in name: - return True - return False - - -def recursively_load_weights(orig_dict, hf_model, model_name): - unused_weights = [] - - if model_name in ["encodec_24khz", "encodec_32khz"]: - MAPPING = MAPPING_24K - elif model_name == "encodec_48khz": - MAPPING = MAPPING_48K - else: - raise ValueError(f"Unsupported model: {model_name}") - - for name, value in orig_dict.items(): - if should_ignore(name, IGNORE_KEYS): - logger.info(f"{name} was ignored") - continue - - is_used = False - for key, mapped_key in MAPPING.items(): - if "*" in key: - prefix, suffix = key.split(".*.") - if prefix in name and suffix in name: - key = suffix - - if key in name: - # HACK otherwise .embed gets initialized with .embed_avg too - if key.endswith("embed") and name.endswith("embed_avg"): - continue - - is_used = True - if "*" in mapped_key: - layer_index = name.split(key)[0].split(".")[-2] - mapped_key = mapped_key.replace("*", layer_index) - if "weight_g" in name: - weight_type = "weight_g" - elif "weight_v" in name: - weight_type = "weight_v" - elif "weight_ih_l0" in name: - weight_type = "weight_ih_l0" - elif "weight_hh_l0" in name: - weight_type = "weight_hh_l0" - elif "bias_ih_l0" in name: - weight_type = "bias_ih_l0" - elif "bias_hh_l0" in name: - weight_type = "bias_hh_l0" - elif "weight_ih_l1" in name: - weight_type = "weight_ih_l1" - elif "weight_hh_l1" in name: - weight_type = "weight_hh_l1" - elif "bias_ih_l1" in name: - weight_type = "bias_ih_l1" - elif "bias_hh_l1" in name: - weight_type = "bias_hh_l1" - elif "bias" in name: - weight_type = "bias" - elif "weight" in name: - weight_type = "weight" - elif "running_mean" in name: - weight_type = "running_mean" - elif "running_var" in name: - weight_type = "running_var" - elif "num_batches_tracked" in name: - weight_type = "num_batches_tracked" - else: - weight_type = None - set_recursively(hf_model, mapped_key, value, name, weight_type) - 
continue - if not is_used: - unused_weights.append(name) - - logger.warning(f"Unused weights: {unused_weights}") - - -@torch.no_grad() -def convert_checkpoint( - model_name, - checkpoint_path, - pytorch_dump_folder_path, - config_path=None, - repo_id=None, -): - """ - Copy/paste/tweak model's weights to transformers design. - """ - if config_path is not None: - config = EncodecConfig.from_pretrained(config_path) - else: - config = EncodecConfig() - - if model_name == "encodec_24khz": - pass # config is already correct - elif model_name == "encodec_32khz": - config.upsampling_ratios = [8, 5, 4, 4] - config.target_bandwidths = [2.2] - config.num_filters = 64 - config.sampling_rate = 32_000 - config.codebook_size = 2048 - config.use_causal_conv = False - config.normalize = False - config.use_conv_shortcut = False - elif model_name == "encodec_48khz": - config.upsampling_ratios = [8, 5, 4, 2] - config.target_bandwidths = [3.0, 6.0, 12.0, 24.0] - config.sampling_rate = 48_000 - config.audio_channels = 2 - config.use_causal_conv = False - config.norm_type = "time_group_norm" - config.normalize = True - config.chunk_length_s = 1.0 - config.overlap = 0.01 - else: - raise ValueError(f"Unknown model name: {model_name}") - - model = EncodecModel(config) - - feature_extractor = EncodecFeatureExtractor( - feature_size=config.audio_channels, - sampling_rate=config.sampling_rate, - chunk_length_s=config.chunk_length_s, - overlap=config.overlap, - ) - feature_extractor.save_pretrained(pytorch_dump_folder_path) - - original_checkpoint = torch.load(checkpoint_path) - if "best_state" in original_checkpoint: - # we might have a training state saved, in which case discard the yaml results and just retain the weights - original_checkpoint = original_checkpoint["best_state"] - recursively_load_weights(original_checkpoint, model, model_name) - model.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") - feature_extractor.push_to_hub(repo_id) - model.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model", - default="encodec_24khz", - type=str, - help="The model to convert. Should be one of 'encodec_24khz', 'encodec_32khz', 'encodec_48khz'.", - ) - parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - parser.add_argument( - "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the đŸ€— hub." - ) - - args = parser.parse_args() - convert_checkpoint( - args.model, - args.checkpoint_path, - args.pytorch_dump_folder_path, - args.config_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/esm/convert_esm.py b/src/transformers/models/esm/convert_esm.py deleted file mode 100644 index 020dd4e57663..000000000000 --- a/src/transformers/models/esm/convert_esm.py +++ /dev/null @@ -1,399 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ESM checkpoint.""" - -import argparse -import pathlib -from pathlib import Path -from tempfile import TemporaryDirectory - -import esm as esm_module -import torch -from esm.esmfold.v1.misc import batch_encode_sequences as esmfold_encode_sequences -from esm.esmfold.v1.pretrained import esmfold_v1 - -from transformers.models.esm.configuration_esm import EsmConfig, EsmFoldConfig -from transformers.models.esm.modeling_esm import ( - EsmForMaskedLM, - EsmForSequenceClassification, - EsmIntermediate, - EsmLayer, - EsmOutput, - EsmSelfAttention, - EsmSelfOutput, -) -from transformers.models.esm.modeling_esmfold import EsmForProteinFolding -from transformers.models.esm.tokenization_esm import EsmTokenizer -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_DATA = [ - ( - "protein1", - "MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA", - ), - ("protein2", "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLA"), - ("protein3", "MKTVRQERLKSIRILERSKEPVSGAQLAEELSSRQVIVQDIAYLRSLGYNVATPRGYVLAGG"), - ("protein4", "MKTVRQERLKSIRILERSKEPVSGAQLAEELSSRQVIVQDIAYLRSLGYNVATPRGYVLA"), -] - -MODEL_MAPPING = { - "esm1b_t33_650M_UR50S": esm_module.pretrained.esm1b_t33_650M_UR50S, - "esm1v_t33_650M_UR90S_1": esm_module.pretrained.esm1v_t33_650M_UR90S_1, - "esm1v_t33_650M_UR90S_2": esm_module.pretrained.esm1v_t33_650M_UR90S_2, - "esm1v_t33_650M_UR90S_3": esm_module.pretrained.esm1v_t33_650M_UR90S_3, - "esm1v_t33_650M_UR90S_4": esm_module.pretrained.esm1v_t33_650M_UR90S_4, - "esm1v_t33_650M_UR90S_5": esm_module.pretrained.esm1v_t33_650M_UR90S_5, - "esm2_t48_15B_UR50D": esm_module.pretrained.esm2_t48_15B_UR50D, - "esm2_t36_3B_UR50D": esm_module.pretrained.esm2_t36_3B_UR50D, - "esm2_t33_650M_UR50D": esm_module.pretrained.esm2_t33_650M_UR50D, - "esm2_t30_150M_UR50D": esm_module.pretrained.esm2_t30_150M_UR50D, - "esm2_t12_35M_UR50D": esm_module.pretrained.esm2_t12_35M_UR50D, - "esm2_t6_8M_UR50D": esm_module.pretrained.esm2_t6_8M_UR50D, - "esmfold_v1": esmfold_v1, -} - -restypes = list("ARNDCQEGHILKMFPSTWYV") - -restypes_with_x = restypes + ["X"] -restypes_with_extras = restypes_with_x + ["", "", "", "", ""] - - -def get_esmfold_tokenizer(): - with TemporaryDirectory() as tempdir: - vocab = "\n".join(restypes_with_extras) - vocab_file = Path(tempdir) / "vocab.txt" - vocab_file.write_text(vocab) - hf_tokenizer = EsmTokenizer(vocab_file=str(vocab_file)) - hf_tokenizer.pad_token_id = 0 # Overlaps with 'A' but that seems to be what they want - return hf_tokenizer - - -def transfer_and_check_weights(original_module, our_module): - status = our_module.load_state_dict(original_module.state_dict()) - if status.missing_keys: - raise ValueError(f"Missing keys: {status.missing_keys}") - if status.unexpected_keys: - raise 
ValueError(f"Unexpected keys: {status.unexpected_keys}") - - -def convert_esm_checkpoint_to_pytorch( - model: str, pytorch_dump_folder_path: str, classification_head: bool, push_to_repo: str, auth_token: str -): - """ - Copy/paste/tweak esm's weights to our BERT structure. - """ - if model.startswith("esmfold"): - esm = MODEL_MAPPING[model]() - else: - esm, alphabet = MODEL_MAPPING[model]() - esm.eval() # disable dropout - - if model.startswith("esmfold"): - embed_dim = esm.esm.embed_dim - num_layers = esm.esm.num_layers - num_attention_heads = esm.esm.attention_heads - intermediate_size = 4 * embed_dim - token_dropout = esm.esm.token_dropout - emb_layer_norm_before = False # This code path does not exist in ESM-2 - position_embedding_type = "rotary" - is_folding_model = True - esmfold_config = EsmFoldConfig() - for key, val in esm.cfg.items(): - if hasattr(esmfold_config, key) and key != "trunk": - setattr(esmfold_config, key, val) - for key, val in esm.cfg.trunk.items(): - if hasattr(esmfold_config.trunk, key) and key != "structure_module": - setattr(esmfold_config.trunk, key, val) - for key, val in esm.cfg.trunk.structure_module.items(): - if hasattr(esmfold_config.trunk.structure_module, key): - setattr(esmfold_config.trunk.structure_module, key, val) - elif hasattr(esm, "args"): - # Indicates an ESM-1b or ESM-1v model - embed_dim = esm.args.embed_dim - num_layers = esm.args.layers - num_attention_heads = esm.args.attention_heads - intermediate_size = esm.args.ffn_embed_dim - token_dropout = esm.args.token_dropout - emb_layer_norm_before = True if esm.emb_layer_norm_before else False - position_embedding_type = "absolute" - is_folding_model = False - esmfold_config = None - else: - # Indicates an ESM-2 model - embed_dim = esm.embed_dim - num_layers = esm.num_layers - num_attention_heads = esm.attention_heads - intermediate_size = 4 * embed_dim # This is hardcoded in ESM-2 - token_dropout = esm.token_dropout - emb_layer_norm_before = False # This code path does not exist in ESM-2 - position_embedding_type = "rotary" - is_folding_model = False - esmfold_config = None - - if is_folding_model: - alphabet = esm.esm.alphabet - vocab_list = tuple(alphabet.all_toks) - mask_token_id = alphabet.mask_idx - pad_token_id = alphabet.padding_idx - - if is_folding_model: - original_esm_model = esm.esm - else: - original_esm_model = esm - - config = EsmConfig( - vocab_size=original_esm_model.embed_tokens.num_embeddings, - mask_token_id=mask_token_id, - hidden_size=embed_dim, - num_hidden_layers=num_layers, - num_attention_heads=num_attention_heads, - intermediate_size=intermediate_size, - max_position_embeddings=1026, - layer_norm_eps=1e-5, # PyTorch default used in fairseq - attention_probs_dropout_prob=0.0, - hidden_dropout_prob=0.0, - pad_token_id=pad_token_id, - emb_layer_norm_before=emb_layer_norm_before, - token_dropout=token_dropout, - position_embedding_type=position_embedding_type, - is_folding_model=is_folding_model, - esmfold_config=esmfold_config, - vocab_list=vocab_list, - ) - if classification_head: - config.num_labels = esm.classification_heads["mnli"].out_proj.weight.shape[0] - print("Our ESM config:", config) - - if model.startswith("esmfold"): - model_class = EsmForProteinFolding - elif classification_head: - model_class = EsmForSequenceClassification - else: - model_class = EsmForMaskedLM - model = model_class(config) - model.eval() - - # Now let's copy all the weights. 
- # Embeddings - model.esm.embeddings.word_embeddings.weight = original_esm_model.embed_tokens.weight - if position_embedding_type == "absolute": - model.esm.embeddings.position_embeddings.weight = original_esm_model.embed_positions.weight - - if config.emb_layer_norm_before: - model.esm.embeddings.layer_norm.weight = original_esm_model.emb_layer_norm_before.weight - model.esm.embeddings.layer_norm.bias = original_esm_model.emb_layer_norm_before.bias - - model.esm.encoder.emb_layer_norm_after.weight = original_esm_model.emb_layer_norm_after.weight - model.esm.encoder.emb_layer_norm_after.bias = original_esm_model.emb_layer_norm_after.bias - - for i in range(config.num_hidden_layers): - # Encoder: start of layer - layer: EsmLayer = model.esm.encoder.layer[i] - # esm_layer: TransformerSentenceEncoderLayer = original_esm_model.layers[i] - esm_layer = original_esm_model.layers[i] - - # self attention - self_attn: EsmSelfAttention = layer.attention.self - assert ( - esm_layer.self_attn.k_proj.weight.data.shape - == esm_layer.self_attn.q_proj.weight.data.shape - == esm_layer.self_attn.v_proj.weight.data.shape - == torch.Size((config.hidden_size, config.hidden_size)) - ) - - self_attn.query.weight.data = esm_layer.self_attn.q_proj.weight - self_attn.query.bias.data = esm_layer.self_attn.q_proj.bias - self_attn.key.weight.data = esm_layer.self_attn.k_proj.weight - self_attn.key.bias.data = esm_layer.self_attn.k_proj.bias - self_attn.value.weight.data = esm_layer.self_attn.v_proj.weight - self_attn.value.bias.data = esm_layer.self_attn.v_proj.bias - - if getattr(esm_layer.self_attn, "rot_emb", None) is not None: - # Matt: Although inv_freq is not a trainable weight, it is computed at model init and cached. - # During the training of ESM-2 the model was converted to float16 precision, which also converts - # the inv_freq tensor, and the loss of precision remains even if the model is loaded later as float32. - # If we recompute inv_freq without this loss of precision then we will get subtly different rotary - # embeddings, which are enough to cause significant discrepancies in model outputs. To avoid this, - # we make sure the new model copies the data from the old inv_freq. 
- self_attn.rotary_embeddings.inv_freq.data = esm_layer.self_attn.rot_emb.inv_freq - - # LayerNorm changes for pre-activation - layer.attention.LayerNorm.weight = esm_layer.self_attn_layer_norm.weight - layer.attention.LayerNorm.bias = esm_layer.self_attn_layer_norm.bias - layer.LayerNorm.weight = esm_layer.final_layer_norm.weight - layer.LayerNorm.bias = esm_layer.final_layer_norm.bias - - # self-attention output - self_output: EsmSelfOutput = layer.attention.output - assert self_output.dense.weight.shape == esm_layer.self_attn.out_proj.weight.shape - self_output.dense.weight = esm_layer.self_attn.out_proj.weight - self_output.dense.bias = esm_layer.self_attn.out_proj.bias - - # intermediate - intermediate: EsmIntermediate = layer.intermediate - assert intermediate.dense.weight.shape == esm_layer.fc1.weight.shape - intermediate.dense.weight = esm_layer.fc1.weight - intermediate.dense.bias = esm_layer.fc1.bias - - # output - bert_output: EsmOutput = layer.output - assert bert_output.dense.weight.shape == esm_layer.fc2.weight.shape - bert_output.dense.weight = esm_layer.fc2.weight - bert_output.dense.bias = esm_layer.fc2.bias - # end of layer - - if is_folding_model: - model.esm_s_combine.data = esm.esm_s_combine.data - model.af2_to_esm.data = esm.af2_to_esm.data - transfer_and_check_weights(esm.embedding, model.embedding) - transfer_and_check_weights(esm.esm_s_mlp, model.esm_s_mlp) - transfer_and_check_weights(esm.trunk, model.trunk) - transfer_and_check_weights(esm.distogram_head, model.distogram_head) - transfer_and_check_weights(esm.ptm_head, model.ptm_head) - transfer_and_check_weights(esm.lm_head, model.lm_head) - transfer_and_check_weights(esm.lddt_head, model.lddt_head) - - elif classification_head: - model.classifier.dense.weight = esm.esm.classification_heads["mnli"].dense.weight - model.classifier.dense.bias = esm.classification_heads["mnli"].dense.bias - model.classifier.out_proj.weight = esm.classification_heads["mnli"].out_proj.weight - model.classifier.out_proj.bias = esm.classification_heads["mnli"].out_proj.bias - else: - # LM Head - model.lm_head.dense.weight = esm.lm_head.dense.weight - model.lm_head.dense.bias = esm.lm_head.dense.bias - model.lm_head.layer_norm.weight = esm.lm_head.layer_norm.weight - model.lm_head.layer_norm.bias = esm.lm_head.layer_norm.bias - model.lm_head.decoder.weight = esm.lm_head.weight - model.lm_head.bias = esm.lm_head.bias - - # Contact prediction head - transfer_and_check_weights(esm.contact_head, model.esm.contact_head) - - # Prepare data (first 2 sequences from ESMStructuralSplitDataset superfamily / 4) - if is_folding_model: - # Folding models aren't trained on masked inputs and don't like mask tokens. - sample_data = SAMPLE_DATA[:2] - else: - sample_data = SAMPLE_DATA - - if is_folding_model: - hf_tokenizer = get_esmfold_tokenizer() - hf_tokens = hf_tokenizer( - [row[1] for row in sample_data], return_tensors="pt", padding=True, add_special_tokens=False - ) - esmfold_aas, esmfold_mask, _, _, _ = esmfold_encode_sequences([row[1] for row in sample_data]) - success = torch.all(hf_tokens["input_ids"] == esmfold_aas) and torch.all( - hf_tokens["attention_mask"] == esmfold_mask - ) - else: - # Let's check that we get the same results. 
- batch_converter = alphabet.get_batch_converter() - batch_labels, batch_strs, batch_tokens = batch_converter(sample_data) - # Prepare tokenizer and make sure it matches - with TemporaryDirectory() as tempdir: - vocab = "\n".join(alphabet.all_toks) - vocab_file = Path(tempdir) / "vocab.txt" - vocab_file.write_text(vocab) - hf_tokenizer = EsmTokenizer(vocab_file=str(vocab_file)) - - hf_tokens = hf_tokenizer([row[1] for row in sample_data], return_tensors="pt", padding=True) - success = torch.all(hf_tokens["input_ids"] == batch_tokens) - - print("Do both models tokenizers output the same tokens?", "đŸ”„" if success else "đŸ’©") - if not success: - raise Exception("Tokenization does not match!") - - with torch.no_grad(): - if is_folding_model: - # Let's test the model in parts - # ESMFold always converts the ESM stem to float16, which requires float16 ops - # that don't exist on CPU. Therefore, to test it we need to run it on GPU. However, - # ESMFold is what we in the community call a "big boy" and so we desperately avoid putting both the - # original and the converted model on the GPU at the same time. - their_output = esm.cuda().infer([row[1] for row in sample_data]) - our_output = model.cuda()( - input_ids=hf_tokens["input_ids"].cuda(), attention_mask=hf_tokens["attention_mask"].cuda() - ) - else: - our_output = model(**hf_tokens, output_hidden_states=True) - our_output = our_output["logits"] - if classification_head: - their_output = esm.model.classification_heads["mnli"](esm.extract_features(batch_tokens)) - else: - their_output = esm(hf_tokens["input_ids"], repr_layers=list(range(999))) - their_output = their_output["logits"] - - if is_folding_model: - max_absolute_diff = torch.max(torch.abs(our_output["positions"] - their_output["positions"])).item() - success = torch.allclose(our_output["positions"], their_output["positions"], atol=1e-5) - else: - max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - success = torch.allclose(our_output, their_output, atol=1e-5) - - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-5 - print("Do both models output the same tensors?", "đŸ”„" if success else "đŸ’©") - - if not success: - raise Exception("Something went wRoNg") - - if not is_folding_model: - # Let's check contact prediction too - our_output = model.predict_contacts(hf_tokens["input_ids"], hf_tokens["attention_mask"]) - their_output = esm.predict_contacts(hf_tokens["input_ids"]) - max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - success = torch.allclose(our_output, their_output, atol=1e-5) - - print("Contact prediction testing:") - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-5 - print("Do both models output the same tensors?", "đŸ”„" if success else "đŸ’©") - - if not success: - raise Exception("Something went wRoNg") - - pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - - del esm # Free up some memory before continuing - - print(f"Saving tokenizer to {pytorch_dump_folder_path}") - hf_tokenizer.save_pretrained(pytorch_dump_folder_path) - - if push_to_repo: - model.push_to_hub(repo_id=push_to_repo, token_token=auth_token) - hf_tokenizer.push_to_hub(repo_id=push_to_repo, token_token=auth_token) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--pytorch_dump_folder_path", type=str, required=True, help="Path to the output 
PyTorch model." - ) - parser.add_argument( - "--classification_head", action="store_true", help="Whether to convert a final classification head." - ) - parser.add_argument("--model", default=None, type=str, required=True, help="Name of model to convert.") - parser.add_argument("--push_to_repo", type=str, help="Repo to upload to (including username!).") - parser.add_argument("--auth_token", type=str, help="HuggingFace auth token.") - args = parser.parse_args() - convert_esm_checkpoint_to_pytorch( - args.model, args.pytorch_dump_folder_path, args.classification_head, args.push_to_repo, args.auth_token - ) diff --git a/src/transformers/models/falcon/convert_custom_code_checkpoint.py b/src/transformers/models/falcon/convert_custom_code_checkpoint.py deleted file mode 100644 index 0da817c3ffa7..000000000000 --- a/src/transformers/models/falcon/convert_custom_code_checkpoint.py +++ /dev/null @@ -1,74 +0,0 @@ -import json -from argparse import ArgumentParser -from pathlib import Path - - -""" -This script converts Falcon custom code checkpoints to modern Falcon checkpoints that use code in the Transformers -library. After conversion, performance (especially for generation) should improve and the checkpoint can be loaded -without needing trust_remote_code=True. -""" - -if __name__ == "__main__": - parser = ArgumentParser() - parser.add_argument( - "--checkpoint_dir", - type=Path, - required=True, - help="Directory containing a custom code checkpoint to convert to a modern Falcon checkpoint.", - ) - args = parser.parse_args() - - if not args.checkpoint_dir.is_dir(): - raise ValueError("--checkpoint_dir argument should be a directory!") - - if ( - not (args.checkpoint_dir / "configuration_RW.py").is_file() - or not (args.checkpoint_dir / "modelling_RW.py").is_file() - ): - raise ValueError( - "The model directory should contain configuration_RW.py and modelling_RW.py files! Are you sure this is a custom code checkpoint?" 
- ) - (args.checkpoint_dir / "configuration_RW.py").unlink() - (args.checkpoint_dir / "modelling_RW.py").unlink() - - config = args.checkpoint_dir / "config.json" - text = config.read_text() - text = text.replace("RWForCausalLM", "FalconForCausalLM") - text = text.replace("RefinedWebModel", "falcon") - text = text.replace("RefinedWeb", "falcon") - json_config = json.loads(text) - del json_config["auto_map"] - - if "n_head" in json_config: - json_config["num_attention_heads"] = json_config.pop("n_head") - if "n_layer" in json_config: - json_config["num_hidden_layers"] = json_config.pop("n_layer") - if "n_head_kv" in json_config: - json_config["num_kv_heads"] = json_config.pop("n_head_kv") - json_config["new_decoder_architecture"] = True - else: - json_config["new_decoder_architecture"] = False - bos_token_id = json_config.get("bos_token_id", 1) - eos_token_id = json_config.get("eos_token_id", 2) - config.unlink() - config.write_text(json.dumps(json_config, indent=2, sort_keys=True)) - - tokenizer_config = args.checkpoint_dir / "tokenizer_config.json" - if tokenizer_config.is_file(): - text = tokenizer_config.read_text() - json_config = json.loads(text) - if json_config["tokenizer_class"] == "PreTrainedTokenizerFast": - json_config["model_input_names"] = ["input_ids", "attention_mask"] - tokenizer_config.unlink() - tokenizer_config.write_text(json.dumps(json_config, indent=2, sort_keys=True)) - - generation_config_path = args.checkpoint_dir / "generation_config.json" - generation_dict = { - "_from_model_config": True, - "bos_token_id": bos_token_id, - "eos_token_id": eos_token_id, - "transformers_version": "4.33.0.dev0", - } - generation_config_path.write_text(json.dumps(generation_dict, indent=2, sort_keys=True)) - print("Done! Please double-check that the new checkpoint works as expected.") diff --git a/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index bb9c432f8229..000000000000 --- a/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,210 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
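As a rough usage sketch (the directory name is a placeholder for the --checkpoint_dir argument above), the converted Falcon checkpoint should then load through the regular Transformers classes without trust_remote_code=True:

from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint_dir = "path/to/converted_falcon_checkpoint"  # placeholder for --checkpoint_dir
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
# config.json now names FalconForCausalLM and no longer carries an auto_map,
# so no custom remote code is needed.
model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)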
-"""Convert FastSpeech2Conformer checkpoint.""" - -import argparse -import json -import re -from pathlib import Path -from tempfile import TemporaryDirectory - -import torch -import yaml - -from transformers import ( - FastSpeech2ConformerConfig, - FastSpeech2ConformerModel, - FastSpeech2ConformerTokenizer, - logging, -) - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.FastSpeech2Conformer") - -CONFIG_MAPPING = { - "adim": "hidden_size", - "aheads": "num_attention_heads", - "conformer_dec_kernel_size": "decoder_kernel_size", - "conformer_enc_kernel_size": "encoder_kernel_size", - "decoder_normalize_before": "decoder_normalize_before", - "dlayers": "decoder_layers", - "dunits": "decoder_linear_units", - "duration_predictor_chans": "duration_predictor_channels", - "duration_predictor_kernel_size": "duration_predictor_kernel_size", - "duration_predictor_layers": "duration_predictor_layers", - "elayers": "encoder_layers", - "encoder_normalize_before": "encoder_normalize_before", - "energy_embed_dropout": "energy_embed_dropout", - "energy_embed_kernel_size": "energy_embed_kernel_size", - "energy_predictor_chans": "energy_predictor_channels", - "energy_predictor_dropout": "energy_predictor_dropout", - "energy_predictor_kernel_size": "energy_predictor_kernel_size", - "energy_predictor_layers": "energy_predictor_layers", - "eunits": "encoder_linear_units", - "pitch_embed_dropout": "pitch_embed_dropout", - "pitch_embed_kernel_size": "pitch_embed_kernel_size", - "pitch_predictor_chans": "pitch_predictor_channels", - "pitch_predictor_dropout": "pitch_predictor_dropout", - "pitch_predictor_kernel_size": "pitch_predictor_kernel_size", - "pitch_predictor_layers": "pitch_predictor_layers", - "positionwise_conv_kernel_size": "positionwise_conv_kernel_size", - "postnet_chans": "speech_decoder_postnet_units", - "postnet_filts": "speech_decoder_postnet_kernel", - "postnet_layers": "speech_decoder_postnet_layers", - "reduction_factor": "reduction_factor", - "stop_gradient_from_energy_predictor": "stop_gradient_from_energy_predictor", - "stop_gradient_from_pitch_predictor": "stop_gradient_from_pitch_predictor", - "transformer_dec_attn_dropout_rate": "decoder_attention_dropout_rate", - "transformer_dec_dropout_rate": "decoder_dropout_rate", - "transformer_dec_positional_dropout_rate": "decoder_positional_dropout_rate", - "transformer_enc_attn_dropout_rate": "encoder_attention_dropout_rate", - "transformer_enc_dropout_rate": "encoder_dropout_rate", - "transformer_enc_positional_dropout_rate": "encoder_positional_dropout_rate", - "use_cnn_in_conformer": "use_cnn_in_conformer", - "use_macaron_style_in_conformer": "use_macaron_style_in_conformer", - "use_masking": "use_masking", - "use_weighted_masking": "use_weighted_masking", - "idim": "input_dim", - "odim": "num_mel_bins", - "spk_embed_dim": "speaker_embed_dim", - "langs": "num_languages", - "spks": "num_speakers", -} - - -def remap_model_yaml_config(yaml_config_path): - with Path(yaml_config_path).open("r", encoding="utf-8") as f: - args = yaml.safe_load(f) - args = argparse.Namespace(**args) - - remapped_config = {} - - model_params = args.tts_conf["text2mel_params"] - # espnet_config_key -> hf_config_key, any keys not included are ignored - for espnet_config_key, hf_config_key in CONFIG_MAPPING.items(): - if espnet_config_key in model_params: - remapped_config[hf_config_key] = model_params[espnet_config_key] - - return remapped_config, args.g2p, args.token_list - - -def convert_espnet_state_dict_to_hf(state_dict): - 
new_state_dict = {} - for key in state_dict: - if "tts.generator.text2mel." in key: - new_key = key.replace("tts.generator.text2mel.", "") - if "postnet" in key: - new_key = new_key.replace("postnet.postnet", "speech_decoder_postnet.layers") - new_key = new_key.replace(".0.weight", ".conv.weight") - new_key = new_key.replace(".1.weight", ".batch_norm.weight") - new_key = new_key.replace(".1.bias", ".batch_norm.bias") - new_key = new_key.replace(".1.running_mean", ".batch_norm.running_mean") - new_key = new_key.replace(".1.running_var", ".batch_norm.running_var") - new_key = new_key.replace(".1.num_batches_tracked", ".batch_norm.num_batches_tracked") - if "feat_out" in key: - if "weight" in key: - new_key = "speech_decoder_postnet.feat_out.weight" - if "bias" in key: - new_key = "speech_decoder_postnet.feat_out.bias" - if "encoder.embed.0.weight" in key: - new_key = new_key.replace("0.", "") - if "w_1" in key: - new_key = new_key.replace("w_1", "conv1") - if "w_2" in key: - new_key = new_key.replace("w_2", "conv2") - if "predictor.conv" in key: - new_key = new_key.replace(".conv", ".conv_layers") - pattern = r"(\d)\.(\d)" - replacement = ( - r"\1.conv" if ("2.weight" not in new_key) and ("2.bias" not in new_key) else r"\1.layer_norm" - ) - new_key = re.sub(pattern, replacement, new_key) - if "pitch_embed" in key or "energy_embed" in key: - new_key = new_key.replace("0", "conv") - if "encoders" in key: - new_key = new_key.replace("encoders", "conformer_layers") - new_key = new_key.replace("norm_final", "final_layer_norm") - new_key = new_key.replace("norm_mha", "self_attn_layer_norm") - new_key = new_key.replace("norm_ff_macaron", "ff_macaron_layer_norm") - new_key = new_key.replace("norm_ff", "ff_layer_norm") - new_key = new_key.replace("norm_conv", "conv_layer_norm") - if "lid_emb" in key: - new_key = new_key.replace("lid_emb", "language_id_embedding") - if "sid_emb" in key: - new_key = new_key.replace("sid_emb", "speaker_id_embedding") - - new_state_dict[new_key] = state_dict[key] - - return new_state_dict - - -@torch.no_grad() -def convert_FastSpeech2ConformerModel_checkpoint( - checkpoint_path, - yaml_config_path, - pytorch_dump_folder_path, - repo_id=None, -): - model_params, tokenizer_name, vocab = remap_model_yaml_config(yaml_config_path) - config = FastSpeech2ConformerConfig(**model_params) - - # Prepare the model - model = FastSpeech2ConformerModel(config) - - espnet_checkpoint = torch.load(checkpoint_path) - hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint) - - model.load_state_dict(hf_compatible_state_dict) - - model.save_pretrained(pytorch_dump_folder_path) - - # Prepare the tokenizer - with TemporaryDirectory() as tempdir: - vocab = {token: id for id, token in enumerate(vocab)} - vocab_file = Path(tempdir) / "vocab.json" - with open(vocab_file, "w") as f: - json.dump(vocab, f) - should_strip_spaces = "no_space" in tokenizer_name - tokenizer = FastSpeech2ConformerTokenizer(str(vocab_file), should_strip_spaces=should_strip_spaces) - - tokenizer.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") - model.push_to_hub(repo_id) - tokenizer.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") - parser.add_argument( - "--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert" - ) - parser.add_argument( - 
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the đŸ€— hub." - ) - - args = parser.parse_args() - convert_FastSpeech2ConformerModel_checkpoint( - args.checkpoint_path, - args.yaml_config_path, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/fastspeech2_conformer/convert_hifigan.py b/src/transformers/models/fastspeech2_conformer/convert_hifigan.py deleted file mode 100644 index ec9f57ce7142..000000000000 --- a/src/transformers/models/fastspeech2_conformer/convert_hifigan.py +++ /dev/null @@ -1,134 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert FastSpeech2Conformer HiFi-GAN checkpoint.""" - -import argparse -from pathlib import Path - -import torch -import yaml - -from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig, logging - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.FastSpeech2Conformer") - - -def load_weights(checkpoint, hf_model, config): - vocoder_key_prefix = "tts.generator.vocoder." 
- checkpoint = {k.replace(vocoder_key_prefix, ""): v for k, v in checkpoint.items() if vocoder_key_prefix in k} - - hf_model.apply_weight_norm() - - hf_model.conv_pre.weight_g.data = checkpoint["input_conv.weight_g"] - hf_model.conv_pre.weight_v.data = checkpoint["input_conv.weight_v"] - hf_model.conv_pre.bias.data = checkpoint["input_conv.bias"] - - for i in range(len(config.upsample_rates)): - hf_model.upsampler[i].weight_g.data = checkpoint[f"upsamples.{i}.1.weight_g"] - hf_model.upsampler[i].weight_v.data = checkpoint[f"upsamples.{i}.1.weight_v"] - hf_model.upsampler[i].bias.data = checkpoint[f"upsamples.{i}.1.bias"] - - for i in range(len(config.upsample_rates) * len(config.resblock_kernel_sizes)): - for j in range(len(config.resblock_dilation_sizes)): - hf_model.resblocks[i].convs1[j].weight_g.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_g"] - hf_model.resblocks[i].convs1[j].weight_v.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_v"] - hf_model.resblocks[i].convs1[j].bias.data = checkpoint[f"blocks.{i}.convs1.{j}.1.bias"] - - hf_model.resblocks[i].convs2[j].weight_g.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_g"] - hf_model.resblocks[i].convs2[j].weight_v.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_v"] - hf_model.resblocks[i].convs2[j].bias.data = checkpoint[f"blocks.{i}.convs2.{j}.1.bias"] - - hf_model.conv_post.weight_g.data = checkpoint["output_conv.1.weight_g"] - hf_model.conv_post.weight_v.data = checkpoint["output_conv.1.weight_v"] - hf_model.conv_post.bias.data = checkpoint["output_conv.1.bias"] - - hf_model.remove_weight_norm() - - -def remap_hifigan_yaml_config(yaml_config_path): - with Path(yaml_config_path).open("r", encoding="utf-8") as f: - args = yaml.safe_load(f) - args = argparse.Namespace(**args) - - vocoder_type = args.tts_conf["vocoder_type"] - if vocoder_type != "hifigan_generator": - raise TypeError(f"Vocoder config must be for `hifigan_generator`, but got {vocoder_type}") - - remapped_dict = {} - vocoder_params = args.tts_conf["vocoder_params"] - - # espnet_config_key -> hf_config_key - key_mappings = { - "channels": "upsample_initial_channel", - "in_channels": "model_in_dim", - "resblock_dilations": "resblock_dilation_sizes", - "resblock_kernel_sizes": "resblock_kernel_sizes", - "upsample_kernel_sizes": "upsample_kernel_sizes", - "upsample_scales": "upsample_rates", - } - for espnet_config_key, hf_config_key in key_mappings.items(): - remapped_dict[hf_config_key] = vocoder_params[espnet_config_key] - remapped_dict["sampling_rate"] = args.tts_conf["sampling_rate"] - remapped_dict["normalize_before"] = False - remapped_dict["leaky_relu_slope"] = vocoder_params["nonlinear_activation_params"]["negative_slope"] - - return remapped_dict - - -@torch.no_grad() -def convert_hifigan_checkpoint( - checkpoint_path, - pytorch_dump_folder_path, - yaml_config_path=None, - repo_id=None, -): - if yaml_config_path is not None: - config_kwargs = remap_hifigan_yaml_config(yaml_config_path) - config = FastSpeech2ConformerHifiGanConfig(**config_kwargs) - else: - config = FastSpeech2ConformerHifiGanConfig() - - model = FastSpeech2ConformerHifiGan(config) - - orig_checkpoint = torch.load(checkpoint_path) - load_weights(orig_checkpoint, model, config) - - model.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") - model.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") 
- parser.add_argument("--yaml_config_path", default=None, type=str, help="Path to config.yaml of model to convert") - parser.add_argument( - "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the đŸ€— hub." - ) - - args = parser.parse_args() - convert_hifigan_checkpoint( - args.checkpoint_path, - args.pytorch_dump_folder_path, - args.yaml_config_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py b/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py deleted file mode 100644 index 2a780d5cf0b8..000000000000 --- a/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py +++ /dev/null @@ -1,102 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert FastSpeech2Conformer checkpoint.""" - -import argparse - -import torch - -from transformers import ( - FastSpeech2ConformerConfig, - FastSpeech2ConformerHifiGan, - FastSpeech2ConformerHifiGanConfig, - FastSpeech2ConformerModel, - FastSpeech2ConformerWithHifiGan, - FastSpeech2ConformerWithHifiGanConfig, - logging, -) - -from .convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch import ( - convert_espnet_state_dict_to_hf, - remap_model_yaml_config, -) -from .convert_hifigan import load_weights, remap_hifigan_yaml_config - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.FastSpeech2Conformer") - - -def convert_FastSpeech2ConformerWithHifiGan_checkpoint( - checkpoint_path, - yaml_config_path, - pytorch_dump_folder_path, - repo_id=None, -): - # Prepare the model - model_params, *_ = remap_model_yaml_config(yaml_config_path) - model_config = FastSpeech2ConformerConfig(**model_params) - - model = FastSpeech2ConformerModel(model_config) - - espnet_checkpoint = torch.load(checkpoint_path) - hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint) - model.load_state_dict(hf_compatible_state_dict) - - # Prepare the vocoder - config_kwargs = remap_hifigan_yaml_config(yaml_config_path) - vocoder_config = FastSpeech2ConformerHifiGanConfig(**config_kwargs) - - vocoder = FastSpeech2ConformerHifiGan(vocoder_config) - load_weights(espnet_checkpoint, vocoder, vocoder_config) - - # Prepare the model + vocoder - config = FastSpeech2ConformerWithHifiGanConfig.from_sub_model_configs(model_config, vocoder_config) - with_hifigan_model = FastSpeech2ConformerWithHifiGan(config) - with_hifigan_model.model = model - with_hifigan_model.vocoder = vocoder - - with_hifigan_model.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") - with_hifigan_model.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", required=True, 
default=None, type=str, help="Path to original checkpoint") - parser.add_argument( - "--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert" - ) - parser.add_argument( - "--pytorch_dump_folder_path", - required=True, - default=None, - type=str, - help="Path to the output `FastSpeech2ConformerModel` PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the đŸ€— hub." - ) - - args = parser.parse_args() - - convert_FastSpeech2ConformerWithHifiGan_checkpoint( - args.checkpoint_path, - args.yaml_config_path, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/flava/convert_dalle_to_flava_codebook.py b/src/transformers/models/flava/convert_dalle_to_flava_codebook.py deleted file mode 100644 index 7b544125114c..000000000000 --- a/src/transformers/models/flava/convert_dalle_to_flava_codebook.py +++ /dev/null @@ -1,102 +0,0 @@ -# coding=utf-8 -# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -import torch - -from transformers import FlavaImageCodebook, FlavaImageCodebookConfig - - -def rreplace(s, old, new, occurrence): - li = s.rsplit(old, occurrence) - return new.join(li) - - -def count_parameters(state_dict): - # encoder.embeddings are double copied in original FLAVA - return sum(param.float().sum() if "encoder.embeddings" not in key else 0 for key, param in state_dict.items()) - - -def upgrade_state_dict(state_dict): - upgrade = {} - - group_keys = ["group_1", "group_2", "group_3", "group_4"] - for key, value in state_dict.items(): - for group_key in group_keys: - if group_key in key: - key = key.replace(f"{group_key}.", f"{group_key}.group.") - - if "res_path" in key: - key = key.replace("res_path.", "res_path.path.") - - if key.endswith(".w"): - key = rreplace(key, ".w", ".weight", 1) - if key.endswith(".b"): - key = rreplace(key, ".b", ".bias", 1) - - upgrade[key] = value.float() - - return upgrade - - -@torch.no_grad() -def convert_dalle_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None, save_checkpoint=True): - """ - Copy/paste/tweak model's weights to transformers design. 
- """ - from dall_e import Encoder - - encoder = Encoder() - if os.path.exists(checkpoint_path): - ckpt = torch.load(checkpoint_path) - else: - ckpt = torch.hub.load_state_dict_from_url(checkpoint_path) - - if isinstance(ckpt, Encoder): - ckpt = ckpt.state_dict() - encoder.load_state_dict(ckpt) - - if config_path is not None: - config = FlavaImageCodebookConfig.from_pretrained(config_path) - else: - config = FlavaImageCodebookConfig() - - hf_model = FlavaImageCodebook(config).eval() - state_dict = encoder.state_dict() - - hf_state_dict = upgrade_state_dict(state_dict) - hf_model.load_state_dict(hf_state_dict) - hf_state_dict = hf_model.state_dict() - hf_count = count_parameters(hf_state_dict) - state_dict_count = count_parameters(state_dict) - - assert torch.allclose(hf_count, state_dict_count, atol=1e-3) - - if save_checkpoint: - hf_model.save_pretrained(pytorch_dump_folder_path) - else: - return hf_state_dict - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to flava checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_dalle_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/flava/convert_flava_original_pytorch_to_hf.py b/src/transformers/models/flava/convert_flava_original_pytorch_to_hf.py deleted file mode 100644 index 95ebb2bfdb23..000000000000 --- a/src/transformers/models/flava/convert_flava_original_pytorch_to_hf.py +++ /dev/null @@ -1,99 +0,0 @@ -# coding=utf-8 -# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import os - -import torch - -from transformers import FlavaConfig, FlavaForPreTraining -from transformers.models.flava.convert_dalle_to_flava_codebook import convert_dalle_checkpoint - - -def count_parameters(state_dict): - # encoder.embeddings are double copied in original FLAVA - return sum(param.float().sum() if "encoder.embeddings" not in key else 0 for key, param in state_dict.items()) - - -def upgrade_state_dict(state_dict, codebook_state_dict): - upgrade = {} - - for key, value in state_dict.items(): - if "text_encoder.embeddings" in key or "image_encoder.embeddings" in key: - continue - - key = key.replace("heads.cmd.mim_head.cls.predictions", "mmm_image_head") - key = key.replace("heads.cmd.mlm_head.cls.predictions", "mmm_text_head") - key = key.replace("heads.cmd.itm_head.cls", "itm_head") - key = key.replace("heads.cmd.itm_head.pooler", "itm_head.pooler") - key = key.replace("heads.cmd.clip_head.logit_scale", "flava.logit_scale") - key = key.replace("heads.fairseq_mlm.cls.predictions", "mlm_head") - key = key.replace("heads.imagenet.mim_head.cls.predictions", "mim_head") - key = key.replace("mm_text_projection", "flava.text_to_mm_projection") - key = key.replace("mm_image_projection", "flava.image_to_mm_projection") - key = key.replace("image_encoder.module", "flava.image_model") - key = key.replace("text_encoder.module", "flava.text_model") - key = key.replace("mm_encoder.module.encoder.cls_token", "flava.multimodal_model.cls_token") - key = key.replace("mm_encoder.module", "flava.multimodal_model") - key = key.replace("text_projection", "flava.text_projection") - key = key.replace("image_projection", "flava.image_projection") - - upgrade[key] = value.float() - - for key, value in codebook_state_dict.items(): - upgrade[f"image_codebook.{key}"] = value - - return upgrade - - -@torch.no_grad() -def convert_flava_checkpoint(checkpoint_path, codebook_path, pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. 
- """ - if config_path is not None: - config = FlavaConfig.from_pretrained(config_path) - else: - config = FlavaConfig() - - hf_model = FlavaForPreTraining(config).eval() - - codebook_state_dict = convert_dalle_checkpoint(codebook_path, None, save_checkpoint=False) - - if os.path.exists(checkpoint_path): - state_dict = torch.load(checkpoint_path, map_location="cpu") - else: - state_dict = torch.hub.load_state_dict_from_url(checkpoint_path, map_location="cpu") - - hf_state_dict = upgrade_state_dict(state_dict, codebook_state_dict) - hf_model.load_state_dict(hf_state_dict) - hf_state_dict = hf_model.state_dict() - hf_count = count_parameters(hf_state_dict) - state_dict_count = count_parameters(state_dict) + count_parameters(codebook_state_dict) - - assert torch.allclose(hf_count, state_dict_count, atol=1e-3) - - hf_model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to flava checkpoint") - parser.add_argument("--codebook_path", default=None, type=str, help="Path to flava codebook checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_flava_checkpoint(args.checkpoint_path, args.codebook_path, args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py b/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py deleted file mode 100644 index 71660354db14..000000000000 --- a/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert FNet checkpoint.""" - -import argparse - -import torch -from flax.training.checkpoints import restore_checkpoint - -from transformers import FNetConfig, FNetForPreTraining -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_flax_checkpoint_to_pytorch(flax_checkpoint_path, fnet_config_file, save_path): - # Initialise PyTorch model - config = FNetConfig.from_json_file(fnet_config_file) - print(f"Building PyTorch model from configuration: {config}") - fnet_pretraining_model = FNetForPreTraining(config) - - checkpoint_dict = restore_checkpoint(flax_checkpoint_path, None) - pretrained_model_params = checkpoint_dict["target"] - - # Embeddings - # Position IDs - state_dict = fnet_pretraining_model.state_dict() - - position_ids = state_dict["fnet.embeddings.position_ids"] - new_state_dict = {"fnet.embeddings.position_ids": position_ids} - # Embedding Layers - new_state_dict["fnet.embeddings.word_embeddings.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["word"]["embedding"] - ) - new_state_dict["fnet.embeddings.position_embeddings.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["position"]["embedding"][0] - ) - new_state_dict["fnet.embeddings.token_type_embeddings.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["type"]["embedding"] - ) - new_state_dict["fnet.embeddings.projection.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["hidden_mapping_in"]["kernel"] - ).T - new_state_dict["fnet.embeddings.projection.bias"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["hidden_mapping_in"]["bias"] - ) - new_state_dict["fnet.embeddings.LayerNorm.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["layer_norm"]["scale"] - ) - new_state_dict["fnet.embeddings.LayerNorm.bias"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["layer_norm"]["bias"] - ) - - # Encoder Layers - for layer in range(config.num_hidden_layers): - new_state_dict[f"fnet.encoder.layer.{layer}.fourier.output.LayerNorm.weight"] = torch.tensor( - pretrained_model_params["encoder"][f"encoder_{layer}"]["mixing_layer_norm"]["scale"] - ) - new_state_dict[f"fnet.encoder.layer.{layer}.fourier.output.LayerNorm.bias"] = torch.tensor( - pretrained_model_params["encoder"][f"encoder_{layer}"]["mixing_layer_norm"]["bias"] - ) - - new_state_dict[f"fnet.encoder.layer.{layer}.intermediate.dense.weight"] = torch.tensor( - pretrained_model_params["encoder"][f"feed_forward_{layer}"]["intermediate"]["kernel"] - ).T - new_state_dict[f"fnet.encoder.layer.{layer}.intermediate.dense.bias"] = torch.tensor( - pretrained_model_params["encoder"][f"feed_forward_{layer}"]["intermediate"]["bias"] - ) - - new_state_dict[f"fnet.encoder.layer.{layer}.output.dense.weight"] = torch.tensor( - pretrained_model_params["encoder"][f"feed_forward_{layer}"]["output"]["kernel"] - ).T - new_state_dict[f"fnet.encoder.layer.{layer}.output.dense.bias"] = torch.tensor( - pretrained_model_params["encoder"][f"feed_forward_{layer}"]["output"]["bias"] - ) - - new_state_dict[f"fnet.encoder.layer.{layer}.output.LayerNorm.weight"] = torch.tensor( - pretrained_model_params["encoder"][f"encoder_{layer}"]["output_layer_norm"]["scale"] - ) - new_state_dict[f"fnet.encoder.layer.{layer}.output.LayerNorm.bias"] = torch.tensor( - pretrained_model_params["encoder"][f"encoder_{layer}"]["output_layer_norm"]["bias"] - ) - - # Pooler Layers - new_state_dict["fnet.pooler.dense.weight"] = 
torch.tensor(pretrained_model_params["encoder"]["pooler"]["kernel"]).T - new_state_dict["fnet.pooler.dense.bias"] = torch.tensor(pretrained_model_params["encoder"]["pooler"]["bias"]) - - # Masked LM Layers - new_state_dict["cls.predictions.transform.dense.weight"] = torch.tensor( - pretrained_model_params["predictions_dense"]["kernel"] - ).T - new_state_dict["cls.predictions.transform.dense.bias"] = torch.tensor( - pretrained_model_params["predictions_dense"]["bias"] - ) - new_state_dict["cls.predictions.transform.LayerNorm.weight"] = torch.tensor( - pretrained_model_params["predictions_layer_norm"]["scale"] - ) - new_state_dict["cls.predictions.transform.LayerNorm.bias"] = torch.tensor( - pretrained_model_params["predictions_layer_norm"]["bias"] - ) - new_state_dict["cls.predictions.decoder.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["word"]["embedding"] - ) - new_state_dict["cls.predictions.decoder.bias"] = torch.tensor( - pretrained_model_params["predictions_output"]["output_bias"] - ) - new_state_dict["cls.predictions.bias"] = torch.tensor(pretrained_model_params["predictions_output"]["output_bias"]) - - # Seq Relationship Layers - new_state_dict["cls.seq_relationship.weight"] = torch.tensor( - pretrained_model_params["classification"]["output_kernel"] - ) - new_state_dict["cls.seq_relationship.bias"] = torch.tensor( - pretrained_model_params["classification"]["output_bias"] - ) - - # Load State Dict - fnet_pretraining_model.load_state_dict(new_state_dict) - - # Save PreTrained - print(f"Saving pretrained model to {save_path}") - fnet_pretraining_model.save_pretrained(save_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--flax_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--fnet_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained FNet model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument("--save_path", default=None, type=str, required=True, help="Path to the output model.") - args = parser.parse_args() - convert_flax_checkpoint_to_pytorch(args.flax_checkpoint_path, args.fnet_config_file, args.save_path) diff --git a/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py b/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py deleted file mode 100644 index 4aed15928062..000000000000 --- a/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py +++ /dev/null @@ -1,237 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert FocalNet checkpoints from the original repository. 
URL: https://github.com/microsoft/FocalNet/tree/main""" - -import argparse -import json - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import BitImageProcessor, FocalNetConfig, FocalNetForImageClassification -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling - - -def get_focalnet_config(model_name): - depths = [2, 2, 6, 2] if "tiny" in model_name else [2, 2, 18, 2] - use_conv_embed = True if "large" in model_name or "huge" in model_name else False - use_post_layernorm = True if "large" in model_name or "huge" in model_name else False - use_layerscale = True if "large" in model_name or "huge" in model_name else False - - if "large" in model_name or "xlarge" in model_name or "huge" in model_name: - if "fl3" in model_name: - focal_levels = [3, 3, 3, 3] - focal_windows = [5, 5, 5, 5] - elif "fl4" in model_name: - focal_levels = [4, 4, 4, 4] - focal_windows = [3, 3, 3, 3] - - if "tiny" in model_name or "small" in model_name or "base" in model_name: - focal_windows = [3, 3, 3, 3] - if "lrf" in model_name: - focal_levels = [3, 3, 3, 3] - else: - focal_levels = [2, 2, 2, 2] - - if "tiny" in model_name: - embed_dim = 96 - elif "small" in model_name: - embed_dim = 96 - elif "base" in model_name: - embed_dim = 128 - elif "large" in model_name: - embed_dim = 192 - elif "xlarge" in model_name: - embed_dim = 256 - elif "huge" in model_name: - embed_dim = 352 - - # set label information - repo_id = "huggingface/label-files" - if "large" in model_name or "huge" in model_name: - filename = "imagenet-22k-id2label.json" - else: - filename = "imagenet-1k-id2label.json" - - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - label2id = {v: k for k, v in id2label.items()} - - config = FocalNetConfig( - embed_dim=embed_dim, - depths=depths, - focal_levels=focal_levels, - focal_windows=focal_windows, - use_conv_embed=use_conv_embed, - id2label=id2label, - label2id=label2id, - use_post_layernorm=use_post_layernorm, - use_layerscale=use_layerscale, - ) - - return config - - -def rename_key(name): - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") - if "patch_embed.norm" in name: - name = name.replace("patch_embed.norm", "embeddings.norm") - if "layers" in name: - name = "encoder." + name - if "encoder.layers" in name: - name = name.replace("encoder.layers", "encoder.stages") - if "downsample.proj" in name: - name = name.replace("downsample.proj", "downsample.projection") - if "blocks" in name: - name = name.replace("blocks", "layers") - if "modulation.f.weight" in name or "modulation.f.bias" in name: - name = name.replace("modulation.f", "modulation.projection_in") - if "modulation.h.weight" in name or "modulation.h.bias" in name: - name = name.replace("modulation.h", "modulation.projection_context") - if "modulation.proj.weight" in name or "modulation.proj.bias" in name: - name = name.replace("modulation.proj", "modulation.projection_out") - - if name == "norm.weight": - name = "layernorm.weight" - if name == "norm.bias": - name = "layernorm.bias" - - if "head" in name: - name = name.replace("head", "classifier") - else: - name = "focalnet." 
+ name - - return name - - -def convert_focalnet_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - # fmt: off - model_name_to_url = { - "focalnet-tiny": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_tiny_srf.pth", - "focalnet-tiny-lrf": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_tiny_lrf.pth", - "focalnet-small": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_small_srf.pth", - "focalnet-small-lrf": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_small_lrf.pth", - "focalnet-base": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_base_srf.pth", - "focalnet-base-lrf": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_base_lrf.pth", - "focalnet-large-lrf-fl3": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_large_lrf_384.pth", - "focalnet-large-lrf-fl4": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_large_lrf_384_fl4.pth", - "focalnet-xlarge-lrf-fl3": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_xlarge_lrf_384.pth", - "focalnet-xlarge-lrf-fl4": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_xlarge_lrf_384_fl4.pth", - } - # fmt: on - - checkpoint_url = model_name_to_url[model_name] - print("Checkpoint URL: ", checkpoint_url) - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] - - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - - config = get_focalnet_config(model_name) - model = FocalNetForImageClassification(config) - model.eval() - - # load state dict - model.load_state_dict(state_dict) - - # verify conversion - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - - processor = BitImageProcessor( - do_resize=True, - size={"shortest_edge": 256}, - resample=PILImageResampling.BILINEAR, - do_center_crop=True, - crop_size=224, - do_normalize=True, - image_mean=IMAGENET_DEFAULT_MEAN, - image_std=IMAGENET_DEFAULT_STD, - ) - image = Image.open(requests.get(url, stream=True).raw) - inputs = processor(images=image, return_tensors="pt") - - image_transforms = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ] - ) - - original_pixel_values = image_transforms(image).unsqueeze(0) - - # verify pixel_values - assert torch.allclose(inputs.pixel_values, original_pixel_values, atol=1e-4) - - outputs = model(**inputs) - - predicted_class_idx = outputs.logits.argmax(-1).item() - print("Predicted class:", model.config.id2label[predicted_class_idx]) - - print("First values of logits:", outputs.logits[0, :3]) - - if model_name == "focalnet-tiny": - expected_slice = torch.tensor([0.2166, -0.4368, 0.2191]) - elif model_name == "focalnet-tiny-lrf": - expected_slice = torch.tensor([1.1669, 0.0125, -0.1695]) - elif model_name == "focalnet-small": - expected_slice = torch.tensor([0.4917, -0.0430, 0.1341]) - elif model_name == "focalnet-small-lrf": - expected_slice = torch.tensor([-0.2588, -0.5342, -0.2331]) - elif model_name == "focalnet-base": - expected_slice = torch.tensor([-0.1655, -0.4090, -0.1730]) - elif model_name == "focalnet-base-lrf": - expected_slice = 
torch.tensor([0.5306, -0.0483, -0.3928]) - assert torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor of {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and processor of {model_name} to the hub...") - model.push_to_hub(f"{model_name}") - processor.push_to_hub(f"{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="focalnet-tiny", - type=str, - help="Name of the FocalNet model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model and processor to the hub.", - ) - - args = parser.parse_args() - convert_focalnet_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100755 index ef2764f0ed10..000000000000 --- a/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,280 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Note: if you intend to run this script make sure you look under scripts/fsmt/ -# to locate the appropriate script to do the work correctly. 
There is a set of scripts to: -# - download and prepare data and run the conversion script -# - perform eval to get the best hparam into the config -# - generate model_cards - useful if you have multiple models from the same paper - -import argparse -import json -import os -import re -from collections import OrderedDict -from os.path import basename, dirname - -import fairseq -import torch -from fairseq import hub_utils -from fairseq.data.dictionary import Dictionary - -from transformers import FSMTConfig, FSMTForConditionalGeneration -from transformers.models.fsmt.tokenization_fsmt import VOCAB_FILES_NAMES -from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE -from transformers.utils import WEIGHTS_NAME, logging - - -logging.set_verbosity_warning() - -json_indent = 2 - -# based on the results of a search on a range of `num_beams`, `length_penalty` and `early_stopping` -# values against wmt19 test data to obtain the best BLEU scores, we will use the following defaults: -# -# * `num_beams`: 5 (higher scores better, but requires more memory/is slower, can be adjusted by users) -# * `early_stopping`: `False` consistently scored better -# * `length_penalty` varied, so will assign the best one depending on the model -best_score_hparams = { - # fairseq: - "wmt19-ru-en": {"length_penalty": 1.1}, - "wmt19-en-ru": {"length_penalty": 1.15}, - "wmt19-en-de": {"length_penalty": 1.0}, - "wmt19-de-en": {"length_penalty": 1.1}, - # allenai: - "wmt16-en-de-dist-12-1": {"length_penalty": 0.6}, - "wmt16-en-de-dist-6-1": {"length_penalty": 0.6}, - "wmt16-en-de-12-1": {"length_penalty": 0.8}, - "wmt19-de-en-6-6-base": {"length_penalty": 0.6}, - "wmt19-de-en-6-6-big": {"length_penalty": 0.6}, -} - -# this remaps the different models to their organization names -org_names = {} -for m in ["wmt19-ru-en", "wmt19-en-ru", "wmt19-en-de", "wmt19-de-en"]: - org_names[m] = "facebook" -for m in [ - "wmt16-en-de-dist-12-1", - "wmt16-en-de-dist-6-1", - "wmt16-en-de-12-1", - "wmt19-de-en-6-6-base", - "wmt19-de-en-6-6-big", -]: - org_names[m] = "allenai" - - -def rewrite_dict_keys(d): - # (1) remove word breaking symbol, (2) add word ending symbol where the word is not broken up, - # e.g.: d = {'le@@': 5, 'tt@@': 6, 'er': 7} => {'le': 5, 'tt': 6, 'er': 7} - d2 = dict((re.sub(r"@@$", "", k), v) if k.endswith("@@") else (re.sub(r"$", "", k), v) for k, v in d.items()) - keep_keys = " ".split() - # restore the special tokens - for k in keep_keys: - del d2[f"{k}"] - d2[k] = d[k] # restore - return d2 - - -def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder_path): - # prep - assert os.path.exists(fsmt_checkpoint_path) - os.makedirs(pytorch_dump_folder_path, exist_ok=True) - print(f"Writing results to {pytorch_dump_folder_path}") - - # handle various types of models - - checkpoint_file = basename(fsmt_checkpoint_path) - fsmt_folder_path = dirname(fsmt_checkpoint_path) - - cls = fairseq.model_parallel.models.transformer.ModelParallelTransformerModel - models = cls.hub_models() - kwargs = {"bpe": "fastbpe", "tokenizer": "moses"} - data_name_or_path = "." - # note: since the model dump is old, fairseq has upgraded its model some - # time later, and it does a whole lot of rewrites and splits on the saved - # weights, therefore we can't use torch.load() directly on the model file. 
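The `rewrite_dict_keys` helper above converts a fairseq/fastBPE dictionary into the vocabulary format the FSMT tokenizer expects: continuation pieces lose their `@@` marker, complete words gain a word-ending marker, and special tokens are carried over unchanged. A compact illustration of that rewrite; the `</w>` marker and the special-token list follow the usual fastBPE convention and are assumptions here, not text taken from this file:

```py
def rewrite_bpe_vocab(vocab, special_tokens=("<s>", "<pad>", "</s>", "<unk>")):
    # e.g. {"le@@": 5, "tt@@": 6, "er": 7} -> {"le": 5, "tt": 6, "er</w>": 7}
    rewritten = {}
    for token, index in vocab.items():
        if token in special_tokens:
            rewritten[token] = index           # keep special tokens as-is
        elif token.endswith("@@"):
            rewritten[token[:-2]] = index      # drop the word-continuation marker
        else:
            rewritten[token + "</w>"] = index  # mark a complete word
    return rewritten
```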
- # see: upgrade_state_dict(state_dict) in fairseq_model.py - print(f"using checkpoint {checkpoint_file}") - chkpt = hub_utils.from_pretrained( - fsmt_folder_path, checkpoint_file, data_name_or_path, archive_map=models, **kwargs - ) - - args = vars(chkpt["args"]["model"]) - - src_lang = args["source_lang"] - tgt_lang = args["target_lang"] - - data_root = dirname(pytorch_dump_folder_path) - model_dir = basename(pytorch_dump_folder_path) - - # dicts - src_dict_file = os.path.join(fsmt_folder_path, f"dict.{src_lang}.txt") - tgt_dict_file = os.path.join(fsmt_folder_path, f"dict.{tgt_lang}.txt") - - src_dict = Dictionary.load(src_dict_file) - src_vocab = rewrite_dict_keys(src_dict.indices) - src_vocab_size = len(src_vocab) - src_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-src.json") - print(f"Generating {src_vocab_file} of {src_vocab_size} of {src_lang} records") - with open(src_vocab_file, "w", encoding="utf-8") as f: - f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent)) - - # detect whether this is a do_lower_case situation, which can be derived by checking whether we - # have at least one uppercase letter in the source vocab - do_lower_case = True - for k in src_vocab.keys(): - if not k.islower(): - do_lower_case = False - break - - tgt_dict = Dictionary.load(tgt_dict_file) - tgt_vocab = rewrite_dict_keys(tgt_dict.indices) - tgt_vocab_size = len(tgt_vocab) - tgt_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-tgt.json") - print(f"Generating {tgt_vocab_file} of {tgt_vocab_size} of {tgt_lang} records") - with open(tgt_vocab_file, "w", encoding="utf-8") as f: - f.write(json.dumps(tgt_vocab, ensure_ascii=False, indent=json_indent)) - - # merges_file (bpecodes) - merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"]) - for fn in ["bpecodes", "code"]: # older fairseq called the merges file "code" - fsmt_merges_file = os.path.join(fsmt_folder_path, fn) - if os.path.exists(fsmt_merges_file): - break - with open(fsmt_merges_file, encoding="utf-8") as fin: - merges = fin.read() - merges = re.sub(r" \d+$", "", merges, 0, re.M) # remove frequency number - print(f"Generating {merges_file}") - with open(merges_file, "w", encoding="utf-8") as fout: - fout.write(merges) - - # model config - fsmt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json") - - # validate bpe/tokenizer config, as currently it's hardcoded to moses+fastbpe - - # may have to modify the tokenizer if a different type is used by a future model - assert args["bpe"] == "fastbpe", f"need to extend tokenizer to support bpe={args['bpe']}" - assert args["tokenizer"] == "moses", f"need to extend tokenizer to support bpe={args['tokenizer']}" - - model_conf = { - "architectures": ["FSMTForConditionalGeneration"], - "model_type": "fsmt", - "activation_dropout": args["activation_dropout"], - "activation_function": "relu", - "attention_dropout": args["attention_dropout"], - "d_model": args["decoder_embed_dim"], - "dropout": args["dropout"], - "init_std": 0.02, - "max_position_embeddings": args["max_source_positions"], - "num_hidden_layers": args["encoder_layers"], - "src_vocab_size": src_vocab_size, - "tgt_vocab_size": tgt_vocab_size, - "langs": [src_lang, tgt_lang], - "encoder_attention_heads": args["encoder_attention_heads"], - "encoder_ffn_dim": args["encoder_ffn_embed_dim"], - "encoder_layerdrop": args["encoder_layerdrop"], - "encoder_layers": args["encoder_layers"], - "decoder_attention_heads": args["decoder_attention_heads"], - "decoder_ffn_dim": 
args["decoder_ffn_embed_dim"], - "decoder_layerdrop": args["decoder_layerdrop"], - "decoder_layers": args["decoder_layers"], - "bos_token_id": 0, - "pad_token_id": 1, - "eos_token_id": 2, - "is_encoder_decoder": True, - "scale_embedding": not args["no_scale_embedding"], - "tie_word_embeddings": args["share_all_embeddings"], - } - - # good hparam defaults to start with - model_conf["num_beams"] = 5 - model_conf["early_stopping"] = False - if model_dir in best_score_hparams and "length_penalty" in best_score_hparams[model_dir]: - model_conf["length_penalty"] = best_score_hparams[model_dir]["length_penalty"] - else: - model_conf["length_penalty"] = 1.0 - - print(f"Generating {fsmt_model_config_file}") - with open(fsmt_model_config_file, "w", encoding="utf-8") as f: - f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent)) - - # tokenizer config - fsmt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE) - - tokenizer_conf = { - "langs": [src_lang, tgt_lang], - "model_max_length": 1024, - "do_lower_case": do_lower_case, - } - - print(f"Generating {fsmt_tokenizer_config_file}") - with open(fsmt_tokenizer_config_file, "w", encoding="utf-8") as f: - f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent)) - - # model - model = chkpt["models"][0] - model_state_dict = model.state_dict() - - # rename keys to start with 'model.' - model_state_dict = OrderedDict(("model." + k, v) for k, v in model_state_dict.items()) - - # remove unneeded keys - ignore_keys = [ - "model.model", - "model.encoder.version", - "model.decoder.version", - "model.encoder_embed_tokens.weight", - "model.decoder_embed_tokens.weight", - "model.encoder.embed_positions._float_tensor", - "model.decoder.embed_positions._float_tensor", - ] - for k in ignore_keys: - model_state_dict.pop(k, None) - - config = FSMTConfig.from_pretrained(pytorch_dump_folder_path) - model_new = FSMTForConditionalGeneration(config) - - # check that it loads ok - model_new.load_state_dict(model_state_dict, strict=False) - - # save - pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) - print(f"Generating {pytorch_weights_dump_path}") - torch.save(model_state_dict, pytorch_weights_dump_path) - - print("Conversion is done!") - print("\nLast step is to upload the files to s3") - print(f"cd {data_root}") - print(f"transformers-cli upload {model_dir}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--fsmt_checkpoint_path", - default=None, - type=str, - required=True, - help=( - "Path to the official PyTorch checkpoint file which is expected to reside in the dump dir with dicts," - " bpecodes, etc." - ), - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_fsmt_checkpoint_to_pytorch(args.fsmt_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index 37f71c0d233e..000000000000 --- a/src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,67 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Funnel checkpoint.""" - -import argparse - -import torch - -from transformers import FunnelBaseModel, FunnelConfig, FunnelModel, load_tf_weights_in_funnel -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, base_model): - # Initialise PyTorch model - config = FunnelConfig.from_json_file(config_file) - print(f"Building PyTorch model from configuration: {config}") - model = FunnelBaseModel(config) if base_model else FunnelModel(config) - - # Load weights from tf checkpoint - load_tf_weights_in_funnel(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help="The config json file corresponding to the pre-trained model. \nThis specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--base_model", action="store_true", help="Whether you want just the base model (no decoder) or not." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch( - args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.base_model - ) - - -__all__ = [] diff --git a/src/transformers/models/fuyu/convert_fuyu_model_weights_to_hf.py b/src/transformers/models/fuyu/convert_fuyu_model_weights_to_hf.py deleted file mode 100644 index 6d029c0d13ab..000000000000 --- a/src/transformers/models/fuyu/convert_fuyu_model_weights_to_hf.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -import sys -import warnings - -import flatdict -import torch - -from transformers import FuyuConfig, FuyuForCausalLM, LlamaTokenizer - - -try: - from transformers import LlamaTokenizerFast - - tokenizer_class = LlamaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. 
To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - tokenizer_class = LlamaTokenizer - -""" -Sample usage: # TODO fix clone links from persimmon to fuyu -``` -git clone https://github.com/adept-ai-labs/adept-inference -wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_base_model_release.tar -wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_chat_model_release.tar -python src/transformers/models/fuyu/convert_fuyu_weights_to_hf.py --input_dir /path/to/downloaded/fuyu/weights/ --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import FuyuForCausalLM, FuyuTokenizer - -model = FuyuForCausalLM.from_pretrained("/output/path") -tokenizer = FuyuTokenizer.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). -""" - - -KEYS_TO_MODIFY_MAPPING = { - "self_attention": "self_attn", - "language_model.encoder": "language_model.model", - "word_embeddings_for_head": "language_model.lm_head", - "language_model.embedding.word_embeddings": "language_model.model.embed_tokens", - "vit_encoder.linear_encoder": "vision_embed_tokens", -} - -KEYS_TO_REMOVE = { - "rotary_emb.inv_freq", - "image_patch_projection", - "image_patch_projection.weight", - "image_patch_projection.bias", -} - - -def rename_state_dict(state_dict): - model_state_dict = {} - for key, value in state_dict.items(): - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - # if KEYS_TO_REMOVE in key: - if key in KEYS_TO_REMOVE: - continue - model_state_dict[key] = value - return model_state_dict - - -def convert_fuyu_checkpoint(pytorch_dump_folder_path, ada_lib_path, pt_model_path, safe_serialization=False): - sys.path.insert(0, ada_lib_path) - model_state_dict_base = torch.load(pt_model_path, map_location="cpu") - state_dict = flatdict.FlatDict(model_state_dict_base["model"], ".") - state_dict = rename_state_dict(state_dict) - - transformers_config = FuyuConfig() - model = FuyuForCausalLM(transformers_config).to(torch.bfloat16) - model.load_state_dict(state_dict) - model.save_pretrained(pytorch_dump_folder_path, safe_serialization=safe_serialization) - transformers_config.save_pretrained(pytorch_dump_folder_path) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - help="Location of Fuyu weights, which contains tokenizer.model and model folders", - ) - parser.add_argument( - "--pt_model_path", - help="Location of Fuyu `model_optim_rng.pt`", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--ada_lib_path", - help="Location of original source code from adept to deserialize .pt checkpoint", - ) - parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.") - args = parser.parse_args() - spm_path = os.path.join(args.input_dir, "adept_vocab.model") - - convert_fuyu_checkpoint( - pytorch_dump_folder_path=args.output_dir, - pt_model_path=args.pt_model_path, - safe_serialization=args.safe_serialization, - ada_lib_path=args.ada_lib_path, - ) - tokenizer = tokenizer_class(spm_path, 
bos_token="|ENDOFTEXT|", eos_token="|ENDOFTEXT|") - tokenizer.save_pretrained(args.output_dir) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/gemma/convert_gemma_weights_to_hf.py b/src/transformers/models/gemma/convert_gemma_weights_to_hf.py deleted file mode 100644 index 9b71be35bfa1..000000000000 --- a/src/transformers/models/gemma/convert_gemma_weights_to_hf.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -import warnings - -import torch -from accelerate import init_empty_weights - -from transformers import GemmaConfig, GemmaForCausalLM, GemmaTokenizer - - -try: - from transformers import GemmaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - GemmaTokenizerFast = None - -""" -Sample usage: - -``` -python src/transformers/models/gemma/convert_gemma_weights_to_hf.py \ - --input_dir /path/to/downloaded/gemma/weights --model_size 7B --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import GemmaForCausalLM, GemmaTokenizerFast - -model = GemmaForCausalLM.from_pretrained("/output/path") -tokenizer = GemmaTokenizerFast.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). -""" - -gemma_2b_config = GemmaConfig( - num_hidden_layers=18, - num_attention_heads=8, - num_key_value_heads=1, - hidden_size=2048, - intermediate_size=16384, -) - -gemma_7b_config = GemmaConfig() - -CONFIG_MAPPING = {"2B": gemma_2b_config, "7B": gemma_7b_config} -LAYER_NAME_MAPPING = {"embedder.weight": "model.embed_tokens.weight"} - - -def write_model(save_path, input_base_path, config, safe_serialization=True, push_to_hub=False, dtype=torch.float32): - num_attn_heads = config.num_attention_heads - hidden_size = config.hidden_size - num_kv_heads = config.num_key_value_heads - head_dim = config.head_dim - - print(f"Fetching all parameters from the checkpoint at '{input_base_path}'") - model_state_dict = torch.load(input_base_path, map_location="cpu")["model_state_dict"] - model_state_dict.pop("freqs_cis") - - state_dict = {} - for k, v in model_state_dict.items(): - if "qkv_proj" in k: - if num_kv_heads == 1: - v = v.reshape(num_attn_heads + num_kv_heads * 2, head_dim, hidden_size) - q_proj = v[:num_attn_heads, ...] 
- k_proj = v[num_attn_heads : num_attn_heads + num_kv_heads, ...].repeat(num_kv_heads, 1, 1) - v_proj = v[-num_kv_heads:, ...].repeat(num_kv_heads, 1, 1) - - state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( - num_attn_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "v_proj")] = v_proj[0].clone() - else: - q_proj, k_proj, v_proj = torch.split(v, v.shape[0] // 3, 0) - state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( - num_attn_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "v_proj")] = v_proj.clone() - - elif k == "embedder.weight": - state_dict[LAYER_NAME_MAPPING[k]] = v - state_dict["lm_head.weight"] = v - else: - state_dict[k] = v - - torch.set_default_dtype(dtype) - - print("Loading the checkpoint in a Gemma model.") - with init_empty_weights(): - model = GemmaForCausalLM(config) - model.load_state_dict(state_dict, assign=True, strict=False) - - model.config.torch_dtype = torch.float32 - del model.config._name_or_path - print("Saving in the Transformers format.") - - if push_to_hub: - print(f"pushing the model to {save_path}") - model.push_to_hub(save_path, safe_serialization=safe_serialization, private=True) - else: - model.save_pretrained(save_path, safe_serialization=safe_serialization) - - -def write_tokenizer(input_tokenizer_path, save_path, push_to_hub=False): - # Initialize the tokenizer based on the `spm` model - tokenizer_class = GemmaTokenizer if GemmaTokenizerFast is None else GemmaTokenizerFast - print(f"Saving a {tokenizer_class.__name__} to {save_path}.") - tokenizer = tokenizer_class(input_tokenizer_path) - if push_to_hub: - tokenizer.push_to_hub(save_path) - else: - tokenizer.save_pretrained(save_path) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_checkpoint", - help="Absolute path to the target Gemma weights.", - required=True, - ) - parser.add_argument( - "--tokenizer_checkpoint", - help="Location of Gemma tokenizer model", - ) - parser.add_argument( - "--model_size", - default="7B", - choices=["2B", "7B", "tokenizer_only"], - help="'f' models correspond to the finetuned versions, and are specific to the Gemma2 official release. 
For more details on Gemma2, checkout the original repo: https://huggingface.co/google/gemma-7b", - ) - parser.add_argument( - "--output_dir", - default="google/gemma-7b", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--pickle_serialization", - help="Whether or not to save using `safetensors`.", - action="store_true", - default=False, - ) - parser.add_argument( - "--convert_tokenizer", - help="Whether or not to convert the tokenizer as well.", - action="store_true", - default=False, - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally.", - action="store_true", - default=False, - ) - parser.add_argument( - "--dtype", - default="float32", - help="Target dtype of the converted model", - ) - args = parser.parse_args() - - if args.convert_tokenizer: - if args.tokenizer_checkpoint is None: - raise ValueError("Path to the tokenizer is required when passing --convert_tokenizer") - - spm_path = os.path.join(args.tokenizer_checkpoint) - write_tokenizer(spm_path, args.output_dir, args.push_to_hub) - - config = CONFIG_MAPPING[args.model_size] - dtype = getattr(torch, args.dtype) - write_model( - config=config, - input_base_path=args.input_checkpoint, - save_path=args.output_dir, - safe_serialization=not args.pickle_serialization, - push_to_hub=args.push_to_hub, - dtype=dtype, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py b/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py deleted file mode 100644 index 1ad7d23c3c3e..000000000000 --- a/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -import warnings - -import torch -from accelerate import init_empty_weights - -from transformers import Gemma2Config, Gemma2ForCausalLM, GemmaTokenizer - - -try: - from transformers import GemmaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - GemmaTokenizerFast = None - -""" -Sample usage: - -``` -python src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py \ - --input_dir /path/to/downloaded/gemma/weights --model_size 9B --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import Gemma2ForCausalLM, GemmaTokenizerFast - -model = Gemma2ForCausalLM.from_pretrained("/output/path") -tokenizer = GemmaTokenizerFast.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). 
-""" - -gemma_9b_config = Gemma2Config( - num_hidden_layers=42, - num_attention_heads=16, - num_key_value_heads=8, - hidden_size=3584, - intermediate_size=14336, - final_logit_softcapping=30.0, - attn_logit_softcapping=50.0, - head_dim=256, - sliding_window=4096, - query_pre_attn_scalar=224, -) - -gemma_27b_config = Gemma2Config( - num_hidden_layers=46, - num_attention_heads=32, - num_key_value_heads=16, - hidden_size=4608, - intermediate_size=36864, - final_logit_softcapping=30.0, - attn_logit_softcapping=50.0, - head_dim=128, - sliding_window=4096, - query_pre_attn_scalar=144, -) - -CONFIG_MAPPING = {"9B": gemma_9b_config, "27B": gemma_27b_config} -LAYER_NAME_MAPPING = {"embedder.weight": "model.embed_tokens.weight"} - - -def write_model(save_path, input_base_path, config, safe_serialization=True, push_to_hub=False, dtype=torch.float32): - num_attn_heads = config.num_attention_heads - hidden_size = config.hidden_size - num_kv_heads = config.num_key_value_heads - head_dim = config.head_dim - - print(f"Fetching all parameters from the checkpoint at '{input_base_path}'") - - if os.path.isdir(input_base_path): - print("Model seems sharded") - - model_state_dict = {} - files = [file for file in os.listdir(input_base_path) if file.endswith(".bin")] - - for file in files: - print(file) - loaded_state_dict = torch.load(os.path.join(input_base_path, file), map_location="cpu") - model_state_dict.update(loaded_state_dict) - else: - print("Model does not seem to be sharded") - model_state_dict = torch.load(input_base_path, map_location="cpu")["model_state_dict"] - model_state_dict.pop("freqs_cis") - - state_dict = {} - for k, v in model_state_dict.items(): - if "qkv_proj" in k: - if num_kv_heads == 1: - v = v.reshape(num_attn_heads + num_kv_heads * 2, head_dim, hidden_size) - q_proj = v[:num_attn_heads, ...] 
- k_proj = v[num_attn_heads : num_attn_heads + num_kv_heads, ...].repeat(num_kv_heads, 1, 1) - v_proj = v[-num_kv_heads:, ...].repeat(num_kv_heads, 1, 1) - - state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( - num_attn_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "v_proj")] = v_proj[0].clone() - else: - q_proj, k_proj, v_proj = torch.split( - v, [num_attn_heads * head_dim, num_kv_heads * head_dim, num_kv_heads * head_dim], 0 - ) - state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( - num_attn_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "v_proj")] = v_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - - elif k == "embedder.weight": - state_dict[LAYER_NAME_MAPPING[k]] = v - state_dict["lm_head.weight"] = v - else: - state_dict[k] = v - - torch.set_default_dtype(dtype) - - print("Loading the checkpoint in a Gemma2 model.") - with init_empty_weights(): - model = Gemma2ForCausalLM(config) - model.load_state_dict(state_dict, assign=True, strict=False) - - model.config.torch_dtype = torch.float32 - del model.config._name_or_path - print("Saving in the Transformers format.") - - if push_to_hub: - print(f"pushing the model to {save_path}") - model.push_to_hub(save_path, safe_serialization=safe_serialization, private=True) - else: - model.save_pretrained(save_path, safe_serialization=safe_serialization) - - -def write_tokenizer(input_tokenizer_path, save_path, push_to_hub=False): - # Initialize the tokenizer based on the `spm` model - tokenizer_class = GemmaTokenizer if GemmaTokenizerFast is None else GemmaTokenizerFast - print(f"Saving a {tokenizer_class.__name__} to {save_path}.") - tokenizer = tokenizer_class(input_tokenizer_path) - if push_to_hub: - tokenizer.push_to_hub(save_path) - else: - tokenizer.save_pretrained(save_path) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_checkpoint", - help="Absolute path to the target Gemma2 weights.", - required=True, - ) - parser.add_argument( - "--tokenizer_checkpoint", - help="Location of Gemma2 tokenizer model", - ) - parser.add_argument( - "--model_size", - default="9B", - choices=["9B", "27B", "tokenizer_only"], - help="'f' models correspond to the finetuned versions, and are specific to the Gemma22 official release. 
For more details on Gemma2, checkout the original repo: https://huggingface.co/google/gemma-7b", - ) - parser.add_argument( - "--output_dir", - default="google/gemma-9b", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--pickle_serialization", - help="Whether or not to save using `safetensors`.", - action="store_true", - default=False, - ) - parser.add_argument( - "--convert_tokenizer", - help="Whether or not to convert the tokenizer as well.", - action="store_true", - default=False, - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally.", - action="store_true", - default=False, - ) - parser.add_argument( - "--dtype", - default="float32", - help="Target dtype of the converted model", - ) - args = parser.parse_args() - - if args.convert_tokenizer: - if args.tokenizer_checkpoint is None: - raise ValueError("Path to the tokenizer is required when passing --convert_tokenizer") - - spm_path = os.path.join(args.tokenizer_checkpoint) - write_tokenizer(spm_path, args.output_dir, args.push_to_hub) - if not args.model_size == "tokenizer_only": - config = CONFIG_MAPPING[args.model_size] - dtype = getattr(torch, args.dtype) - write_model( - config=config, - input_base_path=args.input_checkpoint, - save_path=args.output_dir, - safe_serialization=not args.pickle_serialization, - push_to_hub=args.push_to_hub, - dtype=dtype, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/git/convert_git_to_pytorch.py b/src/transformers/models/git/convert_git_to_pytorch.py deleted file mode 100644 index 2f93a6b03a65..000000000000 --- a/src/transformers/models/git/convert_git_to_pytorch.py +++ /dev/null @@ -1,448 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert GIT checkpoints from the original repository. 
- -URL: https://github.com/microsoft/GenerativeImage2Text/tree/main""" - -import argparse -from pathlib import Path - -import av -import numpy as np -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor - -from transformers import ( - AutoTokenizer, - CLIPImageProcessor, - GitConfig, - GitForCausalLM, - GitProcessor, - GitVisionConfig, - VideoMAEImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_git_config(model_name): - if "base" in model_name and "vqa" in model_name: - image_size = 480 - elif "large" in model_name and "vqa" in model_name: - image_size = 420 - else: - image_size = 224 - - vision_config = GitVisionConfig(image_size=image_size) - - if "large" in model_name: - vision_config.patch_size = 14 - vision_config.hidden_size = 1024 - vision_config.intermediate_size = 4096 - vision_config.num_hidden_layers = 24 - vision_config.num_attention_heads = 16 - - is_video = "vatex" in model_name or "msrvtt" in model_name - num_image_with_embedding = 6 if is_video else None - config = GitConfig(vision_config=vision_config.to_dict(), num_image_with_embedding=num_image_with_embedding) - - return config, image_size, is_video - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, prefix=""): - rename_keys = [] - - # image encoder - # ftm: off - rename_keys.append( - (f"{prefix}image_encoder.class_embedding", "git.image_encoder.vision_model.embeddings.class_embedding") - ) - rename_keys.append( - ( - f"{prefix}image_encoder.positional_embedding", - "git.image_encoder.vision_model.embeddings.position_embedding.weight", - ) - ) - rename_keys.append( - (f"{prefix}image_encoder.conv1.weight", "git.image_encoder.vision_model.embeddings.patch_embedding.weight") - ) - rename_keys.append((f"{prefix}image_encoder.ln_pre.weight", "git.image_encoder.vision_model.pre_layrnorm.weight")) - rename_keys.append((f"{prefix}image_encoder.ln_pre.bias", "git.image_encoder.vision_model.pre_layrnorm.bias")) - rename_keys.append( - (f"{prefix}image_encoder.ln_post.weight", "git.image_encoder.vision_model.post_layernorm.weight") - ) - rename_keys.append((f"{prefix}image_encoder.ln_post.bias", "git.image_encoder.vision_model.post_layernorm.bias")) - # fmt: on - rename_keys.append((f"{prefix}image_encoder.proj", "git.image_encoder.visual_projection.weight")) - - # fmt: off - for i in range(config.vision_config.num_hidden_layers): - # image encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.attn.out_proj.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.out_proj.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.attn.out_proj.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.out_proj.bias")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_1.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm1.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_1.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm1.bias")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_fc.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc1.weight")) - 
rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_fc.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc1.bias")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_proj.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc2.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_proj.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc2.bias")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_2.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm2.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_2.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm2.bias")) - # fmt: on - - # text decoder - # fmt: off - rename_keys.append((f"{prefix}textual.embedding.words.weight", "git.embeddings.word_embeddings.weight")) - rename_keys.append((f"{prefix}textual.embedding.positions.weight", "git.embeddings.position_embeddings.weight")) - rename_keys.append((f"{prefix}textual.visual_projection.0.weight", "git.visual_projection.visual_projection.0.weight")) - rename_keys.append((f"{prefix}textual.visual_projection.0.bias", "git.visual_projection.visual_projection.0.bias")) - rename_keys.append((f"{prefix}textual.visual_projection.1.weight", "git.visual_projection.visual_projection.1.weight")) - rename_keys.append((f"{prefix}textual.visual_projection.1.bias", "git.visual_projection.visual_projection.1.bias")) - - rename_keys.append((f"{prefix}textual.embedding.layer_norm.weight", "git.embeddings.LayerNorm.weight")) - rename_keys.append((f"{prefix}textual.embedding.layer_norm.bias", "git.embeddings.LayerNorm.bias")) - rename_keys.append((f"{prefix}textual.output.weight", "output.weight")) - rename_keys.append((f"{prefix}textual.output.bias", "output.bias")) - for i in range(config.num_hidden_layers): - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.query.weight", f"git.encoder.layer.{i}.attention.self.query.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.query.bias", f"git.encoder.layer.{i}.attention.self.query.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.key.weight", f"git.encoder.layer.{i}.attention.self.key.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.key.bias", f"git.encoder.layer.{i}.attention.self.key.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.value.weight", f"git.encoder.layer.{i}.attention.self.value.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.value.bias", f"git.encoder.layer.{i}.attention.self.value.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.dense.weight", f"git.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.dense.bias", f"git.encoder.layer.{i}.attention.output.dense.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.LayerNorm.weight", f"git.encoder.layer.{i}.attention.output.LayerNorm.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.LayerNorm.bias", f"git.encoder.layer.{i}.attention.output.LayerNorm.bias")) - 
rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.intermediate.dense.weight", f"git.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.intermediate.dense.bias", f"git.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.dense.weight", f"git.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.dense.bias", f"git.encoder.layer.{i}.output.dense.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.LayerNorm.weight", f"git.encoder.layer.{i}.output.LayerNorm.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.LayerNorm.bias", f"git.encoder.layer.{i}.output.LayerNorm.bias")) - # fmt: on - - if config.num_image_with_embedding is not None: - rename_keys.append(("img_temperal_embedding.0", "git.img_temperal_embedding.0")) - rename_keys.append(("img_temperal_embedding.1", "git.img_temperal_embedding.1")) - rename_keys.append(("img_temperal_embedding.2", "git.img_temperal_embedding.2")) - rename_keys.append(("img_temperal_embedding.3", "git.img_temperal_embedding.3")) - rename_keys.append(("img_temperal_embedding.4", "git.img_temperal_embedding.4")) - rename_keys.append(("img_temperal_embedding.5", "git.img_temperal_embedding.5")) - - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val.T if "image_encoder.visual_projection" in new else val - - -# we split up the matrix of each CLIP encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, prefix=""): - dim = config.vision_config.hidden_size - for i in range(config.vision_config.num_hidden_layers): - # read in weights + bias of input projection layer (in the original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}image_encoder.transformer.resblocks.{i}.attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}image_encoder.transformer.resblocks.{i}.attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[ - :dim, : - ] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:dim] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[ - dim : dim * 2, : - ] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[ - dim : dim * 2 - ] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[ - -dim:, : - ] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-dim:] - - -# We will verify our results on an image -def prepare_img(model_name): - if "textvqa" in model_name: - filepath = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset") - image = Image.open(filepath).convert("RGB") - else: - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - return image - - -def prepare_video(): - def read_video_pyav(container, indices): - """ - Decode the video with PyAV decoder. - - Args: - container (`av.container.input.InputContainer`): PyAV container. 
- indices (`List[int]`): List of frame indices to decode. - - Returns: - result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3). - """ - frames = [] - container.seek(0) - start_index = indices[0] - end_index = indices[-1] - for i, frame in enumerate(container.decode(video=0)): - if i > end_index: - break - if i >= start_index and i in indices: - frames.append(frame) - return np.stack([x.to_ndarray(format="rgb24") for x in frames]) - - def sample_frame_indices(clip_len, frame_sample_rate, seg_len): - """ - Sample a given number of frame indices from the video. - - Args: - clip_len (`int`): Total number of frames to sample. - frame_sample_rate (`int`): Sample every n-th frame. - seg_len (`int`): Maximum allowed index of sample's last frame. - - Returns: - indices (`List[int]`): List of sampled frame indices - """ - converted_len = int(clip_len * frame_sample_rate) - end_idx = np.random.randint(converted_len, seg_len) - start_idx = end_idx - converted_len - indices = np.linspace(start_idx, end_idx, num=clip_len) - indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) - return indices - - # set seed for reproducibility - np.random.seed(0) - - file_path = hf_hub_download(repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset") - with av.open(file_path) as container: - # sample 6 frames - num_frames = 6 - indices = sample_frame_indices( - clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames - ) - frames = read_video_pyav(container, indices) - - return frames - - -@torch.no_grad() -def convert_git_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our GIT structure. - """ - - model_name_to_url = { - "git-base": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE/snapshot/model.pt", - "git-base-coco": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_COCO/snapshot/model.pt", - "git-base-textcaps": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_TEXTCAPS/snapshot/model.pt", - "git-base-vqav2": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_VQAv2/snapshot/model.pt", - "git-base-textvqa": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_TEXTVQA/snapshot/model.pt", # todo - "git-base-vatex": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_VATEX/snapshot/model.pt", - "git-base-msrvtt-qa": ( - "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_MSRVTT_QA/snapshot/model.pt" - ), - "git-large": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE/snapshot/model.pt", - "git-large-coco": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_COCO/snapshot/model.pt", - "git-large-textcaps": ( - "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_TEXTCAPS/snapshot/model.pt" - ), - "git-large-vqav2": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_VQAv2/snapshot/model.pt", - "git-large-textvqa": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_TEXTVQA/snapshot/model.pt", - "git-large-vatex": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_VATEX/snapshot/model.pt", - "git-large-msrvtt-qa": ( - "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_MSRVTT_QA/snapshot/model.pt" - ), - "git-large-r": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_R/snapshot/model.pt", - "git-large-r-coco": 
"https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_R_COCO/snapshot/model.pt", - "git-large-r-textcaps": ( - "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_R_TEXTCAPS/snapshot/model.pt" - ), - } - - model_name_to_path = { - "git-large": "/Users/nielsrogge/Documents/GIT/git_large_model.pt", - "git-large-coco": "/Users/nielsrogge/Documents/GIT/git_large_coco_model.pt", - "git-large-textcaps": "/Users/nielsrogge/Documents/GIT/git_large_textcaps_model.pt", - "git-large-vqav2": "/Users/nielsrogge/Documents/GIT/git_large_vqav2_model.pt", - "git-large-textvqa": "/Users/nielsrogge/Documents/GIT/git_large_textvqa_model.pt", - } - - # define GIT configuration based on model name - config, image_size, is_video = get_git_config(model_name) - if "large" in model_name and not is_video and "large-r" not in model_name: - # large checkpoints take way too long to download - checkpoint_path = model_name_to_path[model_name] - state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] - else: - checkpoint_url = model_name_to_url[model_name] - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", file_name=model_name)[ - "model" - ] - # rename keys - prefix = "module." if model_name == "git-base" else "" - rename_keys = create_rename_keys(config, prefix=prefix) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, prefix=prefix) - - # load HuggingFace model - model = GitForCausalLM(config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - model.eval() - - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - - assert missing_keys == ["git.embeddings.position_ids", "git.image_encoder.vision_model.embeddings.position_ids"] - assert unexpected_keys == ["git.image_encoder.visual_projection.weight"] - - # verify results - image_processor = ( - VideoMAEImageProcessor( - size={"shortest_edge": image_size}, crop_size={"height": image_size, "width": image_size} - ) - if is_video - else CLIPImageProcessor( - size={"shortest_edge": image_size}, crop_size={"height": image_size, "width": image_size} - ) - ) - tokenizer = AutoTokenizer.from_pretrained( - "google-bert/bert-base-uncased", model_input_names=["input_ids", "attention_mask"] - ) - processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) - - if is_video: - video = prepare_video() - pixel_values = processor(images=list(video), return_tensors="pt").pixel_values - else: - image = prepare_img(model_name) - image_transforms = Compose( - [ - Resize(image_size, interpolation=Image.BICUBIC), - CenterCrop(image_size), - ToTensor(), - Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), - ] - ) - original_pixel_values = image_transforms(image).unsqueeze(0) - pixel_values = processor(images=image, return_tensors="pt").pixel_values - - assert torch.allclose(pixel_values, original_pixel_values) - - input_ids = torch.tensor([[101]]) - outputs = model(input_ids, pixel_values=pixel_values) - logits = outputs.logits - print("Logits:", logits[0, -1, :3]) - - if model_name == "git-base": - expected_slice_logits = torch.tensor([-1.2832, -1.2835, -1.2840]) - elif model_name == "git-base-coco": - expected_slice_logits = torch.tensor([-0.9925, -0.9930, -0.9935]) - elif model_name == "git-base-textcaps": - expected_slice_logits = torch.tensor([-1.2980, -1.2983, -1.2985]) - elif model_name == "git-base-vqav2": - expected_slice_logits = 
torch.tensor([-0.8570, -0.8568, -0.8561]) - elif model_name == "git-base-textvqa": - expected_slice_logits = torch.tensor([-1.4085, -1.4083, -1.4082]) - elif model_name == "git-base-vatex": - expected_slice_logits = torch.tensor([-1.3451, -1.3447, -1.3447]) - elif model_name == "git-base-msrvtt-qa": - expected_slice_logits = torch.tensor([-0.8554, -0.8550, -0.8540]) - elif model_name == "git-large": - expected_slice_logits = torch.tensor([-1.1708, -1.1707, -1.1705]) - elif model_name == "git-large-coco": - expected_slice_logits = torch.tensor([-1.0425, -1.0423, -1.0422]) - elif model_name == "git-large-textcaps": - expected_slice_logits = torch.tensor([-1.2705, -1.2708, -1.2706]) - elif model_name == "git-large-vqav2": - expected_slice_logits = torch.tensor([-0.7042, -0.7043, -0.7043]) - elif model_name == "git-large-textvqa": - expected_slice_logits = torch.tensor([-0.8590, -0.8592, -0.8590]) - elif model_name == "git-large-vatex": - expected_slice_logits = torch.tensor([-1.0113, -1.0114, -1.0113]) - elif model_name == "git-large-msrvtt-qa": - expected_slice_logits = torch.tensor([0.0130, 0.0134, 0.0131]) - elif model_name == "git-large-r": - expected_slice_logits = torch.tensor([-1.1283, -1.1285, -1.1286]) - elif model_name == "git-large-r-coco": - expected_slice_logits = torch.tensor([-0.9641, -0.9641, -0.9641]) - elif model_name == "git-large-r-textcaps": - expected_slice_logits = torch.tensor([-1.1121, -1.1120, -1.1124]) - - assert torch.allclose(logits[0, -1, :3], expected_slice_logits, atol=1e-4) - print("Looks ok!") - - prompt = "" - if "textvqa" in model_name: - prompt = "what does the front of the bus say at the top?" - elif "msrvtt-qa" in model_name: - prompt = "what does the woman eat?" - elif "vqa" in model_name: - prompt = "what are the cats doing?" 
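For context, the generation check below mirrors how a converted checkpoint is exercised once it is available on the Hub. A minimal captioning sketch, assuming the converted weights have already been pushed under microsoft/git-base and using any RGB test image:

import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

processor = AutoProcessor.from_pretrained("microsoft/git-base")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")

# any RGB image works; this is the standard COCO test image
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
pixel_values = processor(images=image, return_tensors="pt").pixel_values

with torch.no_grad():
    generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])

For VQA-style checkpoints the question is tokenized and prepended (as done right below), so the model completes the answer after the prompt.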
- input_ids = tokenizer(prompt, add_special_tokens=False).input_ids - input_ids = [processor.tokenizer.cls_token_id] + input_ids - input_ids = torch.tensor(input_ids).unsqueeze(0) - print("Generating caption...") - generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50) - print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True)) - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor of {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and processor of {model_name} to the hub...") - model.push_to_hub(f"microsoft/{model_name}") - processor.push_to_hub(f"microsoft/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="git-base", - type=str, - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub.", - ) - - args = parser.parse_args() - convert_git_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py deleted file mode 100644 index 1053f984d7f0..000000000000 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ /dev/null @@ -1,195 +0,0 @@ -import argparse -import json -import os -import re - -import torch -from safetensors.torch import load_file -from tokenizers import processors - -from transformers import GlmConfig, GlmForCausalLM, PreTrainedTokenizerFast - - -# fmt: off -# `None` means we drop the key -STATE_DICT_MAPPING = { - # CausalLM keys - r"transformer.output_layer.weight": r"lm_head.weight", - - # Model keys - r"transformer.embedding.word_embeddings.weight": r"model.embed_tokens.weight", - r"transformer.rotary_pos_emb.inv_freq": None, - r"transformer.encoder.final_layernorm.weight": r"model.norm.weight", - - # Layers keys - r"transformer.encoder.layers.(\d+).input_layernorm.weight": r"model.layers.\1.input_layernorm.weight", - r"transformer.encoder.layers.(\d+).post_attention_layernorm.weight": r"model.layers.\1.post_attention_layernorm.weight", - - # Attention keys - r"transformer.encoder.layers.(\d+).self_attention.dense.weight": r"model.layers.\1.self_attn.o_proj.weight", - # qkv_proj will later be split in q|k|v|_proj - r"transformer.encoder.layers.(\d+).self_attention.query_key_value.(weight|bias)": r"model.layers.\1.self_attn.qkv_proj.\2", - - # MLP keys - r"transformer.encoder.layers.(\d+).mlp.dense_h_to_4h.weight": r"model.layers.\1.mlp.gate_up_proj.weight", - r"transformer.encoder.layers.(\d+).mlp.dense_4h_to_h.weight": r"model.layers.\1.mlp.down_proj.weight", -} -# fmt: on - - -def load_weights(input_dir: str): - safetensor_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")] - bin_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".bin")] - - all_weights = {} - - if safetensor_files: - safetensor_files = sorted(safetensor_files, key=lambda x: int(x.rsplit("-", 3)[1])) - for file in 
safetensor_files: - tensors = load_file(file) - all_weights.update(tensors) - return all_weights - - elif bin_files: - bin_files = sorted(bin_files, key=lambda x: int(x.rsplit("-", 3)[1])) - for file in bin_files: - tensors = torch.load(file, map_location="cpu") - all_weights.update(tensors) - return all_weights - - else: - raise ValueError("No .safetensors or .bin files found in the specified directory.") - - -def map_old_key_to_new(old_key): - for pattern, replacement in STATE_DICT_MAPPING.items(): - if replacement is None: - if re.fullmatch(pattern, old_key): - return None - else: - new_key, n_replace = re.subn(pattern, replacement, old_key) - # Early exit of the loop - if n_replace > 0: - return new_key - - raise ValueError(f"Key: {old_key} could not be mapped (check the mapping).") - - -def convert_state_dict(original_state_dict: dict, config: GlmConfig): - new_dict = {} - - head_dim = config.hidden_size // config.num_attention_heads - query_size = config.num_attention_heads * head_dim - kv_size = config.num_key_value_heads * head_dim - - for old_key, value in original_state_dict.items(): - new_key = map_old_key_to_new(old_key) - if new_key is None: - continue - - if "qkv_proj." in new_key: - q_proj, k_proj, v_proj = ( - value[:query_size, ...], - value[query_size : query_size + kv_size, ...], - value[query_size + kv_size :, ...], - ) - new_dict[new_key.replace("qkv_proj.", "q_proj.")] = q_proj - new_dict[new_key.replace("qkv_proj.", "k_proj.")] = k_proj - new_dict[new_key.replace("qkv_proj.", "v_proj.")] = v_proj - else: - new_dict[new_key] = value - return new_dict - - -def convert_config(original_config: dict): - key_mapping = { - "vocab_size": "padded_vocab_size", - "intermediate_size": "ffn_hidden_size", - "num_hidden_layers": "num_layers", - "max_position_embeddings": "seq_length", - "rms_norm_eps": "layernorm_epsilon", - "head_dim": "kv_channels", - "attention_bias": "add_qkv_bias", - } - similar_keys_to_keep = [ - "num_attention_heads", - "hidden_size", - "attention_dropout", - "use_cache", - "eos_token_id", - "pad_token_id", - "tie_word_embeddings", - ] - new_config_kwargs = {k: original_config[v] for k, v in key_mapping.items()} - new_config_kwargs.update({k: v for k, v in original_config.items() if k in similar_keys_to_keep}) - new_config_kwargs["num_key_value_heads"] = ( - new_config_kwargs["num_attention_heads"] - if not original_config["multi_query_attention"] - else original_config["multi_query_group_num"] - ) - new_config_kwargs["rope_theta"] = 10000.0 * getattr(original_config, "rope_ratio", 1) - - new_config = GlmConfig(**new_config_kwargs) - return new_config - - -def convert_glm_tokenizer(input_dir, use_post_processor=False): - fast_tok = PreTrainedTokenizerFast.from_pretrained(input_dir, model_input_names=["input_ids", "attention_mask"]) - if use_post_processor: - fast_tok._tokenizer.post_processor = processors.Sequence( - [ - processors.ByteLevel(trim_offsets=False), - processors.TemplateProcessing( - single="[gMASK]:0 :0 $A:0", - pair="[gMASK]:0 :0 $A:0 $B:1", - special_tokens=[("[gMASK]", 151331), ("", 151333)], - ), - ], - ) - else: - fast_tok._tokenizer.post_processor = processors.Sequence( - [processors.ByteLevel(trim_offsets=False)], - ) - return fast_tok - - -def convert_glm_model(input_dir, output_dir, use_post_processor=False): - # Load and convert config - with open(os.path.join(input_dir, "config.json")) as f: - original_config = json.load(f) - config = convert_config(original_config) - config.save_pretrained(output_dir) - - # Load and convert weights 
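# load_weights above gathers the sharded *.safetensors (or *.bin) shards into a single dict;
# convert_state_dict then remaps each key through the STATE_DICT_MAPPING regex table and
# splits the fused query_key_value projection into separate q_proj / k_proj / v_proj tensors.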
- original_state_dict = load_weights(input_dir) - new_dict = convert_state_dict(original_state_dict, config) - with torch.device("meta"): - model = GlmForCausalLM(config) - model.load_state_dict(new_dict, strict=True, assign=True) - model.save_pretrained(output_dir) - - # Load and convert tokenizer - tokenizer = convert_glm_tokenizer(input_dir, use_post_processor) - tokenizer.save_pretrained(output_dir) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "input_dir", - type=str, - help="Location of the local folder copied from the Hub.", - ) - parser.add_argument( - "output_dir", - type=str, - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--use_post_processor", - action="store_true", - help="Whether to apply post processor with special tokens", - ) - - args = parser.parse_args() - convert_glm_model(args.input_dir, args.output_dir, args.use_post_processor) diff --git a/src/transformers/models/glpn/convert_glpn_to_pytorch.py b/src/transformers/models/glpn/convert_glpn_to_pytorch.py deleted file mode 100644 index e19ee9381980..000000000000 --- a/src/transformers/models/glpn/convert_glpn_to_pytorch.py +++ /dev/null @@ -1,218 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
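Before the GLPN converter that follows, a toy illustration of the only non-trivial tensor surgery in the GLM script above: convert_state_dict slices the fused query_key_value projection into separate q/k/v tensors. All dimensions here are hypothetical and only chosen to keep the shapes readable:

import torch

# hypothetical GLM-style sizes: 4 query heads, 2 key/value heads, head_dim 8
num_attention_heads, num_key_value_heads, head_dim, hidden = 4, 2, 8, 32
query_size = num_attention_heads * head_dim   # 32
kv_size = num_key_value_heads * head_dim      # 16

# fused [q; k; v] projection weight, as stored in the original checkpoint
fused = torch.randn(query_size + 2 * kv_size, hidden)
q_proj = fused[:query_size, :]
k_proj = fused[query_size : query_size + kv_size, :]
v_proj = fused[query_size + kv_size :, :]
assert q_proj.shape == (32, 32) and k_proj.shape == (16, 32) and v_proj.shape == (16, 32)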
-"""Convert GLPN checkpoints.""" - -import argparse -from collections import OrderedDict -from pathlib import Path - -import requests -import torch -from PIL import Image - -from transformers import GLPNConfig, GLPNForDepthEstimation, GLPNImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def rename_keys(state_dict): - new_state_dict = OrderedDict() - for key, value in state_dict.items(): - if key.startswith("module.encoder"): - key = key.replace("module.encoder", "glpn.encoder") - if key.startswith("module.decoder"): - key = key.replace("module.decoder", "decoder.stages") - if "patch_embed" in key: - # replace for example patch_embed1 by patch_embeddings.0 - idx = key[key.find("patch_embed") + len("patch_embed")] - key = key.replace(f"patch_embed{idx}", f"patch_embeddings.{int(idx)-1}") - if "norm" in key: - key = key.replace("norm", "layer_norm") - if "glpn.encoder.layer_norm" in key: - # replace for example layer_norm1 by layer_norm.0 - idx = key[key.find("glpn.encoder.layer_norm") + len("glpn.encoder.layer_norm")] - key = key.replace(f"layer_norm{idx}", f"layer_norm.{int(idx)-1}") - if "layer_norm1" in key: - key = key.replace("layer_norm1", "layer_norm_1") - if "layer_norm2" in key: - key = key.replace("layer_norm2", "layer_norm_2") - if "block" in key: - # replace for example block1 by block.0 - idx = key[key.find("block") + len("block")] - key = key.replace(f"block{idx}", f"block.{int(idx)-1}") - if "attn.q" in key: - key = key.replace("attn.q", "attention.self.query") - if "attn.proj" in key: - key = key.replace("attn.proj", "attention.output.dense") - if "attn" in key: - key = key.replace("attn", "attention.self") - if "fc1" in key: - key = key.replace("fc1", "dense1") - if "fc2" in key: - key = key.replace("fc2", "dense2") - if "linear_pred" in key: - key = key.replace("linear_pred", "classifier") - if "linear_fuse" in key: - key = key.replace("linear_fuse.conv", "linear_fuse") - key = key.replace("linear_fuse.bn", "batch_norm") - if "linear_c" in key: - # replace for example linear_c4 by linear_c.3 - idx = key[key.find("linear_c") + len("linear_c")] - key = key.replace(f"linear_c{idx}", f"linear_c.{int(idx)-1}") - if "bot_conv" in key: - key = key.replace("bot_conv", "0.convolution") - if "skip_conv1" in key: - key = key.replace("skip_conv1", "1.convolution") - if "skip_conv2" in key: - key = key.replace("skip_conv2", "2.convolution") - if "fusion1" in key: - key = key.replace("fusion1", "1.fusion") - if "fusion2" in key: - key = key.replace("fusion2", "2.fusion") - if "fusion3" in key: - key = key.replace("fusion3", "3.fusion") - if "fusion" in key and "conv" in key: - key = key.replace("conv", "convolutional_layer") - if key.startswith("module.last_layer_depth"): - key = key.replace("module.last_layer_depth", "head.head") - new_state_dict[key] = value - - return new_state_dict - - -def read_in_k_v(state_dict, config): - # for each of the encoder blocks: - for i in range(config.num_encoder_blocks): - for j in range(config.depths[i]): - # read in weights + bias of keys and values (which is a single matrix in the original implementation) - kv_weight = state_dict.pop(f"glpn.encoder.block.{i}.{j}.attention.self.kv.weight") - kv_bias = state_dict.pop(f"glpn.encoder.block.{i}.{j}.attention.self.kv.bias") - # next, add keys and values (in that order) to the state dict - state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.key.weight"] = kv_weight[ - : config.hidden_sizes[i], : - ] - 
state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.key.bias"] = kv_bias[: config.hidden_sizes[i]] - state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.value.weight"] = kv_weight[ - config.hidden_sizes[i] :, : - ] - state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.value.bias"] = kv_bias[config.hidden_sizes[i] :] - - -# We will verify our results on a COCO image -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - return image - - -@torch.no_grad() -def convert_glpn_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_hub=False, model_name=None): - """ - Copy/paste/tweak model's weights to our GLPN structure. - """ - - # load GLPN configuration (Segformer-B4 size) - config = GLPNConfig(hidden_sizes=[64, 128, 320, 512], decoder_hidden_size=64, depths=[3, 8, 27, 3]) - - # load image processor (only resize + rescale) - image_processor = GLPNImageProcessor() - - # prepare image - image = prepare_img() - pixel_values = image_processor(images=image, return_tensors="pt").pixel_values - - logger.info("Converting model...") - - # load original state dict - state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu")) - - # rename keys - state_dict = rename_keys(state_dict) - - # key and value matrices need special treatment - read_in_k_v(state_dict, config) - - # create HuggingFace model and load state dict - model = GLPNForDepthEstimation(config) - model.load_state_dict(state_dict) - model.eval() - - # forward pass - outputs = model(pixel_values) - predicted_depth = outputs.predicted_depth - - # verify output - if model_name is not None: - if "nyu" in model_name: - expected_slice = torch.tensor( - [[4.4147, 4.0873, 4.0673], [3.7890, 3.2881, 3.1525], [3.7674, 3.5423, 3.4913]] - ) - elif "kitti" in model_name: - expected_slice = torch.tensor( - [[3.4291, 2.7865, 2.5151], [3.2841, 2.7021, 2.3502], [3.1147, 2.4625, 2.2481]] - ) - else: - raise ValueError(f"Unknown model name: {model_name}") - - expected_shape = torch.Size([1, 480, 640]) - - assert predicted_depth.shape == expected_shape - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - # finally, push to hub if required - if push_to_hub: - logger.info("Pushing model and image processor to the hub...") - model.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add model", - use_temp_dir=True, - ) - image_processor.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add image processor", - use_temp_dir=True, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", - default=None, - type=str, - help="Path to the original PyTorch checkpoint (.pth file).", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether to upload the model to the HuggingFace hub." 
- ) - parser.add_argument( - "--model_name", - default="glpn-kitti", - type=str, - help="Name of the model in case you're pushing to the hub.", - ) - args = parser.parse_args() - convert_glpn_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name) diff --git a/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index 33f9dabed07f..000000000000 --- a/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert OpenAI GPT checkpoint.""" - -import argparse - -import torch - -from transformers import GPT2Config, GPT2Model, load_tf_weights_in_gpt2 -from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging - - -logging.set_verbosity_info() - - -def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): - # Construct model - if gpt2_config_file == "": - config = GPT2Config() - else: - config = GPT2Config.from_json_file(gpt2_config_file) - model = GPT2Model(config) - - # Load weights from numpy - load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) - - # Save pytorch-model - pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME - print(f"Save PyTorch model to {pytorch_weights_dump_path}") - torch.save(model.state_dict(), pytorch_weights_dump_path) - print(f"Save configuration file to {pytorch_config_dump_path}") - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--gpt2_config_file", - default="", - type=str, - help=( - "An optional config json file corresponding to the pre-trained OpenAI model. \n" - "This specifies the model architecture." - ), - ) - args = parser.parse_args() - convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py b/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py deleted file mode 100644 index 3db22857293c..000000000000 --- a/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Eleuther AI and HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert GPT Neo checkpoint.""" - -import argparse -import json - -from transformers import GPTNeoConfig, GPTNeoForCausalLM, load_tf_weights_in_gpt_neo -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): - # Initialise PyTorch model - config_json = json.load(open(config_file, "r")) - config = GPTNeoConfig( - hidden_size=config_json["n_embd"], - num_layers=config_json["n_layer"], - num_heads=config_json["n_head"], - attention_types=config_json["attention_types"], - max_position_embeddings=config_json["n_positions"], - resid_dropout=config_json["res_dropout"], - embed_dropout=config_json["embed_dropout"], - attention_dropout=config_json["attn_dropout"], - ) - print(f"Building PyTorch model from configuration: {config}") - model = GPTNeoForCausalLM(config) - - # Load weights from tf checkpoint - load_tf_weights_in_gpt_neo(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - model.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained mesh-tf model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py b/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py deleted file mode 100644 index 2625701c1a75..000000000000 --- a/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright 2022 The HuggingFace Inc. team and the AI-Sweden team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
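After running the GPT Neo converter above, a quick smoke test is to reload the dump and generate a few tokens. A sketch, where the dump path is hypothetical and a public GPT Neo repo is used only as a stand-in source for the GPT-2 style tokenizer:

import torch
from transformers import AutoTokenizer, GPTNeoForCausalLM

model = GPTNeoForCausalLM.from_pretrained("path/to/pytorch_dump")        # hypothetical output path
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")     # stand-in tokenizer source

inputs = tokenizer("Hello, my name is", return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(out[0], skip_special_tokens=True))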
-"""Convert GPT-SW3 megatron checkpoints to pytorch""" - -import argparse -import os -from os.path import isfile - -import torch - -from transformers import GPT2Config - - -def recursive_print(name, val, spaces=0): - # Format the message. - if name is None: - msg = None - else: - fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}" - msg = fmt.format(name) - - # Print and recurse (if needed). - if isinstance(val, dict): - if msg is not None: - print(msg) - for k in val.keys(): - recursive_print(k, val[k], spaces + 2) - elif isinstance(val, torch.Tensor): - print(msg, ":", val.size()) - else: - print(msg, ":", val) - - -def fix_query_key_value_ordering(param, num_splits, num_heads, hidden_size): - # Permutes layout of param tensor to [num_splits * num_heads * hidden_size, :] - # for compatibility with later versions of NVIDIA Megatron-LM. - # The inverse operation is performed inside Megatron-LM to read checkpoints: - # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/checkpointing.py#L209 - # If param is the weight tensor of the self-attention block, the returned tensor - # will have to be transposed one more time to be read by HuggingFace GPT2. - input_shape = param.size() - # other versions store [num_heads * num_splits * hidden_size, :] - saved_shape = (num_heads, num_splits, hidden_size) + input_shape[1:] - param = param.view(*saved_shape) - param = param.transpose(0, 1).contiguous() - param = param.view(*input_shape) - return param - - -def convert_megatron_checkpoint(sd_megatron, config): - """ - Converts a Megatron checkpoint to a HuggingFace GPT-SW3 checkpoint. - """ - n_positions = config.n_positions - layers = config.n_layer - vocab_size = config.vocab_size - heads = config.n_head - hidden_size_per_head = config.n_embd // config.n_head - - word_embeddings = sd_megatron["model.language_model.embedding.word_embeddings.weight"][:vocab_size, :] - sd_hf = { - "transformer.wte.weight": word_embeddings, - "transformer.wpe.weight": sd_megatron["model.language_model.embedding.position_embeddings.weight"], - "transformer.ln_f.weight": sd_megatron["model.language_model.encoder.final_layernorm.weight"], - "transformer.ln_f.bias": sd_megatron["model.language_model.encoder.final_layernorm.bias"], - } - - pf = "model.language_model.encoder.layers." 
- for i in range(layers): - causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.bool)) - causal_mask = causal_mask.view(1, 1, n_positions, n_positions) - sd_hf[f"transformer.h.{i}.attn.bias"] = causal_mask - sd_hf[f"transformer.h.{i}.attn.masked_bias"] = torch.tensor(-1e4, dtype=torch.bfloat16) - - sd_hf[f"transformer.h.{i}.ln_1.weight"] = sd_megatron[f"{pf}{i}.input_layernorm.weight"] - sd_hf[f"transformer.h.{i}.ln_1.bias"] = sd_megatron[f"{pf}{i}.input_layernorm.bias"] - - val1 = sd_megatron[f"{pf}{i}.self_attention.query_key_value.weight"] - val1 = fix_query_key_value_ordering(val1, 3, heads, hidden_size_per_head) - sd_hf[f"transformer.h.{i}.attn.c_attn.weight"] = val1.transpose(0, 1).contiguous() - - val2 = sd_megatron[f"{pf}{i}.self_attention.query_key_value.bias"] - val2 = fix_query_key_value_ordering(val2, 3, heads, hidden_size_per_head) - sd_hf[f"transformer.h.{i}.attn.c_attn.bias"] = val2 - - sd_hf[f"transformer.h.{i}.attn.c_proj.weight"] = sd_megatron[f"{pf}{i}.self_attention.dense.weight"].transpose( - 0, 1 - ) - sd_hf[f"transformer.h.{i}.attn.c_proj.bias"] = sd_megatron[f"{pf}{i}.self_attention.dense.bias"] - sd_hf[f"transformer.h.{i}.ln_2.weight"] = sd_megatron[f"{pf}{i}.post_attention_layernorm.weight"] - sd_hf[f"transformer.h.{i}.ln_2.bias"] = sd_megatron[f"{pf}{i}.post_attention_layernorm.bias"] - sd_hf[f"transformer.h.{i}.mlp.c_fc.weight"] = sd_megatron[f"{pf}{i}.mlp.dense_h_to_4h.weight"].transpose(0, 1) - sd_hf[f"transformer.h.{i}.mlp.c_fc.bias"] = sd_megatron[f"{pf}{i}.mlp.dense_h_to_4h.bias"] - sd_hf[f"transformer.h.{i}.mlp.c_proj.weight"] = sd_megatron[f"{pf}{i}.mlp.dense_4h_to_h.weight"].transpose( - 0, 1 - ) - sd_hf[f"transformer.h.{i}.mlp.c_proj.bias"] = sd_megatron[f"{pf}{i}.mlp.dense_4h_to_h.bias"] - - # For LM head, transformers' wants the matrix to weight embeddings. - sd_hf["lm_head.weight"] = word_embeddings - - return sd_hf - - -def copy_config(config_hf, config_megatron): - """Copy the config from Megatron to hf.""" - config_hf.vocab_size = 64000 - config_hf.n_positions = config_megatron["encoder_seq_length"] - config_hf.n_embd = config_megatron["hidden_size"] - config_hf.n_layer = config_megatron["num_layers"] - config_hf.n_head = config_megatron["num_attention_heads"] - config_hf.n_inner = config_megatron["ffn_hidden_size"] - config_hf.activation_function = "gelu" - config_hf.resid_pdrop = 0.1 - config_hf.embd_pdrop = 0.1 - config_hf.attn_pdrop = 0.1 - config_hf.layer_norm_epsilon = config_megatron["layernorm_epsilon"] # 1e-5 - config_hf.initializer_range = config_megatron["init_method_std"] # 0.02 - config_hf.apply_query_key_layer_scaling = config_megatron["apply_query_key_layer_scaling"] # True - config_hf.normalize_attention_scores = True - config_hf.use_cache = True - - # This identifies the 6.7B (7B) model which uses a different tokenizer - if config_megatron["hidden_size"] == 4096: - config_hf.bos_token_id = 1 # <|endoftext|> - config_hf.eos_token_id = 1 # <|endoftext|> - config_hf.pad_token_id = 0 # - else: - config_hf.bos_token_id = 2 # - config_hf.eos_token_id = 3 # <|endoftext|> - config_hf.pad_token_id = 0 # - - return config_hf - - -def main(args): - print(args) - - checkpoint_path = args.checkpoint_path - save_path = args.save_path - if isfile(checkpoint_path): - raise FileNotFoundError(f"ERROR! could not find file {checkpoint_path}") - - # Load the model. - checkpoint = torch.load(checkpoint_path, map_location="cpu") - - # Load the config. 
- config_megatron = checkpoint["hyper_parameters"]["cfg"] - config_hf = GPT2Config() - config_hf = copy_config(config_hf=config_hf, config_megatron=config_megatron) - config_hf.architectures = ["GPT2LMHeadModel"] - - sd_megatron = checkpoint["state_dict"] - - # Convert. - print("Converting") - sd_hf = convert_megatron_checkpoint(sd_megatron, config_hf) - - # Print the structure of converted state dict. - if args.print_checkpoint_structure: - recursive_print(None, sd_hf) - - config_hf.tokenizer_class = "GPTSw3Tokenizer" - - # Store the config to file. - print("Saving config") - config_hf.save_pretrained(save_path) - - # Store the state_dict to file. - output_checkpoint_file = os.path.join(save_path, "pytorch_model.bin") - print(f'Saving checkpoint to "{output_checkpoint_file}"') - torch.save(sd_hf, output_checkpoint_file) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--checkpoint_path", - type=str, - required=True, - help="e.g. megatron_gpt--val_loss=2.42-step=38000-consumed_samples=54720000", - ) - parser.add_argument("--save_path", type=str, required=True, help="e.g. /home/user/gpt-sw3/hf") - parser.add_argument("--print-checkpoint-structure", action="store_true") - _args = parser.parse_args() - main(_args) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py deleted file mode 100644 index ac8e82bfd825..000000000000 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ /dev/null @@ -1,491 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Grounding DINO checkpoints from the original repository. 
- -URL: https://github.com/IDEA-Research/GroundingDINO""" - -import argparse - -import requests -import torch -from PIL import Image -from torchvision import transforms as T - -from transformers import ( - AutoTokenizer, - GroundingDinoConfig, - GroundingDinoForObjectDetection, - GroundingDinoImageProcessor, - GroundingDinoProcessor, - SwinConfig, -) - - -IMAGENET_MEAN = [0.485, 0.456, 0.406] -IMAGENET_STD = [0.229, 0.224, 0.225] - - -def get_grounding_dino_config(model_name): - if "tiny" in model_name: - window_size = 7 - embed_dim = 96 - depths = (2, 2, 6, 2) - num_heads = (3, 6, 12, 24) - image_size = 224 - elif "base" in model_name: - window_size = 12 - embed_dim = 128 - depths = (2, 2, 18, 2) - num_heads = (4, 8, 16, 32) - image_size = 384 - else: - raise ValueError("Model not supported, only supports base and large variants") - - backbone_config = SwinConfig( - window_size=window_size, - image_size=image_size, - embed_dim=embed_dim, - depths=depths, - num_heads=num_heads, - out_indices=[2, 3, 4], - ) - - config = GroundingDinoConfig(backbone_config=backbone_config) - - return config - - -def create_rename_keys(state_dict, config): - rename_keys = [] - # fmt: off - ########################################## VISION BACKBONE - START - # patch embedding layer - rename_keys.append(("backbone.0.patch_embed.proj.weight", - "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("backbone.0.patch_embed.proj.bias", - "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("backbone.0.patch_embed.norm.weight", - "model.backbone.conv_encoder.model.embeddings.norm.weight")) - rename_keys.append(("backbone.0.patch_embed.norm.bias", - "model.backbone.conv_encoder.model.embeddings.norm.bias")) - - for layer, depth in enumerate(config.backbone_config.depths): - for block in range(depth): - # layernorms - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) - - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) - # attention - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) - # intermediate - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", - 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) - - # output - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) - - # downsample - if layer!=len(config.backbone_config.depths)-1: - rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) - - for out_indice in config.backbone_config.out_indices: - # Grounding DINO implementation of out_indices isn't aligned with transformers - rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", - f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) - rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", - f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) - - ########################################## VISION BACKBONE - END - - ########################################## ENCODER - START - deformable_key_mappings = { - 'self_attn.sampling_offsets.weight': 'deformable_layer.self_attn.sampling_offsets.weight', - 'self_attn.sampling_offsets.bias': 'deformable_layer.self_attn.sampling_offsets.bias', - 'self_attn.attention_weights.weight': 'deformable_layer.self_attn.attention_weights.weight', - 'self_attn.attention_weights.bias': 'deformable_layer.self_attn.attention_weights.bias', - 'self_attn.value_proj.weight': 'deformable_layer.self_attn.value_proj.weight', - 'self_attn.value_proj.bias': 'deformable_layer.self_attn.value_proj.bias', - 'self_attn.output_proj.weight': 'deformable_layer.self_attn.output_proj.weight', - 'self_attn.output_proj.bias': 'deformable_layer.self_attn.output_proj.bias', - 'norm1.weight': 'deformable_layer.self_attn_layer_norm.weight', - 'norm1.bias': 'deformable_layer.self_attn_layer_norm.bias', - 'linear1.weight': 'deformable_layer.fc1.weight', - 'linear1.bias': 'deformable_layer.fc1.bias', - 'linear2.weight': 'deformable_layer.fc2.weight', - 'linear2.bias': 'deformable_layer.fc2.bias', - 'norm2.weight': 'deformable_layer.final_layer_norm.weight', - 'norm2.bias': 'deformable_layer.final_layer_norm.bias', - } - text_enhancer_key_mappings = { - 'self_attn.in_proj_weight': 'text_enhancer_layer.self_attn.in_proj_weight', - 'self_attn.in_proj_bias': 'text_enhancer_layer.self_attn.in_proj_bias', - 'self_attn.out_proj.weight': 'text_enhancer_layer.self_attn.out_proj.weight', - 'self_attn.out_proj.bias': 'text_enhancer_layer.self_attn.out_proj.bias', - 'linear1.weight': 'text_enhancer_layer.fc1.weight', - 'linear1.bias': 'text_enhancer_layer.fc1.bias', - 'linear2.weight': 'text_enhancer_layer.fc2.weight', - 'linear2.bias': 'text_enhancer_layer.fc2.bias', - 'norm1.weight': 'text_enhancer_layer.layer_norm_before.weight', - 'norm1.bias': 'text_enhancer_layer.layer_norm_before.bias', - 'norm2.weight': 'text_enhancer_layer.layer_norm_after.weight', - 'norm2.bias': 
'text_enhancer_layer.layer_norm_after.bias', - } - fusion_key_mappings = { - 'gamma_v': 'fusion_layer.vision_param', - 'gamma_l': 'fusion_layer.text_param', - 'layer_norm_v.weight': 'fusion_layer.layer_norm_vision.weight', - 'layer_norm_v.bias': 'fusion_layer.layer_norm_vision.bias', - 'layer_norm_l.weight': 'fusion_layer.layer_norm_text.weight', - 'layer_norm_l.bias': 'fusion_layer.layer_norm_text.bias', - 'attn.v_proj.weight': 'fusion_layer.attn.vision_proj.weight', - 'attn.v_proj.bias': 'fusion_layer.attn.vision_proj.bias', - 'attn.l_proj.weight': 'fusion_layer.attn.text_proj.weight', - 'attn.l_proj.bias': 'fusion_layer.attn.text_proj.bias', - 'attn.values_v_proj.weight': 'fusion_layer.attn.values_vision_proj.weight', - 'attn.values_v_proj.bias': 'fusion_layer.attn.values_vision_proj.bias', - 'attn.values_l_proj.weight': 'fusion_layer.attn.values_text_proj.weight', - 'attn.values_l_proj.bias': 'fusion_layer.attn.values_text_proj.bias', - 'attn.out_v_proj.weight': 'fusion_layer.attn.out_vision_proj.weight', - 'attn.out_v_proj.bias': 'fusion_layer.attn.out_vision_proj.bias', - 'attn.out_l_proj.weight': 'fusion_layer.attn.out_text_proj.weight', - 'attn.out_l_proj.bias': 'fusion_layer.attn.out_text_proj.bias', - } - for layer in range(config.encoder_layers): - # deformable - for src, dest in deformable_key_mappings.items(): - rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", - f"model.encoder.layers.{layer}.{dest}")) - # text enhance - for src, dest in text_enhancer_key_mappings.items(): - rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", - f"model.encoder.layers.{layer}.{dest}")) - # fusion layers - for src, dest in fusion_key_mappings.items(): - rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", - f"model.encoder.layers.{layer}.{dest}")) - ########################################## ENCODER - END - - ########################################## DECODER - START - key_mappings_decoder = { - 'cross_attn.sampling_offsets.weight': 'encoder_attn.sampling_offsets.weight', - 'cross_attn.sampling_offsets.bias': 'encoder_attn.sampling_offsets.bias', - 'cross_attn.attention_weights.weight': 'encoder_attn.attention_weights.weight', - 'cross_attn.attention_weights.bias': 'encoder_attn.attention_weights.bias', - 'cross_attn.value_proj.weight': 'encoder_attn.value_proj.weight', - 'cross_attn.value_proj.bias': 'encoder_attn.value_proj.bias', - 'cross_attn.output_proj.weight': 'encoder_attn.output_proj.weight', - 'cross_attn.output_proj.bias': 'encoder_attn.output_proj.bias', - 'norm1.weight': 'encoder_attn_layer_norm.weight', - 'norm1.bias': 'encoder_attn_layer_norm.bias', - 'ca_text.in_proj_weight': 'encoder_attn_text.in_proj_weight', - 'ca_text.in_proj_bias': 'encoder_attn_text.in_proj_bias', - 'ca_text.out_proj.weight': 'encoder_attn_text.out_proj.weight', - 'ca_text.out_proj.bias': 'encoder_attn_text.out_proj.bias', - 'catext_norm.weight': 'encoder_attn_text_layer_norm.weight', - 'catext_norm.bias': 'encoder_attn_text_layer_norm.bias', - 'self_attn.in_proj_weight': 'self_attn.in_proj_weight', - 'self_attn.in_proj_bias': 'self_attn.in_proj_bias', - 'self_attn.out_proj.weight': 'self_attn.out_proj.weight', - 'self_attn.out_proj.bias': 'self_attn.out_proj.bias', - 'norm2.weight': 'self_attn_layer_norm.weight', - 'norm2.bias': 'self_attn_layer_norm.bias', - 'linear1.weight': 'fc1.weight', - 'linear1.bias': 'fc1.bias', - 'linear2.weight': 'fc2.weight', - 'linear2.bias': 'fc2.bias', - 'norm3.weight': 'final_layer_norm.weight', - 'norm3.bias': 
'final_layer_norm.bias', - } - for layer_num in range(config.decoder_layers): - source_prefix_decoder = f'transformer.decoder.layers.{layer_num}.' - target_prefix_decoder = f'model.decoder.layers.{layer_num}.' - - for source_name, target_name in key_mappings_decoder.items(): - rename_keys.append((source_prefix_decoder + source_name, - target_prefix_decoder + target_name)) - ########################################## DECODER - END - - ########################################## Additional - START - for layer_name, params in state_dict.items(): - #### TEXT BACKBONE - if "bert" in layer_name: - rename_keys.append((layer_name, layer_name.replace("bert", "model.text_backbone"))) - #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE - if "input_proj" in layer_name: - rename_keys.append((layer_name, layer_name.replace("input_proj", "model.input_proj_vision"))) - #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE - if "feat_map" in layer_name: - rename_keys.append((layer_name, layer_name.replace("feat_map", "model.text_projection"))) - #### DECODER REFERENCE POINT HEAD - if "transformer.decoder.ref_point_head" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", - "model.decoder.reference_points_head"))) - #### DECODER BBOX EMBED - if "transformer.decoder.bbox_embed" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", - "model.decoder.bbox_embed"))) - if "transformer.enc_output" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer", "model"))) - - if "transformer.enc_out_bbox_embed" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", - "model.encoder_output_bbox_embed"))) - - rename_keys.append(("transformer.level_embed", "model.level_embed")) - rename_keys.append(("transformer.decoder.norm.weight", "model.decoder.layer_norm.weight")) - rename_keys.append(("transformer.decoder.norm.bias", "model.decoder.layer_norm.bias")) - rename_keys.append(("transformer.tgt_embed.weight", "model.query_position_embeddings.weight")) - ########################################## Additional - END - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v_encoder(state_dict, config): - ########################################## VISION BACKBONE - START - embed_dim = config.backbone_config.embed_dim - for layer, depth in enumerate(config.backbone_config.depths): - hidden_size = embed_dim * 2**layer - for block in range(depth): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight" - ] = in_proj_weight[:hidden_size, :] - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias" - ] = in_proj_bias[:hidden_size] - - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight" - ] = in_proj_weight[hidden_size : hidden_size * 2, :] - state_dict[ - 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias" - ] = in_proj_bias[hidden_size : hidden_size * 2] - - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight" - ] = in_proj_weight[-hidden_size:, :] - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias" - ] = in_proj_bias[-hidden_size:] - ########################################## VISION BACKBONE - END - - -def read_in_q_k_v_text_enhancer(state_dict, config): - hidden_size = config.hidden_size - for idx in range(config.encoder_layers): - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.query.weight"] = in_proj_weight[ - :hidden_size, : - ] - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.query.bias"] = in_proj_bias[:hidden_size] - - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.key.bias"] = in_proj_bias[ - hidden_size : hidden_size * 2 - ] - - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.value.weight"] = in_proj_weight[ - -hidden_size:, : - ] - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.value.bias"] = in_proj_bias[ - -hidden_size: - ] - - -def read_in_q_k_v_decoder(state_dict, config): - hidden_size = config.hidden_size - for idx in range(config.decoder_layers): - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"model.decoder.layers.{idx}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"model.decoder.layers.{idx}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.decoder.layers.{idx}.self_attn.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"model.decoder.layers.{idx}.self_attn.query.bias"] = in_proj_bias[:hidden_size] - - state_dict[f"model.decoder.layers.{idx}.self_attn.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.decoder.layers.{idx}.self_attn.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - - state_dict[f"model.decoder.layers.{idx}.self_attn.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"model.decoder.layers.{idx}.self_attn.value.bias"] = in_proj_bias[-hidden_size:] - - # read in weights + bias of cross-attention - in_proj_weight = state_dict.pop(f"model.decoder.layers.{idx}.encoder_attn_text.in_proj_weight") - in_proj_bias = state_dict.pop(f"model.decoder.layers.{idx}.encoder_attn_text.in_proj_bias") - - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.query.bias"] = in_proj_bias[:hidden_size] - - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.key.weight"] = 
in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.key.bias"] = in_proj_bias[ - hidden_size : hidden_size * 2 - ] - - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.value.bias"] = in_proj_bias[-hidden_size:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image - - -def preprocess_caption(caption: str) -> str: - result = caption.lower().strip() - if result.endswith("."): - return result - return result + "." - - -@torch.no_grad() -def convert_grounding_dino_checkpoint(args): - model_name = args.model_name - pytorch_dump_folder_path = args.pytorch_dump_folder_path - push_to_hub = args.push_to_hub - verify_logits = args.verify_logits - - checkpoint_mapping = { - "grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swint_ogc.pth", - "grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swinb_cogcoor.pth", - } - # Define default GroundingDino configuation - config = get_grounding_dino_config(model_name) - - # Load original checkpoint - checkpoint_url = checkpoint_mapping[model_name] - original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] - original_state_dict = {k.replace("module.", ""): v for k, v in original_state_dict.items()} - - for name, param in original_state_dict.items(): - print(name, param.shape) - - # Rename keys - new_state_dict = original_state_dict.copy() - rename_keys = create_rename_keys(original_state_dict, config) - - for src, dest in rename_keys: - rename_key(new_state_dict, src, dest) - read_in_q_k_v_encoder(new_state_dict, config) - read_in_q_k_v_text_enhancer(new_state_dict, config) - read_in_q_k_v_decoder(new_state_dict, config) - - # Load HF model - model = GroundingDinoForObjectDetection(config) - model.eval() - missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - - # Load and process test image - image = prepare_img() - transforms = T.Compose([T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) - original_pixel_values = transforms(image).unsqueeze(0) - - image_processor = GroundingDinoImageProcessor() - tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") - processor = GroundingDinoProcessor(image_processor=image_processor, tokenizer=tokenizer) - - text = "a cat" - inputs = processor(images=image, text=preprocess_caption(text), return_tensors="pt") - - assert torch.allclose(original_pixel_values, inputs.pixel_values, atol=1e-4) - - if verify_logits: - # Running forward - with torch.no_grad(): - outputs = model(**inputs) - - print(outputs.logits[0, :3, :3]) - - expected_slice = torch.tensor( - [[-4.8913, -0.1900, -0.2161], [-4.9653, -0.3719, -0.3950], [-5.9599, -3.3765, -3.3104]] - ) - - assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model.push_to_hub(f"EduardoPacheco/{model_name}") - 
processor.push_to_hub(f"EduardoPacheco/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="grounding-dino-tiny", - type=str, - choices=["grounding-dino-tiny", "grounding-dino-base"], - help="Name of the GroundingDino model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - parser.add_argument( - "--verify_logits", action="store_false", help="Whether or not to verify logits after conversion." - ) - - args = parser.parse_args() - convert_grounding_dino_checkpoint(args) diff --git a/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py b/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py deleted file mode 100644 index 059f10f6129b..000000000000 --- a/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py +++ /dev/null @@ -1,217 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Convert GroupViT checkpoints from the original repository. 
- -URL: https://github.com/NVlabs/GroupViT -""" - -import argparse - -import requests -import torch -from PIL import Image - -from transformers import CLIPProcessor, GroupViTConfig, GroupViTModel - - -def rename_key(name): - # vision encoder - if "img_encoder.pos_embed" in name: - name = name.replace("img_encoder.pos_embed", "vision_model.embeddings.position_embeddings") - if "img_encoder.patch_embed.proj" in name: - name = name.replace("img_encoder.patch_embed.proj", "vision_model.embeddings.patch_embeddings.projection") - if "img_encoder.patch_embed.norm" in name: - name = name.replace("img_encoder.patch_embed.norm", "vision_model.embeddings.layernorm") - if "img_encoder.layers" in name: - name = name.replace("img_encoder.layers", "vision_model.encoder.stages") - if "blocks" in name and "res" not in name: - name = name.replace("blocks", "layers") - if "attn" in name and "pre_assign" not in name: - name = name.replace("attn", "self_attn") - if "proj" in name and "self_attn" in name and "text" not in name: - name = name.replace("proj", "out_proj") - if "pre_assign_attn.attn.proj" in name: - name = name.replace("pre_assign_attn.attn.proj", "pre_assign_attn.attn.out_proj") - if "norm1" in name: - name = name.replace("norm1", "layer_norm1") - if "norm2" in name and "pre_assign" not in name: - name = name.replace("norm2", "layer_norm2") - if "img_encoder.norm" in name: - name = name.replace("img_encoder.norm", "vision_model.layernorm") - # text encoder - if "text_encoder.token_embedding" in name: - name = name.replace("text_encoder.token_embedding", "text_model.embeddings.token_embedding") - if "text_encoder.positional_embedding" in name: - name = name.replace("text_encoder.positional_embedding", "text_model.embeddings.position_embedding.weight") - if "text_encoder.transformer.resblocks." in name: - name = name.replace("text_encoder.transformer.resblocks.", "text_model.encoder.layers.") - if "ln_1" in name: - name = name.replace("ln_1", "layer_norm1") - if "ln_2" in name: - name = name.replace("ln_2", "layer_norm2") - if "c_fc" in name: - name = name.replace("c_fc", "fc1") - if "c_proj" in name: - name = name.replace("c_proj", "fc2") - if "text_encoder" in name: - name = name.replace("text_encoder", "text_model") - if "ln_final" in name: - name = name.replace("ln_final", "final_layer_norm") - # projection layers - if "img_projector.linear_hidden." in name: - name = name.replace("img_projector.linear_hidden.", "visual_projection.") - if "img_projector.linear_out." 
in name: - name = name.replace("img_projector.linear_out.", "visual_projection.3.") - if "text_projector.linear_hidden" in name: - name = name.replace("text_projector.linear_hidden", "text_projection") - if "text_projector.linear_out" in name: - name = name.replace("text_projector.linear_out", "text_projection.3") - - return name - - -def convert_state_dict(orig_state_dict, config): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - # weights and biases of the key, value and query projections of vision encoder's attention layers require special treatment: - # we need to split them up into separate matrices/vectors - key_split = key.split(".") - stage_num, layer_num = int(key_split[2]), int(key_split[4]) - dim = config.vision_config.hidden_size - if "weight" in key: - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.q_proj.weight" - ] = val[:dim, :] - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.k_proj.weight" - ] = val[dim : dim * 2, :] - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.v_proj.weight" - ] = val[-dim:, :] - else: - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.q_proj.bias" - ] = val[:dim] - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.k_proj.bias" - ] = val[dim : dim * 2] - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.v_proj.bias" - ] = val[-dim:] - elif "in_proj" in key: - # weights and biases of the key, value and query projections of text encoder's attention layers require special treatment: - # we need to split them up into separate matrices/vectors - key_split = key.split(".") - layer_num = int(key_split[3]) - dim = config.text_config.hidden_size - if "weight" in key: - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ - dim : dim * 2, : - ] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] - else: - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] - else: - new_name = rename_key(key) - # squeeze if necessary - if ( - "text_projection.0" in new_name - or "text_projection.3" in new_name - or "visual_projection.0" in new_name - or "visual_projection.3" in new_name - ): - orig_state_dict[new_name] = val.squeeze_() - else: - orig_state_dict[new_name] = val - - return orig_state_dict - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_groupvit_checkpoint( - checkpoint_path, pytorch_dump_folder_path, model_name="groupvit-gcc-yfcc", push_to_hub=False -): - """ - Copy/paste/tweak model's weights to the Transformers design. 
-    """
-    config = GroupViTConfig()
-    model = GroupViTModel(config).eval()
-
-    state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
-    new_state_dict = convert_state_dict(state_dict, config)
-    missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
-    assert missing_keys == ["text_model.embeddings.position_ids"]
-    assert (unexpected_keys == ["multi_label_logit_scale"]) or (len(unexpected_keys) == 0)
-
-    # verify result
-    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
-    image = prepare_img()
-    inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt")
-
-    with torch.no_grad():
-        outputs = model(**inputs)
-
-    if model_name == "groupvit-gcc-yfcc":
-        expected_logits = torch.tensor([[13.3523, 6.3629]])
-    elif model_name == "groupvit-gcc-redcaps":
-        expected_logits = torch.tensor([[16.1873, 8.6230]])
-    else:
-        raise ValueError(f"Model name {model_name} not supported.")
-    assert torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)
-
-    processor.save_pretrained(pytorch_dump_folder_path)
-    model.save_pretrained(pytorch_dump_folder_path)
-    print("Successfully saved processor and model to", pytorch_dump_folder_path)
-
-    if push_to_hub:
-        print("Pushing to the hub...")
-        processor.push_to_hub(model_name, organization="nielsr")
-        model.push_to_hub(model_name, organization="nielsr")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--pytorch_dump_folder_path", default=None, type=str, help="Path to dump the processor and PyTorch model."
-    )
-    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to GroupViT checkpoint")
-    parser.add_argument(
-        "--model_name",
-        default="groupvit-gcc-yfcc",
-        type=str,
-        help="Name of the model. Expecting either 'groupvit-gcc-yfcc' or 'groupvit-gcc-redcaps'",
-    )
-    parser.add_argument(
-        "--push_to_hub",
-        action="store_true",
-        help="Whether or not to push the converted model and processor to the đŸ€— hub using the provided `model_name`.",
-    )
-    args = parser.parse_args()
-
-    convert_groupvit_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.model_name, args.push_to_hub)
diff --git a/src/transformers/models/hiera/convert_hiera_to_hf.py b/src/transformers/models/hiera/convert_hiera_to_hf.py
deleted file mode 100644
index eed27645b344..000000000000
--- a/src/transformers/models/hiera/convert_hiera_to_hf.py
+++ /dev/null
@@ -1,369 +0,0 @@
-# coding=utf-8
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert Hiera checkpoints from the original repository.
- -URL: https://github.com/facebookresearch/hiera -""" - -import argparse -import json -import math -from typing import Dict, Tuple - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import BitImageProcessor, HieraConfig, HieraForImageClassification, HieraForPreTraining, HieraModel -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config: HieraConfig, base_model: bool, mae_model: bool): - rename_keys = [] - # fmt: off - num_stages = len(config.depths) - # embedding dimensions for input and stages - dims = [config.embed_dim] + [int(config.embed_dim * config.embed_dim_multiplier**i) for i in range(num_stages)] - - global_layer_idx = 0 - for stage_idx in range(num_stages): - dim_in = dims[stage_idx] - dim_out = dims[stage_idx + 1] - for layer_idx in range(config.depths[stage_idx]): - rename_keys.append((f"blocks.{global_layer_idx}.norm1.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_before.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.norm1.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_before.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.attn.qkv.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.qkv.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.attn.qkv.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.qkv.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.attn.proj.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.proj.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.attn.proj.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.proj.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.norm2.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_after.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.norm2.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_after.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc1.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc1.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc2.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc2.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc2.bias")) - - # projection layer only for the first layer of each stage boundary (except the first stage) - if dim_out != dim_in and layer_idx == 0: - rename_keys.append((f"blocks.{global_layer_idx}.proj.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.proj.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.proj.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.proj.bias")) - - global_layer_idx += 1 - - # projection layer + position embeddings - rename_keys.extend( - [ - ("patch_embed.proj.weight", "hiera.embeddings.patch_embeddings.projection.weight"), - ("patch_embed.proj.bias", "hiera.embeddings.patch_embeddings.projection.bias") - ] - ) - - rename_keys.append(("pos_embed", 
"hiera.embeddings.position_embeddings")) - - if base_model: - # layernorm + pooler - rename_keys.extend([("norm.weight", "pooler.layernorm.weight"), ("norm.bias", "pooler.layernorm.bias")]) - # if just the base model, we should remove "hiera" from all keys that start with "hiera" - rename_keys = [(pair[0], pair[1][6:]) if pair[1].startswith("hiera") else pair for pair in rename_keys] - elif mae_model: - rename_keys.extend( - [ - ("encoder_norm.weight", "encoder_norm.weight"), - ("encoder_norm.bias", "encoder_norm.bias"), - ("mask_token", "decoder.mask_token"), - ("decoder_pos_embed", "decoder.decoder_position_embeddings"), - ("decoder_norm.weight", "decoder.decoder_norm.weight"), - ("decoder_norm.bias", "decoder.decoder_norm.bias"), - ("decoder_pred.weight", "decoder.decoder_pred.weight"), - ("decoder_pred.bias", "decoder.decoder_pred.bias"), - ("decoder_embed.weight", "decoder.decoder_embeddings.weight"), - ("decoder_embed.bias", "decoder.decoder_embeddings.bias") - ] - ) - for i in range(config.decoder_depth): - rename_keys.extend( - [ - (f"decoder_blocks.{i}.norm1.weight", f"decoder.decoder_block.layers.{i}.layernorm_before.weight"), - (f"decoder_blocks.{i}.norm1.bias", f"decoder.decoder_block.layers.{i}.layernorm_before.bias"), - (f"decoder_blocks.{i}.attn.qkv.weight", f"decoder.decoder_block.layers.{i}.attn.qkv.weight"), - (f"decoder_blocks.{i}.attn.qkv.bias", f"decoder.decoder_block.layers.{i}.attn.qkv.bias"), - (f"decoder_blocks.{i}.attn.proj.weight", f"decoder.decoder_block.layers.{i}.attn.proj.weight"), - (f"decoder_blocks.{i}.attn.proj.bias", f"decoder.decoder_block.layers.{i}.attn.proj.bias"), - (f"decoder_blocks.{i}.norm2.weight", f"decoder.decoder_block.layers.{i}.layernorm_after.weight"), - (f"decoder_blocks.{i}.norm2.bias", f"decoder.decoder_block.layers.{i}.layernorm_after.bias"), - (f"decoder_blocks.{i}.mlp.fc1.weight", f"decoder.decoder_block.layers.{i}.mlp.fc1.weight"), - (f"decoder_blocks.{i}.mlp.fc1.bias", f"decoder.decoder_block.layers.{i}.mlp.fc1.bias"), - (f"decoder_blocks.{i}.mlp.fc2.weight", f"decoder.decoder_block.layers.{i}.mlp.fc2.weight"), - (f"decoder_blocks.{i}.mlp.fc2.bias", f"decoder.decoder_block.layers.{i}.mlp.fc2.bias"), - ] - ) - for i in range(config.num_query_pool): - rename_keys.extend( - [ - (f"multi_scale_fusion_heads.{i}.weight", f"multiscale_fusion.multi_scale_fusion_heads.{i}.weight"), - (f"multi_scale_fusion_heads.{i}.bias", f"multiscale_fusion.multi_scale_fusion_heads.{i}.bias") - ] - ) - else: - # layernorm + classification head - rename_keys.extend( - [ - ("norm.weight", "hiera.pooler.layernorm.weight"), - ("norm.bias", "hiera.pooler.layernorm.bias"), - ("head.projection.weight", "classifier.weight"), - ("head.projection.bias", "classifier.bias"), - ] - ) - # fmt: on - return rename_keys - - -def remove_classification_head_(state_dict): - ignore_keys = ["head.projection.weight", "head.projection.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def get_labels_for_classifier(model_name: str) -> Tuple[Dict[int, str], Dict[str, int], int]: - repo_id = "huggingface/label-files" - - filename = "imagenet-1k-id2label.json" - - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in 
id2label.items()} - label2id = {v: k for k, v in id2label.items()} - num_labels = len(id2label) - - return id2label, label2id, num_labels - - -def get_hiera_config(model_name: str, base_model: bool, mae_model: bool) -> HieraConfig: - if model_name == "hiera-tiny-224": - config = HieraConfig(depths=[1, 2, 7, 2]) - elif model_name == "hiera-small-224": - config = HieraConfig(depths=[1, 2, 11, 2]) - elif model_name == "hiera-base-224": - config = HieraConfig() - elif model_name == "hiera-base-plus-224": - config = HieraConfig(embed_dim=112, num_heads=[2, 4, 8, 16]) - elif model_name == "hiera-large-224": - config = HieraConfig(embed_dim=144, num_heads=[2, 4, 8, 16], depths=[2, 6, 36, 4]) - elif model_name == "hiera-huge-224": - config = HieraConfig(embed_dim=256, num_heads=[4, 8, 16, 32], depths=[2, 6, 36, 4]) - else: - raise ValueError(f"Unrecognized model name: {model_name}") - - if base_model: - pass - elif mae_model: - config.num_query_pool = 2 - config.decoder_hidden_size = 512 - config.decoder_depth = 8 - config.decoder_num_heads = 16 - # Table 3b from Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles - config.mask_ratio = 0.6 - else: - id2label, label2id, num_labels = get_labels_for_classifier(model_name) - config.id2label = id2label - config.label2id = label2id - config.num_labels = num_labels - - return config - - -@torch.no_grad() -def convert_hiera_checkpoint(args): - model_name = args.model_name - base_model = args.base_model - pytorch_dump_folder_path = args.pytorch_dump_folder_path - push_to_hub = args.push_to_hub - mae_model = args.mae_model - - config = get_hiera_config(model_name, base_model, mae_model) - - # Load original hiera model - original_model_name = model_name.replace("-", "_") - original_model_name = f"mae_{original_model_name}" if mae_model else original_model_name - - original_checkpoint_name = "mae_in1k_ft_in1k" if not (base_model or mae_model) else "mae_in1k" - - original_model = torch.hub.load( - "facebookresearch/hiera", - model=original_model_name, - pretrained=True, - checkpoint=original_checkpoint_name, - ) - - original_model.eval() - original_state_dict = original_model.state_dict() - # Don't need to remove head for MAE because original implementation doesn't have it on MAE - if base_model: - remove_classification_head_(original_state_dict) - - # # Rename keys - new_state_dict = original_state_dict.copy() - rename_keys = create_rename_keys(config, base_model, mae_model) - - for src, dest in rename_keys: - rename_key(new_state_dict, src, dest) - - # Load HF hiera model - if base_model: - model = HieraModel(config) - elif mae_model: - model = HieraForPreTraining(config) - else: - model = HieraForImageClassification(config) - - model.eval() - - missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - - input_image = prepare_img() - - original_image_preprocessor = transforms.Compose( - [ - transforms.Resize(int((256 / 224) * 224), interpolation=transforms.functional.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ] - ) - - image_processor = BitImageProcessor( - image_mean=IMAGENET_DEFAULT_MEAN, image_std=IMAGENET_DEFAULT_STD, size={"shortest_edge": 256} - ) - inputs = image_processor(images=input_image, return_tensors="pt") - - expected_pixel_values = original_image_preprocessor(input_image).unsqueeze(0) - - input_image = 
prepare_img() - - inputs = image_processor(images=input_image, return_tensors="pt") - expected_pixel_values = original_image_preprocessor(input_image).unsqueeze(0) - assert torch.allclose(inputs.pixel_values, expected_pixel_values, atol=1e-4) - print("Pixel values look good!") - print(f"{inputs.pixel_values[0, :3, :3, :3]=}") - - # If is MAE we pass a noise to generate a random mask - mask_spatial_shape = [ - i // s // ms for i, s, ms in zip(config.image_size, config.patch_stride, config.masked_unit_size) - ] - num_windows = math.prod(mask_spatial_shape) - torch.manual_seed(2) - noise = torch.rand(1, num_windows) - outputs = model(**inputs) if not mae_model else model(noise=noise, **inputs) - # original implementation returns logits.softmax(dim=-1) - - if base_model: - expected_prob, expected_intermediates = original_model(expected_pixel_values, return_intermediates=True) - expected_last_hidden = expected_intermediates[-1] - batch_size, _, _, hidden_dim = expected_last_hidden.shape - expected_last_hidden = expected_last_hidden.reshape(batch_size, -1, hidden_dim) - assert torch.allclose(outputs.last_hidden_state, expected_last_hidden, atol=1e-3) - print("Base Model looks good as hidden states match original implementation!") - print(f"{outputs.last_hidden_state[0, :3, :3]=}") - elif mae_model: - # get mask from noise to be able to compare outputs - mask, _ = model.hiera.embeddings.patch_embeddings.random_masking(expected_pixel_values, noise) - expected_loss, _, _, _ = original_model(expected_pixel_values, mask=mask.bool()) - assert torch.allclose(outputs.loss, expected_loss, atol=1e-3) - print("MAE Model looks good as loss matches original implementation!") - else: - expected_prob = original_model(expected_pixel_values) - assert torch.allclose(outputs.logits.softmax(dim=-1), expected_prob, atol=1e-3) - print("Classifier looks good as probs match original implementation") - print(f"{outputs.logits[:, :5]=}") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor for {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - hub_name = model_name - if base_model: - hub_name = model_name - elif mae_model: - hub_name = f"{model_name}-mae" - else: - hub_name = f"{model_name}-in1k" - repo_id = f"EduardoPacheco/{hub_name}" - print(f"Pushing model and processor for {model_name} to hub at {repo_id}") - model.push_to_hub(repo_id) - image_processor.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model-name", - default="hiera-tiny-224", - type=str, - choices=[ - "hiera-tiny-224", - "hiera-small-224", - "hiera-base-224", - "hiera-base-plus-224", - "hiera-large-224", - "hiera-huge-224", - ], - help="Name of the Hiera model you'd like to convert.", - ) - parser.add_argument( - "--pytorch-dump-folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--verify-logits", - action="store_true", - help="Whether or not to verify the logits against the original implementation.", - ) - parser.add_argument( - "--push-to-hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." 
- ) - parser.add_argument( - "--base-model", - action="store_true", - help="Whether to only convert the base model (no projection head weights).", - ) - parser.add_argument( - "--mae-model", action="store_true", help="Whether to convert to MAE checkpoint to HieraForPreTraining." - ) - - args = parser.parse_args() - convert_hiera_checkpoint(args) diff --git a/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py deleted file mode 100644 index f5914f35c546..000000000000 --- a/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py +++ /dev/null @@ -1,222 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Hubert checkpoint.""" - -import argparse - -import torch -from s3prl.hub import distilhubert - -from transformers import HubertConfig, HubertModel, Wav2Vec2FeatureExtractor, logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -MAPPING = { - "post_extract_proj": "feature_projection.projection", - "encoder.pos_conv.0": "encoder.pos_conv_embed.conv", - "self_attn.k_proj": "encoder.layers.*.attention.k_proj", - "self_attn.v_proj": "encoder.layers.*.attention.v_proj", - "self_attn.q_proj": "encoder.layers.*.attention.q_proj", - "self_attn.out_proj": "encoder.layers.*.attention.out_proj", - "self_attn_layer_norm": "encoder.layers.*.layer_norm", - "fc1": "encoder.layers.*.feed_forward.intermediate_dense", - "fc2": "encoder.layers.*.feed_forward.output_dense", - "final_layer_norm": "encoder.layers.*.final_layer_norm", - "encoder.layer_norm": "encoder.layer_norm", - "mask_emb": "masked_spec_embed", -} - - -def set_recursively(hf_pointer, key, value, full_name, weight_type): - for attribute in key.split("."): - hf_pointer = getattr(hf_pointer, attribute) - - if weight_type is not None: - hf_shape = getattr(hf_pointer, weight_type).shape - else: - hf_shape = hf_pointer.shape - - assert hf_shape == value.shape, ( - f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be" - f" {value.shape} for {full_name}" - ) - - if weight_type == "weight": - hf_pointer.weight.data = value - elif weight_type == "weight_g": - hf_pointer.weight_g.data = value - elif weight_type == "weight_v": - hf_pointer.weight_v.data = value - elif weight_type == "bias": - hf_pointer.bias.data = value - else: - hf_pointer.data = value - - logger.info(f"{key + '.' 
+ weight_type if weight_type is not None else ''} was initialized from {full_name}.") - - -def recursively_load_weights(fairseq_model, hf_model): - unused_weights = [] - fairseq_dict = fairseq_model.state_dict() - - feature_extractor = hf_model.feature_extractor - - for name, value in fairseq_dict.items(): - is_used = False - if "conv_layers" in name: - load_conv_layer( - name, - value, - feature_extractor, - unused_weights, - hf_model.config.feat_extract_norm == "group", - ) - is_used = True - else: - for key, mapped_key in MAPPING.items(): - mapped_key = mapped_key - - if key in name: - is_used = True - if "*" in mapped_key: - layer_index = name.split(key)[0].split(".")[-2] - mapped_key = mapped_key.replace("*", layer_index) - if "weight_g" in name: - weight_type = "weight_g" - elif "weight_v" in name: - weight_type = "weight_v" - elif "weight" in name: - weight_type = "weight" - elif "bias" in name: - weight_type = "bias" - else: - weight_type = None - set_recursively(hf_model, mapped_key, value, name, weight_type) - continue - if not is_used: - unused_weights.append(name) - - logger.warning(f"Unused weights: {unused_weights}") - - -def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm): - name = full_name.split("conv_layers.")[-1] - items = name.split(".") - layer_id = int(items[0]) - type_id = int(items[1]) - - if type_id == 0: - if "bias" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.bias.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.weight.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm): - if "bias" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, ( - f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was" - " found." - ) - feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found." 
- ) - feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - else: - unused_weights.append(full_name) - - -def convert_config(model): - config = HubertConfig() - fs_config = model.config - - config.activation_dropout = fs_config.activation_dropout - config.apply_spec_augment = False - config.attention_dropout = fs_config.attention_dropout - config.conv_bias = False - conv_layers = eval(fs_config.extractor_conv_feature_layers) - config.conv_dim = [x[0] for x in conv_layers] - config.conv_kernel = [x[1] for x in conv_layers] - config.conv_stride = [x[2] for x in conv_layers] - config.feat_extract_activation = "gelu" - config.feat_extract_norm = "layer" if fs_config.extractor_mode == "layer_norm" else "group" - config.feat_proj_layer_norm = False - config.feat_proj_dropout = 0.0 - config.final_dropout = 0.0 - config.hidden_act = fs_config.activation_fn - config.hidden_dropout = fs_config.dropout - config.hidden_size = fs_config.encoder_embed_dim - config.initializer_range = 0.02 - config.intermediate_size = fs_config.encoder_ffn_embed_dim - config.layer_norm_eps = 1e-5 - config.layerdrop = 0.0 - config.num_attention_heads = fs_config.encoder_attention_heads - config.num_conv_pos_embedding_groups = fs_config.conv_pos_groups - config.num_conv_pos_embeddings = fs_config.conv_pos - config.num_feat_extract_layers = len(conv_layers) - config.num_hidden_layers = fs_config.encoder_layers - - return config - - -@torch.no_grad() -def convert_hubert_checkpoint(pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. - """ - model = distilhubert().model.model - - if config_path is not None: - config = HubertConfig.from_pretrained(config_path) - else: - config = convert_config(model) - model = model.eval() - - feature_extractor = Wav2Vec2FeatureExtractor( - feature_size=1, - sampling_rate=16000, - padding_value=0, - do_normalize=False, - return_attention_mask=False, - ) - hf_model = HubertModel(config) - - recursively_load_weights(model, hf_model) - - feature_extractor.save_pretrained(pytorch_dump_folder_path) - hf_model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - convert_hubert_checkpoint(args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 4966340493f3..000000000000 --- a/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,261 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Hubert checkpoint.""" - -import argparse -import json -import os - -import fairseq -import torch -from fairseq.data import Dictionary - -from transformers import ( - HubertConfig, - HubertForCTC, - HubertModel, - Wav2Vec2CTCTokenizer, - Wav2Vec2FeatureExtractor, - Wav2Vec2Processor, - logging, -) - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -MAPPING = { - "post_extract_proj": "feature_projection.projection", - "encoder.pos_conv.0": "encoder.pos_conv_embed.batch_norm", - "encoder.pos_conv.1": "encoder.pos_conv_embed.conv", - "self_attn.k_proj": "encoder.layers.*.attention.k_proj", - "self_attn.v_proj": "encoder.layers.*.attention.v_proj", - "self_attn.q_proj": "encoder.layers.*.attention.q_proj", - "self_attn.out_proj": "encoder.layers.*.attention.out_proj", - "self_attn_layer_norm": "encoder.layers.*.layer_norm", - "fc1": "encoder.layers.*.feed_forward.intermediate_dense", - "fc2": "encoder.layers.*.feed_forward.output_dense", - "final_layer_norm": "encoder.layers.*.final_layer_norm", - "encoder.layer_norm": "encoder.layer_norm", - "w2v_model.layer_norm": "feature_projection.layer_norm", - "w2v_encoder.proj": "lm_head", - "mask_emb": "masked_spec_embed", -} - - -def set_recursively(hf_pointer, key, value, full_name, weight_type): - for attribute in key.split("."): - hf_pointer = getattr(hf_pointer, attribute) - - if weight_type is not None: - hf_shape = getattr(hf_pointer, weight_type).shape - else: - hf_shape = hf_pointer.shape - - assert hf_shape == value.shape, ( - f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be" - f" {value.shape} for {full_name}" - ) - - if weight_type == "weight": - hf_pointer.weight.data = value - elif weight_type == "weight_g": - hf_pointer.weight_g.data = value - elif weight_type == "weight_v": - hf_pointer.weight_v.data = value - elif weight_type == "bias": - hf_pointer.bias.data = value - elif weight_type == "running_mean": - hf_pointer.running_mean.data = value - elif weight_type == "running_var": - hf_pointer.running_var.data = value - elif weight_type == "num_batches_tracked": - hf_pointer.num_batches_tracked.data = value - else: - hf_pointer.data = value - - logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.") - - -def recursively_load_weights(fairseq_model, hf_model, is_finetuned): - unused_weights = [] - fairseq_dict = fairseq_model.state_dict() - - feature_extractor = hf_model.hubert.feature_extractor if is_finetuned else hf_model.feature_extractor - - for name, value in fairseq_dict.items(): - is_used = False - if "conv_layers" in name: - load_conv_layer( - name, - value, - feature_extractor, - unused_weights, - hf_model.config.feat_extract_norm == "group", - ) - is_used = True - else: - for key, mapped_key in MAPPING.items(): - mapped_key = "hubert." 
+ mapped_key if (is_finetuned and mapped_key != "lm_head") else mapped_key - - if key in name or (key.split("w2v_model.")[-1] == name.split(".")[0] and not is_finetuned): - is_used = True - if "*" in mapped_key: - layer_index = name.split(key)[0].split(".")[-2] - mapped_key = mapped_key.replace("*", layer_index) - if "weight_g" in name: - weight_type = "weight_g" - elif "weight_v" in name: - weight_type = "weight_v" - elif "weight" in name: - weight_type = "weight" - elif "bias" in name: - weight_type = "bias" - elif "running_mean" in name: - weight_type = "running_mean" - elif "running_var" in name: - weight_type = "running_var" - elif "num_batches_tracked" in name: - weight_type = "num_batches_tracked" - else: - weight_type = None - set_recursively(hf_model, mapped_key, value, name, weight_type) - continue - if not is_used: - unused_weights.append(name) - - logger.warning(f"Unused weights: {unused_weights}") - - -def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm): - name = full_name.split("conv_layers.")[-1] - items = name.split(".") - layer_id = int(items[0]) - type_id = int(items[1]) - - if type_id == 0: - if "bias" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.bias.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.weight.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm): - if "bias" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, ( - f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was" - " found." - ) - feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - else: - unused_weights.append(full_name) - - -@torch.no_grad() -def convert_hubert_checkpoint( - checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True -): - """ - Copy/paste/tweak model's weights to transformers design. 
-    """
-    if config_path is not None:
-        config = HubertConfig.from_pretrained(config_path)
-    else:
-        config = HubertConfig()
-
-    if is_finetuned:
-        if dict_path:
-            target_dict = Dictionary.load(dict_path)
-
-            # important change bos & pad token id since CTC symbol is <pad> and
-            # not <s> as in fairseq
-            config.bos_token_id = target_dict.pad_index
-            config.pad_token_id = target_dict.bos_index
-            config.eos_token_id = target_dict.eos_index
-            config.vocab_size = len(target_dict.symbols)
-            vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
-            if not os.path.isdir(pytorch_dump_folder_path):
-                logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
-                return
-            os.makedirs(pytorch_dump_folder_path, exist_ok=True)
-            with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
-                json.dump(target_dict.indices, vocab_handle)
-            tokenizer = Wav2Vec2CTCTokenizer(
-                vocab_path,
-                unk_token=target_dict.unk_word,
-                pad_token=target_dict.pad_word,
-                bos_token=target_dict.bos_word,
-                eos_token=target_dict.eos_word,
-                word_delimiter_token="|",
-                do_lower_case=False,
-            )
-            return_attention_mask = True if config.feat_extract_norm == "layer" else False
-            feature_extractor = Wav2Vec2FeatureExtractor(
-                feature_size=1,
-                sampling_rate=16000,
-                padding_value=0,
-                do_normalize=True,
-                return_attention_mask=return_attention_mask,
-            )
-            processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
-            processor.save_pretrained(pytorch_dump_folder_path)
-
-        hf_wav2vec = HubertForCTC(config)
-    else:
-        hf_wav2vec = HubertModel(config)
-
-    if is_finetuned:
-        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
-            [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
-        )
-    else:
-        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])
-
-    model = model[0].eval()
-
-    recursively_load_weights(model, hf_wav2vec, is_finetuned)
-
-    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
-    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
-    parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
-    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
-    parser.add_argument(
-        "--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
-    )
-    args = parser.parse_args()
-    convert_hubert_checkpoint(
-        args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
-    )
diff --git a/src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py
deleted file mode 100644
index ff15b90088af..000000000000
--- a/src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert Hubert checkpoint."""
-
-import argparse
-
-import torch
-
-from transformers import HubertConfig, HubertForSequenceClassification, Wav2Vec2FeatureExtractor, logging
-
-
-logging.set_verbosity_info()
-logger = logging.get_logger(__name__)
-
-SUPPORTED_MODELS = ["UtteranceLevel"]
-
-
-@torch.no_grad()
-def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path, model_dump_path):
-    """
-    Copy/paste/tweak model's weights to transformers design.
-    """
-    checkpoint = torch.load(checkpoint_path, map_location="cpu")
-    if checkpoint["Config"]["downstream_expert"]["modelrc"]["select"] not in SUPPORTED_MODELS:
-        raise NotImplementedError(f"The supported s3prl models are {SUPPORTED_MODELS}")
-
-    downstream_dict = checkpoint["Downstream"]
-
-    hf_config = HubertConfig.from_pretrained(config_path)
-    hf_model = HubertForSequenceClassification.from_pretrained(base_model_name, config=hf_config)
-    hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
-        base_model_name, return_attention_mask=True, do_normalize=False
-    )
-
-    if hf_config.use_weighted_layer_sum:
-        hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"]
-
-    hf_model.projector.weight.data = downstream_dict["projector.weight"]
-    hf_model.projector.bias.data = downstream_dict["projector.bias"]
-    hf_model.classifier.weight.data = downstream_dict["model.post_net.linear.weight"]
-    hf_model.classifier.bias.data = downstream_dict["model.post_net.linear.bias"]
-
-    hf_feature_extractor.save_pretrained(model_dump_path)
-    hf_model.save_pretrained(model_dump_path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--base_model_name", default=None, type=str, help="Name of the huggingface pretrained base model."
-    )
-    parser.add_argument("--config_path", default=None, type=str, help="Path to the huggingface classifier config.")
-    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to the s3prl checkpoint.")
-    parser.add_argument("--model_dump_path", default=None, type=str, help="Path to the final converted model.")
-    args = parser.parse_args()
-    convert_s3prl_checkpoint(args.base_model_name, args.config_path, args.checkpoint_path, args.model_dump_path)
diff --git a/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py b/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py
deleted file mode 100644
index ea44ee11e58c..000000000000
--- a/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import copy - -import torch -from accelerate import init_empty_weights - -from transformers import ( - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - Idefics2Config, - Idefics2ForConditionalGeneration, - Idefics2ImageProcessor, - Idefics2Processor, - MistralConfig, -) - - -EPILOG_TXT = """Example: - python transformers/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py --original_model_id HuggingFaceM4/idefics2-8b --output_hub_path org/idefics2 -""" - - -KEYS_TO_MODIFY_MAPPING = { - "lm_head.weight": "lm_head.linear.weight", - "model.layers": "model.text_model.layers", - "model.norm": "model.text_model.norm", - "model.perceiver_resampler": "model.connector.perceiver_resampler", - "model.modality_projection": "model.connector.modality_projection", -} - - -WEIGHTS_TO_MERGE_MAPPING = ( - # (weights to merge in merging order), (new weight name) - ( - ("model.embed_tokens.weight", "model.embed_tokens.additional_embedding.weight"), - "model.text_model.embed_tokens.weight", - ), - (("lm_head.linear.weight", "additional_fc.weight"), "lm_head.weight"), -) - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value - return new_state_dict - - -def merge_weights(state_dict): - new_state_dict = copy.deepcopy(state_dict) - - # Merge the weights - for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: - for weight in weights_to_merge: - assert weight in state_dict, f"Weight {weight} is missing in the state dict" - if new_weight_name not in new_state_dict: - new_state_dict[new_weight_name] = [state_dict[weight]] - else: - new_state_dict[new_weight_name].append(state_dict[weight]) - new_state_dict[new_weight_name] = torch.cat(new_state_dict[new_weight_name], dim=0) - - # Remove the weights that were merged - for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: - for weight in weights_to_merge: - if weight in new_state_dict and weight != new_weight_name: - new_state_dict.pop(weight) - - return new_state_dict - - -def get_config(checkpoint): - if checkpoint == "HuggingFaceM4/idefics2": - # We load the config then recreate to use the text_config - config = AutoConfig.from_pretrained(checkpoint) - text_config = MistralConfig( - vocab_size=config.vocab_size + config.additional_vocab_size, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - num_hidden_layers=config.num_hidden_layers, - num_attention_heads=config.num_attention_heads, - num_key_value_heads=config.num_key_value_heads, - hidden_act=config.hidden_act, - max_position_embeddings=config.max_position_embeddings, - initializer_range=config.initializer_range, - rms_norm_eps=config.rms_norm_eps, - tie_word_embeddings=config.tie_word_embeddings, - rope_theta=config.rope_theta, - sliding_window=config.sliding_window, - attention_dropout=config.attention_dropout, - pad_token_id=config.pad_token_id, - bos_token_id=config.bos_token_id, - eos_token_id=config.eos_token_id, - ) - perceiver_config = config.perceiver_config.to_dict() - config = Idefics2Config( - text_config=text_config.to_dict(), - vision_config=config.vision_config, - perceiver_config=perceiver_config, - use_cache=config.use_cache, - 
image_token_id=config.image_token_id, - tie_word_embeddings=config.tie_word_embeddings, - ) - return config - - return AutoConfig.from_pretrained(checkpoint) - - -def convert_idefics2_hub_to_hf(original_model_id, output_hub_path, push_to_hub): - # The original model maps to AutoModelForCausalLM, converted we map to Idefics2ForConditionalGeneration - original_model = AutoModelForCausalLM.from_pretrained(original_model_id, trust_remote_code=True) - # The original model doesn't use the idefics2 processing objects - image_seq_len = original_model.config.perceiver_config.resampler_n_latents - image_processor = Idefics2ImageProcessor() - tokenizer = AutoTokenizer.from_pretrained(original_model_id) - processor = Idefics2Processor( - image_processor=image_processor, - tokenizer=tokenizer, - image_seq_len=image_seq_len, - ) - state_dict = original_model.state_dict() - state_dict = convert_state_dict_to_hf(state_dict) - - # Merge weights - state_dict = merge_weights(state_dict) - - config = get_config(original_model_id) - - with init_empty_weights(): - model = Idefics2ForConditionalGeneration(config) - - model.load_state_dict(state_dict, strict=True, assign=True) - - model.save_pretrained(output_hub_path) - processor.save_pretrained(output_hub_path) - - if push_to_hub: - model.push_to_hub(output_hub_path, private=True) - processor.push_to_hub(output_hub_path, private=True) - - -def main(): - parser = argparse.ArgumentParser( - epilog=EPILOG_TXT, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--original_model_id", - help="Hub location of the text model", - ) - parser.add_argument( - "--output_hub_path", - help="Location on the hub of the converted model", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="If set, the model will be pushed to the hub after conversion.", - ) - args = parser.parse_args() - convert_idefics2_hub_to_hf(args.original_model_id, args.output_hub_path, args.push_to_hub) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py b/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py deleted file mode 100644 index 204104a58b30..000000000000 --- a/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
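Both Idefics converters here follow the same two-step pattern: rename checkpoint keys by substring replacement, then merge the extra embedding rows into the base embedding with a single concatenation. A toy sketch of that pattern, with invented shapes and key names rather than the real Idefics2/Idefics3 tensors:

```python
import torch

# hypothetical mappings, mirroring the shape of KEYS_TO_MODIFY_MAPPING / WEIGHTS_TO_MERGE_MAPPING
KEYS_TO_MODIFY = {"model.layers": "model.text_model.layers"}
WEIGHTS_TO_MERGE = (
    (
        ("model.embed_tokens.weight", "model.embed_tokens.additional_embedding.weight"),
        "model.text_model.embed_tokens.weight",
    ),
)

state_dict = {
    "model.layers.0.mlp.weight": torch.randn(4, 4),
    "model.embed_tokens.weight": torch.randn(32000, 8),                    # base vocab rows
    "model.embed_tokens.additional_embedding.weight": torch.randn(64, 8),  # extra special-token rows
}

# step 1: rename keys by substring replacement
renamed = {}
for key, value in state_dict.items():
    for old, new in KEYS_TO_MODIFY.items():
        if old in key:
            key = key.replace(old, new)
    renamed[key] = value

# step 2: merge each listed group by concatenating along dim 0, then drop the parts
for parts, merged_name in WEIGHTS_TO_MERGE:
    renamed[merged_name] = torch.cat([renamed.pop(p) for p in parts], dim=0)

print(renamed["model.text_model.embed_tokens.weight"].shape)  # torch.Size([32064, 8])
```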
- -import argparse -import json - -import torch -from accelerate import init_empty_weights -from huggingface_hub import hf_hub_download - -from transformers import ( - AutoModelForCausalLM, - AutoTokenizer, - Idefics3Config, - Idefics3ForConditionalGeneration, - Idefics3ImageProcessor, - Idefics3Processor, - LlamaConfig, -) - - -EPILOG_TXT = """Example: - python transformers/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py --original_model_id HuggingFaceM4/Idefics3-8B-Llama3 --output_hub_path org/idefics3 -""" - - -KEYS_TO_MODIFY_MAPPING = { - "lm_head.weight": "lm_head.linear.weight", - "model.layers": "model.text_model.layers", - "model.norm": "model.text_model.norm", - "model.modality_projection": "model.connector.modality_projection", -} - - -WEIGHTS_TO_MERGE_MAPPING = ( - # (weights to merge in merging order), (new weight name) - ( - ("model.embed_tokens.weight", "model.embed_tokens.additional_embedding.weight"), - "model.text_model.embed_tokens.weight", - ), - (("lm_head.linear.weight", "additional_fc.weight"), "lm_head.weight"), -) - -WEIGHTS_TO_DROP = ( - # The original model had a vision head, but this is never used - "model.vision_model.head", -) - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - old_state_dict_keys = set(state_dict.keys()) - - # Flattened list of weights to merge. We keep these in the original state dict to merge them later - original_weights_to_merge = [w for weights in WEIGHTS_TO_MERGE_MAPPING for w in weights[0]] - - # for key, value in state_dict.items(): - for old_key in old_state_dict_keys: - if old_key.endswith(".inv_freq") or any(w in old_key for w in WEIGHTS_TO_DROP): - state_dict.pop(old_key) - continue - - key = old_key - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - weight = state_dict.pop(old_key) - if key in original_weights_to_merge: - new_state_dict[key] = weight - # Bit of a hack - we need to keep the original weights to merge them later - state_dict[key] = weight - else: - new_state_dict[key] = weight - - return new_state_dict - - -def merge_weights(state_dict, new_state_dict): - old_weight_names = set(state_dict.keys()) - - # Merge the weights - for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: - for weight_to_merge in weights_to_merge: - print(weight_to_merge) - assert weight_to_merge in state_dict, f"Weight {weight_to_merge} is missing in the state dict" - - weight = state_dict.pop(weight_to_merge) - if new_weight_name not in new_state_dict: - new_state_dict[new_weight_name] = [weight] - else: - new_state_dict[new_weight_name].append(weight) - - old_weight_names.remove(weight_to_merge) - - new_state_dict[new_weight_name] = torch.cat(new_state_dict[new_weight_name], dim=0) - - # Remove the weights that were merged - for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: - for weight in weights_to_merge: - if weight in new_state_dict and weight != new_weight_name: - new_state_dict.pop(weight) - - return new_state_dict - - -def get_config(checkpoint): - # We load the config then recreate to use the text_config - - # download the config file - filepath = hf_hub_download(repo_id=checkpoint, filename="config.json") - with open(filepath, "r") as f: - config_json = json.load(f) - - # Setup the vision config - vision_config = config_json.pop("vision_config") - vision_config.pop("vision_model_name", None) - if "embed_dim" in vision_config: - vision_config["hidden_size"] = 
vision_config.pop("embed_dim") - - config_json["vocab_size"] = config_json.pop("vocab_size") + config_json.pop("additional_vocab_size") - - image_token_id = config_json.pop("image_token_id", config_json["vocab_size"] - 2) - use_cache = config_json.pop("use_cache", True) - tie_word_embeddings = config_json.pop("tie_word_embeddings", True) - scale_factor = config_json.pop("scale_factor", 2) - vocab_size = config_json.pop("vocab_size", 100000) - - # Remove "freeze" params from the config - config_json = {k: v for k, v in config_json.items() if not k.startswith("freeze_")} - text_config = LlamaConfig(**config_json) - - config = Idefics3Config( - text_config=text_config, - vision_config=vision_config, - use_cache=use_cache, - image_token_id=image_token_id, - tie_word_embeddings=tie_word_embeddings, - scale_factor=scale_factor, - vocab_size=vocab_size, - ) - return config - - -def convert_idefics3_hub_to_hf(original_model_id, output_hub_path, push_to_hub): - # The original model maps to AutoModelForCausalLM, converted we map to Idefics3ForConditionalGeneration - original_model = AutoModelForCausalLM.from_pretrained( - original_model_id, trust_remote_code=True, torch_dtype=torch.bfloat16 - ) - # The original model doesn't use the Idefics3 processing objects - image_processor = Idefics3ImageProcessor() - tokenizer = AutoTokenizer.from_pretrained(original_model_id) - processor = Idefics3Processor( - image_processor=image_processor, - tokenizer=tokenizer, - ) - state_dict = original_model.state_dict() - new_state_dict = convert_state_dict_to_hf(state_dict) - - # Merge weights - new_state_dict = merge_weights(state_dict, new_state_dict) - del state_dict - - config = get_config(original_model_id) - print(config) - - with init_empty_weights(): - model = Idefics3ForConditionalGeneration(config) - - model.load_state_dict(new_state_dict, strict=True, assign=True) - - model.save_pretrained(output_hub_path) - processor.save_pretrained(output_hub_path) - - if push_to_hub: - model.push_to_hub(output_hub_path, private=True) - processor.push_to_hub(output_hub_path, private=True) - - -def main(): - parser = argparse.ArgumentParser( - epilog=EPILOG_TXT, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--original_model_id", - help="Hub location of the text model", - ) - parser.add_argument( - "--output_hub_path", - help="Location on the hub of the converted model", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="If set, the model will be pushed to the hub after conversion.", - ) - args = parser.parse_args() - convert_idefics3_hub_to_hf(args.original_model_id, args.output_hub_path, args.push_to_hub) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/ijepa/convert_ijepa_to_hf.py b/src/transformers/models/ijepa/convert_ijepa_to_hf.py deleted file mode 100644 index 5c15a72ff888..000000000000 --- a/src/transformers/models/ijepa/convert_ijepa_to_hf.py +++ /dev/null @@ -1,267 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert IJEPA checkpoints from the original repository. - -URL: https://github.com/facebookresearch/ijepa -""" - -import argparse -import gc -import re -from pathlib import Path - -import requests -import torch -from PIL import Image - -from transformers import ( - IJepaConfig, - IJepaModel, - ViTImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -# fmt: off -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # Projection layer + position embeddings - r"pos_embed": r"embeddings.position_embeddings", - r"patch_embed.proj.weight": r"embeddings.patch_embeddings.projection.weight", - r"patch_embed.proj.bias": r"embeddings.patch_embeddings.projection.bias", - - # Encoder layers: Layernorms, Attention, Feedforward layers - r"blocks.(\d+).norm1.weight": r"encoder.layer.\1.layernorm_before.weight", - r"blocks.(\d+).norm1.bias": r"encoder.layer.\1.layernorm_before.bias", - r"blocks.(\d+).attn.proj.weight": r"encoder.layer.\1.attention.output.dense.weight", - r"blocks.(\d+).attn.proj.bias": r"encoder.layer.\1.attention.output.dense.bias", - r"blocks.(\d+).norm2.weight": r"encoder.layer.\1.layernorm_after.weight", - r"blocks.(\d+).norm2.bias": r"encoder.layer.\1.layernorm_after.bias", - r"blocks.(\d+).mlp.fc1.weight": r"encoder.layer.\1.intermediate.dense.weight", - r"blocks.(\d+).mlp.fc1.bias": r"encoder.layer.\1.intermediate.dense.bias", - r"blocks.(\d+).mlp.fc2.weight": r"encoder.layer.\1.output.dense.weight", - r"blocks.(\d+).mlp.fc2.bias": r"encoder.layer.\1.output.dense.bias", - - # Layernorm + pooler - r"norm.weight": r"layernorm.weight", - r"norm.bias": r"layernorm.bias", -} -# fmt: on - - -def convert_old_keys_to_new_keys(state_dict_keys: dict = None): - """ - Converts old keys to new keys using the mapping and dynamically removes the 'ijepa.' prefix if necessary. - - Args: - state_dict_keys (dict): The keys from the state_dict to convert. - - Returns: - dict: A mapping from old keys to new keys. 
- """ - output_dict = {} - if state_dict_keys is not None: - old_text = "\n".join(state_dict_keys) - new_text = old_text - - # Apply regex-based mapping - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - if replacement is None: - new_text = re.sub(pattern, "", new_text) # Skip the key - continue - new_text = re.sub(pattern, replacement, new_text) - - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - - return output_dict - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :] - state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def get_ijepa_config(model_name): - patch_size = int(model_name.split("_")[1][4:]) - config = IJepaConfig(patch_size=patch_size) - if "vith" in model_name: - config.hidden_size = 1280 - config.num_hidden_layers = 32 - config.num_attention_heads = 16 - config.layer_norm_eps = 1e-6 - config.mlp_ratio = 4 - config.intermediate_size = 5120 - if model_name == "ijepa_vith16_1k": - config.image_size = 448 - elif "vitg" in model_name: - config.hidden_size = 1408 - config.num_hidden_layers = 40 - config.num_attention_heads = 16 - config.layer_norm_eps = 1e-6 - config.mlp_ratio = 48 / 11 - config.intermediate_size = 6144 - else: - raise ValueError("Model not supported, only supports huge and giant models.") - return config - - -@torch.no_grad() -def write_model(model_name, output_dir, safe_serialization, push_to_hub, verify_logits): - """ - Copy/paste/tweak model's weights to our IJEPA structure. 
- """ - - # define default IJEPA configuration - config = get_ijepa_config(model_name) - - checkpoint_mapping = { - "ijepa_vith14_1k": "https://dl.fbaipublicfiles.com/ijepa/IN1K-vit.h.14-300e.pth.tar", - "ijepa_vith14_22k": "https://dl.fbaipublicfiles.com/ijepa/IN22K-vit.h.14-900e.pth.tar", - "ijepa_vith16_1k": "https://dl.fbaipublicfiles.com/ijepa/IN1K-vit.h.16-448px-300e.pth.tar", - "ijepa_vitg16_22k": "https://dl.fbaipublicfiles.com/ijepa/IN22K-vit.g.16-600e.pth.tar", - } - - # Load original checkpoint - checkpoint_url = checkpoint_mapping[model_name] - original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["encoder"] - original_state_dict = {k.replace("module.", ""): v for k, v in original_state_dict.items()} - - # Rename keys - state_dict = original_state_dict.copy() - new_keys = convert_old_keys_to_new_keys(state_dict.keys()) - for old_key, new_key in new_keys.items(): - rename_key(state_dict, old_key, new_key) - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = IJepaModel(config, add_pooling_layer=False).eval() - model.load_state_dict(state_dict) - size = {"height": config.image_size, "width": config.image_size} - image_processor = ViTImageProcessor(size=size) - - if verify_logits: - # Check outputs on an image, prepared by ViTImageProcessor - encoding = image_processor(images=prepare_img(), return_tensors="pt") - pixel_values = encoding["pixel_values"] - with torch.no_grad(): - outputs = model(pixel_values) - - expected_slices = { - "ijepa_vith14_1k": torch.Tensor( - [[-0.0621, -0.0054, -2.7513], [-0.1952, 0.0909, -3.9536], [0.0942, -0.0331, -1.2833]] - ), - "ijepa_vith14_22k": torch.Tensor( - [[0.0358, -0.0045, -0.2154], [0.0418, -0.0246, 0.0108], [0.2529, -0.0345, -0.0246]] - ), - "ijepa_vith16_1k": torch.Tensor( - [[0.5145, -0.1259, 0.0615], [0.1132, 0.0028, -0.0496], [1.1586, -0.0056, -0.0387]] - ), - "ijepa_vitg16_22k": torch.Tensor( - [[0.0512, -0.0510, -0.0649], [0.1972, 0.0380, -0.0790], [0.1667, -0.0834, -0.1240]] - ), - } - - assert torch.allclose( - expected_slices[model_name], - outputs.last_hidden_state[0, :3, :3], - atol=1e-4, - ) - - if output_dir: - Path(output_dir).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {output_dir}") - image_processor.save_pretrained(output_dir, safe_serialization=safe_serialization) - model.save_pretrained(output_dir, safe_serialization=safe_serialization) - - if push_to_hub: - image_processor.push_to_hub(repo_id=f"jmtzt/{model_name}", safe_serialization=safe_serialization) - model.push_to_hub(repo_id=f"jmtzt/{model_name}", safe_serialization=safe_serialization) - - if output_dir: - del model, state_dict - gc.collect() - print("Reloading the model to check if it's saved correctly.") - IJepaModel.from_pretrained(output_dir, device_map="auto") - print("Model reloaded successfully.") - - -def main(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="ijepa_vith14_1k", - type=str, - choices=[ - "ijepa_vith14_1k", - "ijepa_vith14_22k", - "ijepa_vith16_1k", - "ijepa_vitg16_22k", - ], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--output_dir", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`." 
- ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether or not to push the model to the đŸ€— Hub.", - ) - parser.add_argument( - "--verify_logits", action="store_false", help="Whether or not to verify logits after conversion." - ) - - parser.set_defaults() - args = parser.parse_args() - write_model(args.model_name, args.output_dir, args.safe_serialization, args.push_to_hub, args.verify_logits) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py b/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py deleted file mode 100644 index 182d66b9af28..000000000000 --- a/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert OpenAI Image GPT checkpoints.""" - -import argparse - -import torch - -from transformers import ImageGPTConfig, ImageGPTForCausalLM, load_tf_weights_in_imagegpt -from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging - - -logging.set_verbosity_info() - - -def convert_imagegpt_checkpoint_to_pytorch(imagegpt_checkpoint_path, model_size, pytorch_dump_folder_path): - # Construct configuration depending on size - MODELS = {"small": (512, 8, 24), "medium": (1024, 8, 36), "large": (1536, 16, 48)} - n_embd, n_head, n_layer = MODELS[model_size] # set model hyperparameters - config = ImageGPTConfig(n_embd=n_embd, n_layer=n_layer, n_head=n_head) - model = ImageGPTForCausalLM(config) - - # Load weights from numpy - load_tf_weights_in_imagegpt(model, config, imagegpt_checkpoint_path) - - # Save pytorch-model - pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME - print(f"Save PyTorch model to {pytorch_weights_dump_path}") - torch.save(model.state_dict(), pytorch_weights_dump_path) - print(f"Save configuration file to {pytorch_config_dump_path}") - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--imagegpt_checkpoint_path", - default=None, - type=str, - required=True, - help="Path to the TensorFlow checkpoint path.", - ) - parser.add_argument( - "--model_size", - default=None, - type=str, - required=True, - help="Size of the model (can be either 'small', 'medium' or 'large').", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 
- ) - args = parser.parse_args() - convert_imagegpt_checkpoint_to_pytorch( - args.imagegpt_checkpoint_path, args.model_size, args.pytorch_dump_folder_path - ) diff --git a/src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py b/src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py deleted file mode 100644 index f8b9c86cfddc..000000000000 --- a/src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py +++ /dev/null @@ -1,303 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert InstructBLIP checkpoints from the original repository. - -URL: https://github.com/salesforce/LAVIS/tree/main/projects/instructblip -""" - -import argparse - -import requests -import torch - -# pip3 install salesforce-lavis -# I'm actually installing a slightly modified version: pip3 install git+https://github.com/nielsrogge/LAVIS.git@fix_lavis_float32 (there's also the fix_lavis branch) -# also note: to convert Vicuna checkpoints, we had to include /home/niels/python_projects/checkpoints/FastChat/vicuna-7b in lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml -# same for Vicuna-13b -from lavis.models import load_model_and_preprocess -from PIL import Image - -from transformers import ( - AutoTokenizer, - BlipImageProcessor, - InstructBlipConfig, - InstructBlipForConditionalGeneration, - InstructBlipProcessor, - InstructBlipQFormerConfig, - InstructBlipVisionConfig, - LlamaConfig, - LlamaTokenizerFast, - T5Config, - T5TokenizerFast, -) -from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD - - -def load_demo_image(): - url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - - return image - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - # fmt: off - - # vision encoder - rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) - rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) - rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) - rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) - rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) - rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) - - for i in range(config.vision_config.num_hidden_layers): - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", 
f"vision_model.encoder.layers.{i}.layer_norm2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) - - # QFormer - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.embeddings.layernorm.weight")) - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.embeddings.layernorm.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def read_in_q_v_bias(state_dict, config): - for i in range(config.vision_config.num_hidden_layers): - # read in original q and v biases - q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") - - # next, set bias in the state dict - qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) - state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias - - -def get_blip2_config(model_name): - image_size = 364 if "coco" in model_name else 224 - vision_config = InstructBlipVisionConfig(image_size=image_size).to_dict() - - # make sure the models have proper bos_token_id and eos_token_id set (important for generation) - # seems like flan-T5 models don't have bos_token_id properly set? - if "t5-xl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "t5-xxl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "vicuna-7b" in model_name: - text_config = LlamaConfig.from_pretrained("decapoda-research/llama-7b-hf", vocab_size=32001).to_dict() - elif "vicuna-13b" in model_name: - text_config = LlamaConfig.from_pretrained("decapoda-research/llama-13b-hf", vocab_size=32001).to_dict() - else: - raise ValueError("Model name not supported") - - # the authors add one special "[DEC]" token to the vocab of Q-Former, hence vocab size = 30522 + 1 - qformer_config = InstructBlipQFormerConfig(vocab_size=30523).to_dict() - config = InstructBlipConfig(vision_config=vision_config, text_config=text_config, qformer_config=qformer_config) - - return config, image_size - - -@torch.no_grad() -def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - """ - Copy/paste/tweak model's weights to Transformers design. 
- """ - qformer_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", truncation_side="left") - qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"}) - - if "t5" in model_name: - tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-xl", truncation_side="left") - elif "vicuna" in model_name: - # the following was used in the original implementation: - # tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=False, truncation_side="left") - # tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - # tokenizer.add_special_tokens({"bos_token": ""}) - # tokenizer.add_special_tokens({"eos_token": ""}) - # tokenizer.add_special_tokens({"unk_token": ""}) - tokenizer = LlamaTokenizerFast.from_pretrained( - "huggyllama/llama-7b", truncation_side="left", bos_token="", unk_token="" - ) - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - - config, image_size = get_blip2_config(model_name) - hf_model = InstructBlipForConditionalGeneration(config).eval() - - model_name_to_original = { - "instructblip-vicuna-7b": ("blip2_vicuna_instruct", "vicuna7b"), - "instructblip-vicuna-13b": ("blip2_vicuna_instruct", "vicuna13b"), - "instructblip-flan-t5-xl": ("blip2_t5_instruct", "flant5xl"), - "instructblip-flan-t5-xxl": ("blip2_t5_instruct", "flant5xxl"), - } - - name, type = model_name_to_original[model_name] - - # load original model - print("Loading original model...") - hf_model_device = "cuda:1" if torch.cuda.is_available() else "cpu" - lavis_device = "cuda:2" if torch.cuda.is_available() else "cpu" - original_model, vis_processors, _ = load_model_and_preprocess( - name=name, model_type=type, is_eval=True, device=lavis_device - ) - original_model.eval() - print("Done!") - - # update state dict keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - - # some keys can be renamed efficiently - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if key.startswith("Qformer.bert"): - key = key.replace("Qformer.bert", "qformer") - if "attention.self" in key: - key = key.replace("self", "attention") - if "llm_proj" in key: - key = key.replace("llm_proj", "language_projection") - if "t5_proj" in key: - key = key.replace("t5_proj", "language_projection") - if key.startswith("llm_model"): - key = key.replace("llm_model", "language_model") - if key.startswith("t5"): - key = key.replace("t5", "language") - state_dict[key] = val - - # read in qv biases - read_in_q_v_bias(state_dict, config) - - # note: weights get loaded in torch.float32 by default - hf_model.load_state_dict(state_dict, strict=True) - - image = load_demo_image() - prompt = "What is unusual about this image?" 
- - # create processor - image_processor = BlipImageProcessor( - size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD - ) - processor = InstructBlipProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - qformer_tokenizer=qformer_tokenizer, - ) - inputs = processor(images=image, text=prompt, return_tensors="pt").to(hf_model_device) - - # make sure processor creates exact same pixel values - original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) - pixel_values = inputs.pixel_values - assert torch.allclose(original_pixel_values.to(pixel_values.device), pixel_values) - - original_model.to(lavis_device) - hf_model.to(hf_model_device) - with torch.no_grad(): - if "vicuna" in model_name: - original_logits = original_model({"image": original_pixel_values, "text_input": [prompt]}).logits - logits = hf_model(**inputs).logits - else: - original_logits = original_model( - {"image": original_pixel_values, "text_input": [prompt], "text_output": ["\n"]} - ).logits - label_input_ids = tokenizer("\n", return_tensors="pt").input_ids.to(hf_model_device) - labels = label_input_ids.masked_fill(label_input_ids == tokenizer.pad_token_id, -100) - logits = hf_model(**inputs, labels=labels).logits - - print("First values of original logits:", original_logits[0, :3, :3]) - print("First values of HF logits:", logits[0, :3, :3]) - - # assert values - assert original_logits.shape == logits.shape - atol = 1e-4 if "vicuna" in model_name else 1e-5 - assert torch.allclose(original_logits.to(logits.device), logits, atol=atol) - print("Looks ok!") - - print("Generating with original model...") - original_outputs = original_model.generate({"image": original_pixel_values, "prompt": prompt}, num_beams=5) - - # important: we need to cast the weights of the HF model to the appropriate type - print("Generating with HF model...") - outputs = hf_model.generate( - **inputs, - do_sample=False, - num_beams=5, - max_length=256, - min_length=1, - top_p=0.9, - repetition_penalty=1.5, - length_penalty=1.0, - temperature=1, - ) - if "vicuna" in model_name: - # convert output id 0 to 2 (eos_token_id) - # TODO add this in the generate method? 
- outputs[outputs == 0] = 2 - print("Original generation:", original_outputs) - output_text = processor.batch_decode(outputs, skip_special_tokens=True) - output_text = [text.strip() for text in output_text] - print("HF generation:", output_text) - - if pytorch_dump_folder_path is not None: - processor.save_pretrained(pytorch_dump_folder_path) - hf_model.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - processor.push_to_hub(f"Salesforce/{model_name}") - hf_model.push_to_hub(f"Salesforce/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - choices = [ - "instructblip-vicuna-7b", - "instructblip-vicuna-13b", - "instructblip-flan-t5-xl", - "instructblip-flan-t5-xxl", - ] - parser.add_argument( - "--model_name", - default="instructblip-flan-t5-xl", - choices=choices, - type=str, - help="Path to hf config.json of model to convert", - ) - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model and processor to the hub after converting", - ) - - args = parser.parse_args() - - convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py b/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py deleted file mode 100644 index 9b3d508db6ff..000000000000 --- a/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py +++ /dev/null @@ -1,305 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert InstructBlipVideo checkpoints from the original repository. 
- -URL: https://github.com/salesforce/LAVIS/tree/main/projects/instructblipvideo -""" - -import argparse - -import requests -import torch - -# pip3 install salesforce-lavis -# I'm actually installing a slightly modified version: pip3 install git+https://github.com/nielsrogge/LAVIS.git@fix_lavis_float32 (there's also the fix_lavis branch) -# also note: to convert Vicuna checkpoints, we had to include /home/niels/python_projects/checkpoints/FastChat/vicuna-7b in lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml -# same for Vicuna-13b -from lavis.models import load_model_and_preprocess -from PIL import Image - -from transformers import ( - AutoTokenizer, - BlipImageProcessor, - InstructBlipProcessor, - InstructBlipVideoConfig, - InstructBlipVideoForConditionalGeneration, - InstructBlipVideoQFormerConfig, - InstructBlipVideoVisionConfig, - LlamaConfig, - LlamaTokenizerFast, - T5Config, - T5TokenizerFast, -) -from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD - - -def load_demo_image(): - url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - - return image - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - # fmt: off - - # vision encoder - rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) - rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) - rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) - rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) - rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) - rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) - - for i in range(config.vision_config.num_hidden_layers): - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) - - # QFormer - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.embeddings.layernorm.weight")) - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", 
"qformer.embeddings.layernorm.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def read_in_q_v_bias(state_dict, config): - for i in range(config.vision_config.num_hidden_layers): - # read in original q and v biases - q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") - - # next, set bias in the state dict - qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) - state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias - - -def get_blip2_config(model_name): - image_size = 364 if "coco" in model_name else 224 - vision_config = InstructBlipVideoVisionConfig(image_size=image_size).to_dict() - - # make sure the models have proper bos_token_id and eos_token_id set (important for generation) - # seems like flan-T5 models don't have bos_token_id properly set? - if "t5-xl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "t5-xxl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "vicuna-7b" in model_name: - text_config = LlamaConfig.from_pretrained("decapoda-research/llama-7b-hf", vocab_size=32001).to_dict() - elif "vicuna-13b" in model_name: - text_config = LlamaConfig.from_pretrained("decapoda-research/llama-13b-hf", vocab_size=32001).to_dict() - else: - raise ValueError("Model name not supported") - - # the authors add one special "[DEC]" token to the vocab of Q-Former, hence vocab size = 30522 + 1 - qformer_config = InstructBlipVideoQFormerConfig(vocab_size=30523).to_dict() - config = InstructBlipVideoConfig( - vision_config=vision_config, text_config=text_config, qformer_config=qformer_config - ) - - return config, image_size - - -@torch.no_grad() -def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - """ - Copy/paste/tweak model's weights to Transformers design. 
- """ - qformer_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", truncation_side="left") - qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"}) - - if "t5" in model_name: - tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-xl", truncation_side="left") - elif "vicuna" in model_name: - # the following was used in the original implementation: - # tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=False, truncation_side="left") - # tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - # tokenizer.add_special_tokens({"bos_token": ""}) - # tokenizer.add_special_tokens({"eos_token": ""}) - # tokenizer.add_special_tokens({"unk_token": ""}) - tokenizer = LlamaTokenizerFast.from_pretrained( - "huggyllama/llama-7b", truncation_side="left", bos_token="", unk_token="" - ) - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - - config, image_size = get_blip2_config(model_name) - hf_model = InstructBlipVideoForConditionalGeneration(config).eval() - - model_name_to_original = { - "instructblipvideo-vicuna-7b": ("blip2_vicuna_instruct", "vicuna7b"), - "instructblipvideo-vicuna-13b": ("blip2_vicuna_instruct", "vicuna13b"), - "instructblipvideo-flan-t5-xl": ("blip2_t5_instruct", "flant5xl"), - "instructblipvideo-flan-t5-xxl": ("blip2_t5_instruct", "flant5xxl"), - } - - name, type = model_name_to_original[model_name] - - # load original model - print("Loading original model...") - hf_model_device = "cuda:1" if torch.cuda.is_available() else "cpu" - lavis_device = "cuda:2" if torch.cuda.is_available() else "cpu" - original_model, vis_processors, _ = load_model_and_preprocess( - name=name, model_type=type, is_eval=True, device=lavis_device - ) - original_model.eval() - print("Done!") - - # update state dict keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - - # some keys can be renamed efficiently - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if key.startswith("Qformer.bert"): - key = key.replace("Qformer.bert", "qformer") - if "attention.self" in key: - key = key.replace("self", "attention") - if "llm_proj" in key: - key = key.replace("llm_proj", "language_projection") - if "t5_proj" in key: - key = key.replace("t5_proj", "language_projection") - if key.startswith("llm_model"): - key = key.replace("llm_model", "language_model") - if key.startswith("t5"): - key = key.replace("t5", "language") - state_dict[key] = val - - # read in qv biases - read_in_q_v_bias(state_dict, config) - - # note: weights get loaded in torch.float32 by default - hf_model.load_state_dict(state_dict, strict=True) - - image = load_demo_image() - prompt = "What is unusual about this image?" 
- - # create processor - image_processor = BlipImageProcessor( - size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD - ) - processor = InstructBlipProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - qformer_tokenizer=qformer_tokenizer, - ) - inputs = processor(images=image, text=prompt, return_tensors="pt").to(hf_model_device) - - # make sure processor creates exact same pixel values - original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) - pixel_values = inputs.pixel_values - assert torch.allclose(original_pixel_values.to(pixel_values.device), pixel_values) - - original_model.to(lavis_device) - hf_model.to(hf_model_device) - with torch.no_grad(): - if "vicuna" in model_name: - original_logits = original_model({"image": original_pixel_values, "text_input": [prompt]}).logits - logits = hf_model(**inputs).logits - else: - original_logits = original_model( - {"image": original_pixel_values, "text_input": [prompt], "text_output": ["\n"]} - ).logits - label_input_ids = tokenizer("\n", return_tensors="pt").input_ids.to(hf_model_device) - labels = label_input_ids.masked_fill(label_input_ids == tokenizer.pad_token_id, -100) - logits = hf_model(**inputs, labels=labels).logits - - print("First values of original logits:", original_logits[0, :3, :3]) - print("First values of HF logits:", logits[0, :3, :3]) - - # assert values - assert original_logits.shape == logits.shape - atol = 1e-4 if "vicuna" in model_name else 1e-5 - assert torch.allclose(original_logits.to(logits.device), logits, atol=atol) - print("Looks ok!") - - print("Generating with original model...") - original_outputs = original_model.generate({"image": original_pixel_values, "prompt": prompt}, num_beams=5) - - # important: we need to cast the weights of the HF model to the appropriate type - print("Generating with HF model...") - outputs = hf_model.generate( - **inputs, - do_sample=False, - num_beams=5, - max_length=256, - min_length=1, - top_p=0.9, - repetition_penalty=1.5, - length_penalty=1.0, - temperature=1, - ) - if "vicuna" in model_name: - # convert output id 0 to 2 (eos_token_id) - # TODO add this in the generate method? 
- outputs[outputs == 0] = 2 - print("Original generation:", original_outputs) - output_text = processor.batch_decode(outputs, skip_special_tokens=True) - output_text = [text.strip() for text in output_text] - print("HF generation:", output_text) - - if pytorch_dump_folder_path is not None: - processor.save_pretrained(pytorch_dump_folder_path) - hf_model.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - processor.push_to_hub(f"Salesforce/{model_name}") - hf_model.push_to_hub(f"Salesforce/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - choices = [ - "instructblipvideo-vicuna-7b", - "instructblipvideo-vicuna-13b", - "instructblipvideo-flan-t5-xl", - "instructblipvideo-flan-t5-xxl", - ] - parser.add_argument( - "--model_name", - default="instructblipvideo-flan-t5-xl", - choices=choices, - type=str, - help="Path to hf config.json of model to convert", - ) - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model and processor to the hub after converting", - ) - - args = parser.parse_args() - - convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/kosmos2/convert_kosmos2_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/kosmos2/convert_kosmos2_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 04c7712aa846..000000000000 --- a/src/transformers/models/kosmos2/convert_kosmos2_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,77 +0,0 @@ -import argparse - -from fairseq.checkpoint_utils import load_checkpoint_to_cpu - -from transformers import Kosmos2Config, Kosmos2ForConditionalGeneration - - -KEYS_TO_MODIFY_MAPPING = { - "gpt_model.decoder.output_projection": "text_model.lm_head", - "gpt_model.decoder": "text_model.model", - "img_connector": "image_to_text_projection", - "img_model.visual.class_embedding": "vision_model.model.embeddings.class_embedding", - "img_model.visual.positional_embedding": "vision_model.model.embeddings.position_embedding.weight", - "img_model.visual.conv1": "vision_model.model.embeddings.patch_embedding", - "img_model.visual": "vision_model.model", - "ln_pre": "pre_layrnorm", - "ln_post": "post_layernorm", - "transformer.resblocks": "encoder.layers", - "ts_attn": "self_attn", - "ln_1": "layer_norm1", - "ln_2": "layer_norm2", - "c_fc": "fc1", - "c_proj": "fc2", -} - - -KEYS_TO_IGNORE = [ - # this buffer in the original code is only used to send weights to the desired device - "gpt_model.decoder.embed_positions._float_tensor", - # this weight is never used in the forward in the original KOSMOS-2) - "gpt_model.decoder.self_attn_sope.scale", -] - - -def rename_key(key): - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - return key - - -def convert_kosmos2_checkpoint_to_pytorch(checkpoint_path, pytorch_dump_folder_path): - state = load_checkpoint_to_cpu(checkpoint_path) - state_dict = state["model"] - state_dict_keys = list(state_dict.keys()) - - config = Kosmos2Config() - # This is necessary to match the results given by the original demo - config.text_config.no_repeat_ngram_size = 3 - model = Kosmos2ForConditionalGeneration(config) - - # convert (by renaming keys) - converted_state_dict = {} - for key in state_dict_keys: - if key in KEYS_TO_IGNORE: - continue 
- renamed_key = rename_key(key) - converted_state_dict[renamed_key] = state_dict[key] - - # check weight loading - model.load_state_dict(converted_state_dict, strict=True) - # save the result - model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--kosmos2_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_kosmos2_checkpoint_to_pytorch(args.kosmos2_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/levit/convert_levit_timm_to_pytorch.py b/src/transformers/models/levit/convert_levit_timm_to_pytorch.py deleted file mode 100644 index afef3f73de6c..000000000000 --- a/src/transformers/models/levit/convert_levit_timm_to_pytorch.py +++ /dev/null @@ -1,180 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert LeViT checkpoints from timm.""" - -import argparse -import json -from collections import OrderedDict -from functools import partial -from pathlib import Path - -import timm -import torch -from huggingface_hub import hf_hub_download - -from transformers import LevitConfig, LevitForImageClassificationWithTeacher, LevitImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger() - - -def convert_weight_and_push( - hidden_sizes: int, name: str, config: LevitConfig, save_directory: Path, push_to_hub: bool = True -): - print(f"Converting {name}...") - - with torch.no_grad(): - if hidden_sizes == 128: - if name[-1] == "S": - from_model = timm.create_model("levit_128s", pretrained=True) - else: - from_model = timm.create_model("levit_128", pretrained=True) - if hidden_sizes == 192: - from_model = timm.create_model("levit_192", pretrained=True) - if hidden_sizes == 256: - from_model = timm.create_model("levit_256", pretrained=True) - if hidden_sizes == 384: - from_model = timm.create_model("levit_384", pretrained=True) - - from_model.eval() - our_model = LevitForImageClassificationWithTeacher(config).eval() - huggingface_weights = OrderedDict() - - weights = from_model.state_dict() - og_keys = list(from_model.state_dict().keys()) - new_keys = list(our_model.state_dict().keys()) - print(len(og_keys), len(new_keys)) - for i in range(len(og_keys)): - huggingface_weights[new_keys[i]] = weights[og_keys[i]] - our_model.load_state_dict(huggingface_weights) - - x = torch.randn((2, 3, 224, 224)) - out1 = from_model(x) - out2 = our_model(x).logits - - assert torch.allclose(out1, out2), "The model logits don't match the original one." 
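The LeViT copy above works purely by position: it zips the timm state-dict keys with the HF state-dict keys and assumes both modules enumerate their parameters in the same order. A toy version of that ordered copy between two equivalent modules (hypothetical layer names, not LeViT):

```python
from collections import OrderedDict

import torch
from torch import nn

src = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))
dst = nn.Sequential(OrderedDict([("proj", nn.Linear(4, 4)), ("act", nn.ReLU()), ("head", nn.Linear(4, 2))]))

# copy by position: correct only because both modules list their parameters in the same order
new_state = OrderedDict(
    (dst_key, value) for (_, value), dst_key in zip(src.state_dict().items(), dst.state_dict().keys())
)
dst.load_state_dict(new_state)

x = torch.randn(2, 4)
assert torch.allclose(src(x), dst(x)), "outputs should match after the ordered copy"
```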
- - checkpoint_name = name - print(checkpoint_name) - - if push_to_hub: - our_model.save_pretrained(save_directory / checkpoint_name) - image_processor = LevitImageProcessor() - image_processor.save_pretrained(save_directory / checkpoint_name) - - print(f"Pushed {checkpoint_name}") - - -def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True): - filename = "imagenet-1k-id2label.json" - num_labels = 1000 - expected_shape = (1, num_labels) - - repo_id = "huggingface/label-files" - num_labels = num_labels - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - - id2label = id2label - label2id = {v: k for k, v in id2label.items()} - - ImageNetPreTrainedConfig = partial(LevitConfig, num_labels=num_labels, id2label=id2label, label2id=label2id) - - names_to_hidden_sizes = { - "levit-128S": 128, - "levit-128": 128, - "levit-192": 192, - "levit-256": 256, - "levit-384": 384, - } - - names_to_config = { - "levit-128S": ImageNetPreTrainedConfig( - hidden_sizes=[128, 256, 384], - num_attention_heads=[4, 6, 8], - depths=[2, 3, 4], - key_dim=[16, 16, 16], - drop_path_rate=0, - ), - "levit-128": ImageNetPreTrainedConfig( - hidden_sizes=[128, 256, 384], - num_attention_heads=[4, 8, 12], - depths=[4, 4, 4], - key_dim=[16, 16, 16], - drop_path_rate=0, - ), - "levit-192": ImageNetPreTrainedConfig( - hidden_sizes=[192, 288, 384], - num_attention_heads=[3, 5, 6], - depths=[4, 4, 4], - key_dim=[32, 32, 32], - drop_path_rate=0, - ), - "levit-256": ImageNetPreTrainedConfig( - hidden_sizes=[256, 384, 512], - num_attention_heads=[4, 6, 8], - depths=[4, 4, 4], - key_dim=[32, 32, 32], - drop_path_rate=0, - ), - "levit-384": ImageNetPreTrainedConfig( - hidden_sizes=[384, 512, 768], - num_attention_heads=[6, 9, 12], - depths=[4, 4, 4], - key_dim=[32, 32, 32], - drop_path_rate=0.1, - ), - } - - if model_name: - convert_weight_and_push( - names_to_hidden_sizes[model_name], model_name, names_to_config[model_name], save_directory, push_to_hub - ) - else: - for model_name, config in names_to_config.items(): - convert_weight_and_push(names_to_hidden_sizes[model_name], model_name, config, save_directory, push_to_hub) - return config, expected_shape - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default=None, - type=str, - help="The name of the model you wish to convert, it must be one of the supported Levit* architecture,", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="levit-dump-folder/", - type=Path, - required=False, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub") - parser.add_argument( - "--no-push_to_hub", - dest="push_to_hub", - action="store_false", - help="Do not push model and image processor to the hub", - ) - - args = parser.parse_args() - pytorch_dump_folder_path: Path = args.pytorch_dump_folder_path - pytorch_dump_folder_path.mkdir(exist_ok=True, parents=True) - convert_weights_and_push(pytorch_dump_folder_path, args.model_name, args.push_to_hub) diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py deleted file mode 100644 index eb2862eb203d..000000000000 --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py +++ /dev/null @@ -1,601 +0,0 @@ -# Copyright 2022 
EleutherAI and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import gc -import json -import os -import tempfile -import warnings -from typing import List - -import torch -from tokenizers import AddedToken, processors - -from transformers import GenerationConfig, LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast -from transformers.convert_slow_tokenizer import TikTokenConverter - - -try: - from transformers import LlamaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - LlamaTokenizerFast = None - -""" -Sample usage: - -``` -python src/transformers/models/llama/convert_llama_weights_to_hf.py \ - --input_dir /path/to/downloaded/llama/weights --model_size 1B --llama_version 3.2 --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import LlamaForCausalLM, LlamaTokenizer - -model = LlamaForCausalLM.from_pretrained("/output/path") -tokenizer = LlamaTokenizer.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). 
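As a rough illustration of why the full model must fit in RAM: each `consolidated.*.pth` shard holds a slice of every weight, so all shards are loaded before any tensor can be stitched back together. A minimal sketch of that loading step (the path is a placeholder and the snippet is not part of the script itself):

```py
import os
import torch

input_base_path = "/path/to/downloaded/llama/weights"  # placeholder
shard_files = sorted(f for f in os.listdir(input_base_path) if f.endswith(".pth"))
shards = [torch.load(os.path.join(input_base_path, f), map_location="cpu") for f in shard_files]

# A sharded projection such as wq is recovered by concatenating its slices,
# one per shard, along the output dimension (before the rotary permutation
# that the conversion applies afterwards).
full_wq = torch.cat([shard["layers.0.attention.wq.weight"] for shard in shards], dim=0)
```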
- -If you want your tokenizer to add a bos automatically you should update the tokenizer._tokenizers.post_processor: - -```py -from tokenizers import processors -bos = "<|begin_of_text|>" -tokenizer._tokenizers.post_processor = processors.Sequence( - [ - processors.ByteLevel(trim_offsets=False), - processors.TemplateProcessing( - single=f"{bos}:0 $A:0", - pair=f"{bos}:0 $A:0 {bos}:1 $B:1", - special_tokens=[ - (bos, tokenizer.encode(bos)), - ], - ), - ] -) -``` -""" - -NUM_SHARDS = { - "1B": 1, - "3B": 1, - "7B": 1, - "8B": 1, - "8Bf": 1, - "7Bf": 1, - "13B": 2, - "13Bf": 2, - "34B": 4, - "30B": 4, - "65B": 8, - "70B": 8, - "70Bf": 8, - "405B": 8, - "405B-MP16": 16, -} - -CONTEXT_LENGTH_FOR_VERSION = {"Guard-3": 131072, "3.2": 131072, "3.1": 131072, "3": 8192, "2": 4096, "1": 2048} - -BOS_ADDED_TOKEN = AddedToken( - "<|begin_of_text|>", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True -) -EOS_ADDED_TOKEN = AddedToken( - "<|end_of_text|>", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True -) -EOT_ADDED_TOKEN = AddedToken( - "<|eot_id|>", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True -) - -DEFAULT_LLAMA_SPECIAL_TOKENS = { - "3": [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|reserved_special_token_2|>", - "<|reserved_special_token_3|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|reserved_special_token_4|>", - "<|eot_id|>", # end of turn - ] - + [f"<|reserved_special_token_{i}|>" for i in range(5, 256 - 5)], - "3.1": [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|finetune_right_pad_id|>", - "<|reserved_special_token_2|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|eom_id|>", # end of message - "<|eot_id|>", # end of turn - "<|python_tag|>", - ] - + [f"<|reserved_special_token_{i}|>" for i in range(3, 256 - 8)], - "3.2": [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|finetune_right_pad_id|>", - "<|reserved_special_token_2|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|eom_id|>", # end of message - "<|eot_id|>", # end of turn - "<|python_tag|>", - ] - + [f"<|reserved_special_token_{i}|>" for i in range(3, 256 - 8)], - "Guard-3": [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|finetune_right_pad_id|>", - "<|reserved_special_token_2|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|eom_id|>", # end of message - "<|eot_id|>", # end of turn - "<|python_tag|>", - ] - + [f"<|reserved_special_token_{i}|>" for i in range(3, 256 - 8)], -} - - -def is_llama_3(version): - return version in ["3", "3.1", "3.2", "Guard-3"] - - -def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): - return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) - - -def read_json(path): - with open(path, "r") as f: - return json.load(f) - - -def write_json(text, path): - with open(path, "w") as f: - json.dump(text, f) - - -def write_model( - model_path, - input_base_path, - model_size=None, - safe_serialization=True, - llama_version="1", - vocab_size=None, - num_shards=None, - instruct=False, - push_to_hub=False, -): - print("Converting the model.") - params = read_json(os.path.join(input_base_path, "params.json")) - num_shards = NUM_SHARDS[model_size] if num_shards 
is None else num_shards - params = params.get("model", params) - n_layers = params["n_layers"] - n_heads = params["n_heads"] - n_heads_per_shard = n_heads // num_shards - dim = params["dim"] - dims_per_head = dim // n_heads - base = params.get("rope_theta", 10000.0) - inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) - if base > 10000.0 and not is_llama_3(llama_version): - max_position_embeddings = 16384 - else: - max_position_embeddings = CONTEXT_LENGTH_FOR_VERSION[llama_version] - - if params.get("n_kv_heads", None) is not None: - num_key_value_heads = params["n_kv_heads"] # for GQA / MQA - num_key_value_heads_per_shard = num_key_value_heads // num_shards - key_value_dim = dims_per_head * num_key_value_heads - else: # compatibility with other checkpoints - num_key_value_heads = n_heads - num_key_value_heads_per_shard = n_heads_per_shard - key_value_dim = dim - - # permute for sliced rotary - def permute(w, n_heads, dim1=dim, dim2=dim): - return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) - - with tempfile.TemporaryDirectory() as tmp_model_path: - print(f"Fetching all parameters from the checkpoint at {input_base_path}.") - # Load weights - if num_shards == 1: - # Not sharded - # (The sharded implementation would also work, but this is simpler.) - loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu") - else: - # Sharded - checkpoint_list = sorted([file for file in os.listdir(input_base_path) if file.endswith(".pth")]) - print("Loading in order:", checkpoint_list) - loaded = [torch.load(os.path.join(input_base_path, file), map_location="cpu") for file in checkpoint_list] - param_count = 0 - index_dict = {"weight_map": {}} - for layer_i in range(n_layers): - filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" - if num_shards == 1: - # Unsharded - state_dict = { - f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wq.weight"], n_heads=n_heads - ), - f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wk.weight"], - n_heads=num_key_value_heads, - dim1=key_value_dim, - ), - f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], - f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], - f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], - f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], - f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], - f"model.layers.{layer_i}.input_layernorm.weight": loaded[ - f"layers.{layer_i}.attention_norm.weight" - ], - f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[ - f"layers.{layer_i}.ffn_norm.weight" - ], - } - else: - # Sharded - # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share - # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is - # redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned. 
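For reference, the `permute` helper defined above reorders each attention head's rows from the interleaved rotary layout of the original checkpoints to the half-split layout expected by the HF Llama implementation. A toy example with assumed small sizes (illustrative only, not part of the conversion):

```py
import torch

def permute(w, n_heads, dim1, dim2):
    return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)

n_heads, head_dim = 2, 4
dim = n_heads * head_dim
w = torch.arange(dim * dim).reshape(dim, dim)
# Per head, the row order changes from [0, 1, 2, 3] to [0, 2, 1, 3]:
# even-indexed rows are grouped into the first half of the head, odd-indexed
# rows into the second half, matching the non-interleaved rotary used by HF.
print(permute(w, n_heads, dim, dim)[:4, 0])  # tensor([ 0, 16,  8, 24])
```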
- - state_dict = { - f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ - f"layers.{layer_i}.attention_norm.weight" - ].clone(), - f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ - f"layers.{layer_i}.ffn_norm.weight" - ].clone(), - } - state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wq.weight"].view( - n_heads_per_shard, dims_per_head, dim - ) - for i in range(len(loaded)) - ], - dim=0, - ).reshape(dim, dim), - n_heads=n_heads, - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( - num_key_value_heads_per_shard, dims_per_head, dim - ) - for i in range(len(loaded)) - ], - dim=0, - ).reshape(key_value_dim, dim), - num_key_value_heads, - key_value_dim, - dim, - ) - state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( - num_key_value_heads_per_shard, dims_per_head, dim - ) - for i in range(len(loaded)) - ], - dim=0, - ).reshape(key_value_dim, dim) - - state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(len(loaded))], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(len(loaded))], dim=0 - ) - state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(len(loaded))], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(len(loaded))], dim=0 - ) - - state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq - for k, v in state_dict.items(): - index_dict["weight_map"][k] = filename - param_count += v.numel() - torch.save(state_dict, os.path.join(tmp_model_path, filename)) - - filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" - if num_shards == 1: - # Unsharded - state_dict = { - "model.embed_tokens.weight": loaded["tok_embeddings.weight"], - "model.norm.weight": loaded["norm.weight"], - "lm_head.weight": loaded["output.weight"], - } - else: - concat_dim = 0 if is_llama_3(llama_version) else 1 - state_dict = { - "model.norm.weight": loaded[0]["norm.weight"], - "model.embed_tokens.weight": torch.cat( - [loaded[i]["tok_embeddings.weight"] for i in range(len(loaded))], dim=concat_dim - ), - "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(len(loaded))], dim=0), - } - - for k, v in state_dict.items(): - index_dict["weight_map"][k] = filename - param_count += v.numel() - torch.save(state_dict, os.path.join(tmp_model_path, filename)) - - # Write configs - index_dict["metadata"] = {"total_size": param_count * 2} - write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) - ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1 - multiple_of = params["multiple_of"] if "multiple_of" in params else 256 - - if is_llama_3(llama_version): - bos_token_id = 128000 - - if instruct: - eos_token_id = [128001, 128008, 128009] - else: - eos_token_id = 128001 - else: - bos_token_id = 1 - eos_token_id = 2 - - if llama_version in ["3.1", "3.2", "Guard-3"]: - rope_scaling = { - "factor": 32.0 if llama_version == "3.2" else 8.0, - 
"low_freq_factor": 1.0, - "high_freq_factor": 4.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3", - } - else: - rope_scaling = None - - config = LlamaConfig( - hidden_size=dim, - intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), - num_attention_heads=params["n_heads"], - num_hidden_layers=params["n_layers"], - rms_norm_eps=params["norm_eps"], - num_key_value_heads=num_key_value_heads, - vocab_size=vocab_size, - rope_theta=base, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=True if llama_version in ["3.2"] else False, - ) - - config.save_pretrained(tmp_model_path) - - generation_config = GenerationConfig( - do_sample=True, - temperature=0.6, - top_p=0.9, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - ) - generation_config.save_pretrained(tmp_model_path) - - # Make space so we can load the model properly now. - del state_dict - del loaded - gc.collect() - - print("Loading the checkpoint in a Llama model.") - model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True) - - # Avoid saving this as part of the config. - del model.config._name_or_path - model.config.torch_dtype = torch.float16 - - print("Saving in the Transformers format.") - if push_to_hub: - print("Pushing to the hub.") - model.push_to_hub(model_path, safe_serialization=safe_serialization, private=True, use_temp_dir=True) - else: - print("Saving to disk.") - model.save_pretrained(model_path, safe_serialization=safe_serialization) - - -class Llama3Converter(TikTokenConverter): - def __init__(self, vocab_file, special_tokens=None, instruct=False, llama_version="3.2", **kwargs): - super().__init__(vocab_file, additional_special_tokens=special_tokens, **kwargs) - tokenizer = self.converted() - - # References for chat templates in instruct models - templates_for_version = { - "2": ("meta-llama/Llama-2-7b-chat-hf", "f5db02db724555f92da89c216ac04704f23d4590"), - "3": ("meta-llama/Meta-Llama-3-8B-Instruct", "5f0b02c75b57c5855da9ae460ce51323ea669d8a"), - "3.1": ("meta-llama/Llama-3.1-8B-Instruct", "0e9e39f249a16976918f6564b8830bc894c89659"), - "3.2": ("meta-llama/Llama-3.2-1B-Instruct", "e9f8effbab1cbdc515c11ee6e098e3d5a9f51e14"), - "Guard-3": ("meta-llama/Llama-Guard-3-1B", "acf7aafa60f0410f8f42b1fa35e077d705892029"), - } - - # Add chat_template only if instruct is True. - # Prevents a null chat_template, which triggers - # a parsing warning in the Hub. 
- additional_kwargs = {} - if instruct or llama_version in ["Guard-3"]: - model_id, revision = templates_for_version.get(llama_version, (None, None)) - if model_id is not None: - from transformers import AutoTokenizer - - t = AutoTokenizer.from_pretrained(model_id, revision=revision) - additional_kwargs["chat_template"] = t.chat_template - - self.converted_tokenizer = PreTrainedTokenizerFast( - tokenizer_object=tokenizer, - bos_token="<|begin_of_text|>", - eos_token="<|end_of_text|>" if not instruct else "<|eot_id|>", - model_input_names=["input_ids", "attention_mask"], - model_max_length=CONTEXT_LENGTH_FOR_VERSION[llama_version], - clean_up_tokenization_spaces=True, - **additional_kwargs, - ) - self.update_post_processor(self.converted_tokenizer) - # finer special_tokens_map.json - self.converted_tokenizer._bos_token = BOS_ADDED_TOKEN - self.converted_tokenizer._eos_token = EOT_ADDED_TOKEN if instruct else EOS_ADDED_TOKEN - - # We can't do this while building the tokenizer because we have no easy access to the bos token id - def update_post_processor(self, tokenizer): - tokenizer._tokenizer.post_processor = processors.Sequence( - [ - processors.ByteLevel(trim_offsets=False), - processors.TemplateProcessing( - single="<|begin_of_text|> $A", - pair="<|begin_of_text|>:0 $A:0 <|begin_of_text|>:1 $B:1", - special_tokens=[ - ("<|begin_of_text|>", tokenizer.convert_tokens_to_ids("<|begin_of_text|>")), - ], - ), - ] - ) - - -def write_tokenizer( - tokenizer_path, input_tokenizer_path, llama_version="2", special_tokens=None, instruct=False, push_to_hub=False -): - print("Converting the tokenizer.") - tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast - if is_llama_3(llama_version): - tokenizer = Llama3Converter( - input_tokenizer_path, - special_tokens, - instruct, - llama_version, - ).converted_tokenizer - else: - try: - tokenizer = tokenizer_class(input_tokenizer_path) - except Exception: - raise ValueError( - "Failed to instantiate tokenizer. Please, make sure you have sentencepiece and protobuf installed." - ) - - if push_to_hub: - print(f"Pushing a {tokenizer_class.__name__} to the Hub repo - {tokenizer_path}.") - tokenizer.push_to_hub(tokenizer_path, private=True, use_temp_dir=True) - else: - print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") - tokenizer.save_pretrained(tokenizer_path) - return tokenizer - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - help="Location of Llama weights, which contains tokenizer.model and model folders", - ) - parser.add_argument( - "--model_size", - default=None, - help="'f' Deprecated in favor of `num_shards`: models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, checkout the original repo: https://huggingface.co/meta-llama", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally.", - action="store_true", - default=False, - ) - parser.add_argument( - "--safe_serialization", action="store_true", default=True, help="Whether or not to save using `safetensors`." - ) - # Different Llama versions used different default values for max_position_embeddings, hence the need to be able to specify which version is being used. 
- parser.add_argument( - "--llama_version", - choices=["1", "2", "3", "3.1", "3.2", "Guard-3"], - default="1", - type=str, - help="Version of the Llama model to convert. Currently supports Llama1 and Llama2. Controls the context size", - ) - parser.add_argument( - "--num_shards", - default=None, - type=int, - help="The number of individual shards used for the model. Does not have to be the same as the number of consolidated_xx.pth", - ) - parser.add_argument( - "--special_tokens", - default=None, - type=List[str], - help="The list of special tokens that should be added to the model.", - ) - parser.add_argument( - "--instruct", - action="store_true", - default=False, - help="Whether the model is an instruct model or not. Will affect special tokens and chat template.", - ) - args = parser.parse_args() - if args.model_size is None and args.num_shards is None: - raise ValueError("You have to set at least `num_shards` if you are not giving the `model_size`") - if args.special_tokens is None: - # no special tokens by default - args.special_tokens = DEFAULT_LLAMA_SPECIAL_TOKENS.get(str(args.llama_version), []) - - spm_path = os.path.join(args.input_dir, "tokenizer.model") - vocab_size = len( - write_tokenizer( - args.output_dir, - spm_path, - llama_version=args.llama_version, - special_tokens=args.special_tokens, - instruct=args.instruct, - push_to_hub=args.push_to_hub, - ) - ) - - if args.model_size != "tokenizer_only": - write_model( - model_path=args.output_dir, - input_base_path=args.input_dir, - model_size=args.model_size, - safe_serialization=args.safe_serialization, - llama_version=args.llama_version, - vocab_size=vocab_size, - num_shards=args.num_shards, - instruct=args.instruct, - push_to_hub=args.push_to_hub, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/llava/convert_llava_weights_to_hf.py b/src/transformers/models/llava/convert_llava_weights_to_hf.py deleted file mode 100644 index 3582b9772c9c..000000000000 --- a/src/transformers/models/llava/convert_llava_weights_to_hf.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
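The converter below (and the LLaVa-NeXT variants that follow) maps the original checkpoint keys to the HF layout by simple substring replacement. A condensed sketch of that pattern, using only a hypothetical subset of the `KEYS_TO_MODIFY_MAPPING` defined below and an illustrative helper name:

```py
import torch

# Illustrative subset of the real mapping defined in the converter below.
RENAMES = {
    "model.mm_projector": "multi_modal_projector",
    "lm_head": "language_model.lm_head",
}

def rename_keys(state_dict):
    new_state_dict = {}
    for key, value in state_dict.items():
        if key.endswith(".inv_freq"):
            continue  # rotary inv_freq buffers are dropped by the converter
        for old, new in RENAMES.items():
            if old in key:
                key = key.replace(old, new)
        new_state_dict[key] = value
    return new_state_dict

print(rename_keys({"model.mm_projector.0.weight": torch.zeros(1), "lm_head.weight": torch.zeros(1)}))
# {'multi_modal_projector.0.weight': tensor([0.]), 'language_model.lm_head.weight': tensor([0.])}
```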
-import argparse -import glob - -import torch -from huggingface_hub import file_exists, hf_hub_download, snapshot_download -from safetensors import safe_open - -from transformers import ( - AddedToken, - AutoConfig, - AutoImageProcessor, - AutoTokenizer, - LlavaConfig, - LlavaForConditionalGeneration, - LlavaProcessor, - SiglipVisionConfig, -) - - -EPILOG_TXT = """Example: - python transformers/src/transformers/models/llava/convert_llava_weights_to_hf.py --text_model_id lmsys/vicuna-7b-v1.5 --vision_model_id openai/clip-vit-large-patch14-336 --output_hub_path org/llava-v1.5-7b-conv --old_state_dict_id liuhaotian/llava-v1.5-7b - -Example for creating the old state dict file with Python: - - import torch - from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM - - # load model - kwargs = {"device_map": "auto", "torch_dtype": torch.float16} - model = LlavaLlamaForCausalLM.from_pretrained("liuhaotian/llava-v1.5-7b", low_cpu_mem_usage=True, **kwargs) - - # load vision tower - model.get_vision_tower().load_model() - - # Save state dict - torch.save(model.state_dict(), "tmp/hf_models/llava-v1.5-7b/model_state_dict.bin") -""" - -KEYS_TO_MODIFY_MAPPING = { - "model.vision_tower.": "", - ".vision_resampler": "", # all lmms-lab models do avg pooling, so no vision_resampler - "model.mm_projector": "multi_modal_projector", - "model": "model.model", - "vision_model.model": "vision_model", - "lm_head": "language_model.lm_head", - "model.model": "language_model.model", - "multi_modal_projector.0": "multi_modal_projector.linear_1", - "multi_modal_projector.2": "multi_modal_projector.linear_2", -} - - -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - # tied wieghts so lm.head is not saved. 
Let's clone to load state dict - if "lm_head.weight" not in original_state_dict: - original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone() - - if "model.image_newline" in original_state_dict: - # not used in the original implementation because "merge_type=flat" - del original_state_dict["model.image_newline"] - return original_state_dict - - -# used only for llava-interlave -# for ex: Qwen/Qwen1.5-0.5B-Chat google/siglip-so400m-patch14-384 lmms-lab/llava-next-interleave-qwen-0.5b -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value - return new_state_dict - - -def convert_llava_llama_to_hf(text_model_id, vision_model_id, output_hub_path, old_state_dict_id): - torch.set_default_dtype(torch.float16) - text_config = AutoConfig.from_pretrained(text_model_id) - - tokenizer = AutoTokenizer.from_pretrained(text_model_id) - tokenizer.add_tokens(AddedToken("", special=True, normalized=False), special_tokens=True) - if "Qwen" not in text_model_id: # qwen already has a pad token - tokenizer.add_special_tokens({"pad_token": ""}) - - image_processor = AutoImageProcessor.from_pretrained(vision_model_id) - processor = LlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) - - if "siglip" in vision_model_id: - vision_config = SiglipVisionConfig( - hidden_size=1152, - image_size=384, - intermediate_size=4304, - num_attention_heads=16, - num_hidden_layers=26, - patch_size=14, - vision_use_head=False, - ).to_dict() - else: - vision_config = None - - config = LlavaConfig( - text_config=text_config, - vision_config=vision_config, - ) - - # llms-lab interleeave models do not use any selection startegy except for last hidden state - if "Qwen" in text_model_id: - config.image_token_index = 151646 - if "siglip" in vision_model_id: - config.vision_feature_select_strategy = "full" - config.vision_feature_layer = -1 - else: - config.pad_token_id = 32001 - config.image_token_index = 32000 - - with torch.device("meta"): - model = LlavaForConditionalGeneration(config) - - # Some llava variants like microsoft/llava-med-v1.5-mistral-7b use safetensors to store weights - if file_exists(old_state_dict_id, "model_state_dict.bin"): - state_dict_path = hf_hub_download(old_state_dict_id, "model_state_dict.bin") - state_dict = torch.load(state_dict_path, map_location="cpu", weights_only=True) - else: - state_dict = load_original_state_dict(old_state_dict_id) - - state_dict = convert_state_dict_to_hf(state_dict) - model.load_state_dict(state_dict, strict=True, assign=True) - - pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data - mu = torch.mean(pre_expansion_embeddings, dim=0).float() - n = pre_expansion_embeddings.size()[0] - sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n - dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma) - - # We add an image token so we resize the model and pad to 64 for performance reasons - pad_shape = 64 - vocab_size = config.text_config.vocab_size - model.resize_token_embeddings(config.text_config.vocab_size + 2, pad_shape) - model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( - tuple( - (dist.sample() for _ in 
range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0])) - ), - dim=0, - ) - model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( - tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0]))), - dim=0, - ) - - model.push_to_hub(output_hub_path) - processor.push_to_hub(output_hub_path) - - -def main(): - parser = argparse.ArgumentParser( - epilog=EPILOG_TXT, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--text_model_id", - help="Hub location of the text model", - ) - parser.add_argument( - "--vision_model_id", - help="Hub location of the vision model", - ) - parser.add_argument( - "--output_hub_path", - help="Location on the hub of the converted model", - ) - parser.add_argument( - "--old_state_dict_id", - help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`", - ) - args = parser.parse_args() - convert_llava_llama_to_hf(args.text_model_id, args.vision_model_id, args.output_hub_path, args.old_state_dict_id) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py b/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py deleted file mode 100644 index 06edc5c9b1ad..000000000000 --- a/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py +++ /dev/null @@ -1,397 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert LLaVa-NeXT (LLaVa-1.6) checkpoints from the original repository. - -URL: https://github.com/haotian-liu/LLaVA/tree/main. - - -The command used to obtain original logits is the following: -python llava/eval/run_llava.py --model-path "liuhaotian/llava-v1.6-mistral-7b" --image-file "images/llava_v1_5_radar.jpg" --query "What is shown in this image?" --max_new_tokens 100 --temperature 0 - -Note: logits are tested with torch==2.1.2. 
-""" - -import argparse -import gc -import glob -import json -from pathlib import Path - -import requests -import torch -from accelerate import init_empty_weights -from huggingface_hub import hf_hub_download, snapshot_download -from PIL import Image -from safetensors import safe_open - -from transformers import ( - AddedToken, - AutoConfig, - AutoTokenizer, - LlavaNextConfig, - LlavaNextForConditionalGeneration, - LlavaNextImageProcessor, - LlavaNextProcessor, -) - - -KEYS_TO_MODIFY_MAPPING = { - "model.vision_tower.": "", - "model.mm_projector": "multi_modal_projector", - "model": "model.model", - "vision_model.model": "vision_model", - "lm_head": "language_model.lm_head", - "model.model": "language_model.model", - "multi_modal_projector.0": "multi_modal_projector.linear_1", - "multi_modal_projector.2": "multi_modal_projector.linear_2", - "language_model.model.image_newline": "image_newline", -} - - -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - return original_state_dict - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value.to(torch.float16) - return new_state_dict - - -def load_image(): - url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" - image = Image.open(requests.get(url, stream=True).raw) - return image - - -def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): - # load original config - filepath = hf_hub_download(repo_id=model_id, filename="config.json", repo_type="model") - # read json - with open(filepath) as f: - data = json.load(f) - print(data) - - if model_id == "liuhaotian/llava-v1.6-mistral-7b": - text_model_id = "mistralai/Mistral-7B-Instruct-v0.2" - image_token_index = 32000 - elif model_id == "liuhaotian/llava-v1.6-vicuna-7b": - text_model_id = "lmsys/vicuna-7b-v1.5" - image_token_index = 32000 - elif model_id == "liuhaotian/llava-v1.6-vicuna-13b": - text_model_id = "lmsys/vicuna-13b-v1.5" - image_token_index = 32000 - elif model_id == "liuhaotian/llava-v1.6-34b": - text_model_id = "NousResearch/Nous-Hermes-2-Yi-34B" - image_token_index = 64000 - elif model_id == "lmms-lab/llama3-llava-next-8b": - text_model_id = "meta-llama/Meta-Llama-3-8B-Instruct" - image_token_index = 128256 - elif model_id == "lmms-lab/llava-next-72b": - text_model_id = "Qwen/Qwen1.5-72B-Chat" - image_token_index = 151646 - elif model_id == "lmms-lab/llava-next-110b": - text_model_id = "Qwen/Qwen1.5-110B-Chat" - image_token_index = 151646 - - vision_model_id = data["mm_vision_tower"] - - torch.set_default_dtype(torch.float16) - text_config = AutoConfig.from_pretrained(text_model_id) - - use_fast = False if model_id == "liuhaotian/llava-v1.6-34b" else True - tokenizer = AutoTokenizer.from_pretrained(text_model_id, use_fast=use_fast) - tokenizer.add_tokens(AddedToken("", special=True, normalized=False), special_tokens=True) - - if model_id in ("liuhaotian/llava-v1.6-mistral-7b", "lmms-lab/llama3-llava-next-8b"): - # 
Mistral-7B doesn't have a padding token set yet - tokenizer.add_special_tokens({"pad_token": ""}) - - image_processor = LlavaNextImageProcessor.from_pretrained(vision_model_id) - processor = LlavaNextProcessor(tokenizer=tokenizer, image_processor=image_processor) - - config = LlavaNextConfig( - text_config=text_config.to_dict(), - image_grid_pinpoints=image_processor.image_grid_pinpoints, - use_image_newline_parameter=True, - image_token_index=image_token_index, - ) - - with init_empty_weights(): - model = LlavaNextForConditionalGeneration(config) - - # load original state dict - state_dict = load_original_state_dict(model_id) - state_dict = convert_state_dict_to_hf(state_dict) - model.load_state_dict(state_dict, assign=True) - model.eval() - - pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data - mu = torch.mean(pre_expansion_embeddings, dim=0).float() - n = pre_expansion_embeddings.size()[0] - sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n - dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma) - - # We add an image token so we resize the model - # Pad to 64 for performance reasons - # Qwen-based models have extra unused space in the vocab size already, so no need to resize - if model_id not in ["lmms-lab/llava-next-72b", "lmms-lab/llava-next-110b"]: - pad_shape = 64 - vocab_size = config.text_config.vocab_size - if model_id == "liuhaotian/llava-v1.6-34b": - # this one has 3 additional tokens, namely <|startoftext|>, <|endoftext|> and - num_tokens = vocab_size + 3 - else: - # this one has 2 additional tokens, namely and - num_tokens = vocab_size + 2 - model.resize_token_embeddings(num_tokens, pad_to_multiple_of=pad_shape) - model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( - tuple( - ( - dist.sample() - for _ in range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0]) - ) - ), - dim=0, - ) - model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( - tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0]))), - dim=0, - ) - - print(f"Saving model and processor for {model_id} to {pytorch_dump_folder_path}") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - # Make space so we can load the model properly now. - del state_dict - gc.collect() - - # Load everything back for inference tests in float32 because prev script was written as that - # Though it's mostly loaded in fp16 as original weights are in fp16 - model = LlavaNextForConditionalGeneration.from_pretrained(pytorch_dump_folder_path, device_map="auto") - processor = LlavaNextProcessor.from_pretrained(pytorch_dump_folder_path) - device = model.device - - # prepare inputs - image = load_image() - if model_id == "liuhaotian/llava-v1.6-mistral-7b": - prompt = "[INST] \nWhat is shown in this image? [/INST]" - elif model_id in ["liuhaotian/llava-v1.6-vicuna-7b", "liuhaotian/llava-v1.6-vicuna-13b"]: - prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? 
ASSISTANT:" - elif model_id == "liuhaotian/llava-v1.6-34b": - prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" - elif model_id == "lmms-lab/llama3-llava-next-8b": - prompt = "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|><|start_header_id|><|start_header_id|>user<|end_header_id|>\n\n\nWhat is shown in this image?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - elif model_id in ["lmms-lab/llava-next-72b", "lmms-lab/llava-next-110b"]: - prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n" - - inputs = processor(images=image, text=prompt, return_tensors="pt") - - # verify inputs - filepath = hf_hub_download(repo_id="nielsr/test-image", filename="llava_1_6_pixel_values.pt", repo_type="dataset") - original_pixel_values = torch.load(filepath, map_location="cpu") - assert torch.allclose(original_pixel_values, inputs.pixel_values.half()) - - if model_id == "liuhaotian/llava-v1.6-mistral-7b": - filepath = hf_hub_download(repo_id="nielsr/test-image", filename="llava_1_6_input_ids.pt", repo_type="dataset") - original_input_ids = torch.load(filepath, map_location="cpu") - # replace -200 by image_token_index (since we use token ID = 32000 for the image token) - original_input_ids[original_input_ids == -200] = image_token_index - assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist() - - elif model_id == "liuhaotian/llava-v1.6-34b": - filepath = hf_hub_download( - repo_id="nielsr/test-image", filename="llava_1_6_34b_input_ids.pt", repo_type="dataset" - ) - original_input_ids = torch.load(filepath, map_location="cpu") - # replace -200 by image_token_index - original_input_ids[original_input_ids == -200] = image_token_index - - assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist() - - image_sizes = torch.tensor([[899, 1024]]) - assert image_sizes[0].tolist() == inputs.image_sizes[0].tolist() - - # verify single forward pass - print("Single forward pass") - with torch.inference_mode(): - inputs = inputs.to(device) - outputs = model(**inputs) - print("Shape of logits:", outputs.logits.shape) - print("First values of logits:", outputs.logits[0, :3, :3]) - - if model_id == "liuhaotian/llava-v1.6-mistral-7b": - expected_slice = torch.tensor( - [[-4.8555, -4.6992, -0.1996], [-10.5703, -10.7344, -2.7246], [-7.0391, -7.3672, -0.2634]], - dtype=torch.float32, - device=device, - ) - elif model_id == "liuhaotian/llava-v1.6-vicuna-7b": - expected_slice = torch.tensor( - [[1.4883, 0.9976, -0.6992], [-9.7031, -5.7031, -1.5557], [-5.1328, -5.5586, 8.8281]], - dtype=torch.float32, - device=device, - ) - elif model_id == "liuhaotian/llava-v1.6-vicuna-13b": - expected_slice = torch.tensor( - [[-0.9614, 7.3125, 0.2106], [-7.2695, -8.5469, 3.6211], [-6.3750, -8.1875, 5.4688]], - dtype=torch.float32, - device=device, - ) - elif model_id == "liuhaotian/llava-v1.6-34b": - expected_slice = torch.tensor( - [[-9.0859, -9.1406, 5.9453], [-5.9570, -5.9766, 2.2754], [-5.7305, -5.7539, 4.0000]], - dtype=torch.float32, - device=device, - ) - elif model_id == "lmms-lab/llama3-llava-next-8b": - expected_slice = torch.tensor( - [[-3.9648, 1.1396, 3.3145], [-5.3594, -1.5654, -1.9619], [-12.3750, -10.6797, 
-9.3125]], - dtype=torch.float32, - device=device, - ) - elif model_id == "lmms-lab/llava-next-72b": - # Not yet checked against reference - expected_slice = torch.tensor( - [[3.7148, 3.9277, 3.4395], [-0.4341, 1.1387, 6.5117], [3.2324, 3.4688, 4.1133]], - dtype=torch.float32, - device=device, - ) - elif model_id == "lmms-lab/llava-next-110b": - # Not yet checked against reference - expected_slice = torch.tensor( - [[-2.5449, -1.6738, -2.0371], [1.0811, 3.4961, 5.0312], [1.7803, 2.5137, 2.4277]], - dtype=torch.float32, - device=device, - ) - else: - raise ValueError(f"Model {model_id} not supported") - - assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4) - print("Logits are ok!") - - # verify generation - output_ids = model.generate( - **inputs, - max_new_tokens=100, - use_cache=True, - ) - - generated_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip() - - print("Generated text:", repr(generated_text)) - - if model_id == "liuhaotian/llava-v1.6-mistral-7b": - expected_text = '[INST] \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot that displays data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular radar chart, there are several axes labeled with different metrics or benchmarks, such as "MMM-Vet," "MMM-Bench," "LLaVA-Bench," "SLED-Bench," "' - elif model_id == "liuhaotian/llava-v1.6-vicuna-7b": - expected_text = """A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human\'s questions. USER: \nWhat is shown in this image? ASSISTANT: The image appears to be a graphical representation of a benchmarking study comparing the performance of various models or systems. It\'s a scatter plot with a circular layout, where each point represents a different model or system, and the axes represent different metrics or dimensions of comparison.\n\nThe metrics are likely related to machine learning or artificial intelligence performance, as indicated by the terms like "BLIP-2," "Instruct BLIP," "POE," "QWA," "V""" - elif model_id == "liuhaotian/llava-v1.6-vicuna-13b": - expected_text = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? ASSISTANT: The image appears to be a radar chart, also known as a spider chart or star chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular radar chart, there are several variables represented:\n\n- MM-Vet\n- LLa-Va-Bench\n- SEED-Bench\n- MM" - elif model_id == "liuhaotian/llava-v1.6-34b": - expected_text = "<|im_start|> system\nAnswer the questions. <|im_start|> user\n\nWhat is shown in this image? 
<|im_start|> assistant\nThe image appears to be a radar chart, also known as a spider chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular chart, there are several datasets represented by different colors and labeled with various acronyms such as MM-Vet, LLaVA-Bench, SEED-Bench, MM-Bench-CN, MM-" - elif model_id == "lmms-lab/llama3-llava-next-8b": - expected_text = 'system\n\nYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.user\n\n\nWhat is shown in this image?assistant\n\n\nThe image shows a radar chart, also known as a spider chart or a web chart, which is a type of graph used to display multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. Each axis represents a different variable, and the values are plotted along each axis and connected to form a polygon.\n\nIn this particular radar chart, there are several axes labeled with different variables, such as "MM-Vet," "LL' - elif model_id == "lmms-lab/llava-next-72b": - expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image displays a radar chart, also known as a spider chart or a star chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. Each axis represents a different variable, and the value of each variable is represented by the distance from the center of the chart to the point where the axis intersects with the line representing that variable's value.\n\nIn this particular chart, there are several axes" - elif model_id == "lmms-lab/llava-next-110b": - expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image shows a radar chart comparing the performance of different models on various visual question answering (VQA) benchmarks. Each colored line represents a different model, and the distance from the center of the chart indicates the score or performance level of the model on a particular benchmark. The benchmarks are labeled around the edges of the chart, and include VQA v2, GQA, VizWiz, TextVQA, MMBench-CN, MME, and others. 
The chart allows for a" - else: - raise ValueError(f"Model {model_id} not supported") - - assert generated_text == expected_text - print("Generated text is ok!") - - # verify batched generation - print("Batched generation...") - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - cats_image = Image.open(requests.get(url, stream=True).raw) - - inputs = processor( - images=[image, cats_image], - text=[prompt, prompt], - padding=True, - return_tensors="pt", - ).to(device) - - for k, v in inputs.items(): - print(k, v.shape) - - print("Image sizes:", inputs.image_sizes) - - # make sure image_sizes are the same - # as otherwise batched generation doesn't work - inputs.image_sizes[1] = inputs.image_sizes[0] - - print("Batched generation...") - output_ids = model.generate( - **inputs, - max_new_tokens=20, - use_cache=True, - ) - - outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - print(outputs) - - if push_to_hub: - checkpoint_name = model_id.split("/")[-1] - print(f"Pushing to repo llava-hf/{checkpoint_name}-hf") - model.push_to_hub(f"llava-hf/{checkpoint_name}-hf") - processor.push_to_hub(f"llava-hf/{checkpoint_name}-hf") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_id", - help="Hub location of the model to convert", - default="liuhaotian/llava-v1.6-mistral-7b", - choices=[ - "liuhaotian/llava-v1.6-mistral-7b", - "liuhaotian/llava-v1.6-vicuna-7b", - "liuhaotian/llava-v1.6-vicuna-13b", - "liuhaotian/llava-v1.6-34b", - "lmms-lab/llama3-llava-next-8b", - "lmms-lab/llava-next-72b", - "lmms-lab/llava-next-110b", - ], - required=False, - ) - parser.add_argument( - "--pytorch_dump_folder_path", type=str, required=True, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - args = parser.parse_args() - - convert_llava_to_hf(args.model_id, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/llava_next_video/convert_llava_next_video_weights_to_hf.py b/src/transformers/models/llava_next_video/convert_llava_next_video_weights_to_hf.py deleted file mode 100644 index aae44eee97a0..000000000000 --- a/src/transformers/models/llava_next_video/convert_llava_next_video_weights_to_hf.py +++ /dev/null @@ -1,276 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert LLaVa-NeXT-Video checkpoints from the original repository. 
- -URL: https://github.com/LLaVA-VL/LLaVA-NeXT/tree/inference -""" - -import argparse -import glob -import json -from pathlib import Path - -import torch -from accelerate import init_empty_weights -from huggingface_hub import hf_hub_download, snapshot_download -from safetensors import safe_open - -from transformers import ( - AddedToken, - AutoConfig, - AutoTokenizer, - LlavaNextImageProcessor, - LlavaNextVideoConfig, - LlavaNextVideoForConditionalGeneration, - LlavaNextVideoImageProcessor, - LlavaNextVideoProcessor, -) - - -KEYS_TO_MODIFY_MAPPING = { - "model.vision_tower.": "", - ".vision_resampler": "", # all lmms-lab models do avg pooling, so no vision_resampler - "model.mm_projector": "multi_modal_projector", - "model": "model.model", - "vision_model.model": "vision_model", - "lm_head": "language_model.lm_head", - "model.model": "language_model.model", - "multi_modal_projector.0": "multi_modal_projector.linear_1", - "multi_modal_projector.2": "multi_modal_projector.linear_2", - "language_model.model.image_newline": "image_newline", -} - -# {{SYSTEM_PROMPT}} USER: \n{{PROMPT}} ASSISTANT:" assistant end with " " -chat_vicuna = ( - "{% for message in messages %}" - "{% if message['role'] == 'system' %}" - "{{ message['content'][0]['text'] }}" - "{% else %}" - "{{ message['role'].upper() + ': '}}" - "{% endif %}" - "{# Render all images first #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}" - "{{ '\n' }}" - "{% endfor %}" - "{# Render all text next #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}" - "{{ content['text'] + ' '}}" - "{% endfor %}" - "{% endfor %}" - "{% if add_generation_prompt %}" - "{{ 'ASSISTANT:' }}" - "{% endif %}" -) - -# "[INST] \nWhat is shown in this image? 
[/INST]" assistant end with " " -chat_mistral = ( - "{% for message in messages %}" - "{% if message['role'] == 'user' %}" - "{{ '[INST] ' }}" - "{# Render all images first #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}" - "{{ '\n' }}" - "{% endfor %}" - "{# Render all text next #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}" - "{{ content['text'] }}" - "{% endfor %}" - "{{' [/INST]' }}" - "{% elif message['role'] == 'assistant' %}" - r"{{ ' ' + message['content'][0]['text'] + '<\s> '}}" - "{% else %}" - "{{ raise_exception('Only user and assistant roles are supported!') }}" - "{% endif %}" - "{% endfor %}" -) - -# "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" -chat_yi = ( - "{% for message in messages %}" - "{{'<|im_start|>' + message['role'] + '\n'}}" - "{# Render all images first #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}" - "{{ '\n' }}" - "{% endfor %}" - "{# Render all text next #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}" - "{{ content['text'] }}" - "{% endfor %}" - "{{'<|im_end|>' + '\n'}}" - "{% endfor %}" - "{% if add_generation_prompt %}" - "{{ '<|im_start|>assistant\n' }}" - "{% endif %}" -) - -model2template = { - "lmms-lab/LLaVA-NeXT-Video-7B-32K": chat_mistral, - "lmms-lab/LLaVA-NeXT-Video-7B": chat_vicuna, - "lmms-lab/LLaVA-NeXT-Video-7B-DPO": chat_vicuna, - "lmms-lab/LLaVA-NeXT-Video-34B": chat_yi, - "lmms-lab/LLaVA-NeXT-Video-34B-DPO": chat_yi, -} - - -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - return original_state_dict - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value.to(torch.bfloat16) - return new_state_dict - - -def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): - # load original config - filepath = hf_hub_download(repo_id=model_id, filename="config.json", repo_type="model") - with open(filepath) as f: - data = json.load(f) - print(data) - - if model_id == "lmms-lab/LLaVA-NeXT-Video-7B-32K": - text_model_id = "mistralai/Mistral-7B-Instruct-v0.2" - video_token_index = 32000 - image_token_index = 32001 - overwrite_text_config = {} - elif model_id in ["lmms-lab/LLaVA-NeXT-Video-7B", "lmms-lab/LLaVA-NeXT-Video-7B-DPO"]: - text_model_id = "lmsys/vicuna-7b-v1.5" - video_token_index = 32000 - image_token_index = 32001 - overwrite_text_config = {"factor": 2.0, "type": "linear"} - elif model_id in ["lmms-lab/LLaVA-NeXT-Video-34B", "lmms-lab/LLaVA-NeXT-Video-34B-DPO"]: - text_model_id = "NousResearch/Nous-Hermes-2-Yi-34B" - video_token_index = 64000 - image_token_index = 64001 - overwrite_text_config = {} - else: - raise ValueError("Incorrect checkpoint referenced. 
Text model-id not identified!") - - vision_model_id = data["mm_vision_tower"] - - torch.set_default_dtype(torch.bfloat16) - text_config = AutoConfig.from_pretrained(text_model_id) - text_config = text_config.to_dict() - text_config.update(overwrite_text_config) - - tokenizer = AutoTokenizer.from_pretrained(text_model_id, use_fast=True, padding_side="left") - tokenizer.add_tokens(AddedToken("