From fb2a9985fe0f7f10df485c7039074a00fc16383b Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Wed, 10 Jul 2024 19:37:37 +0300 Subject: [PATCH] unpin transformers version (#9606) * unpin transformers Signed-off-by: dimapihtar * guard deprecated imports Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * fix import guards Signed-off-by: dimapihtar * fix import guards Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * try fixing Signed-off-by: Chen Cui * disable HF tests Signed-off-by: Dmytro Pykhtar * try fixing Signed-off-by: Chen Cui * hard code model lists Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * hard code model lists Signed-off-by: Chen Cui --------- Signed-off-by: dimapihtar Signed-off-by: dimapihtar Signed-off-by: Chen Cui Signed-off-by: Dmytro Pykhtar Signed-off-by: cuichenx Co-authored-by: dimapihtar Co-authored-by: Chen Cui Co-authored-by: Dmytro Pykhtar Co-authored-by: cuichenx --- .../common/huggingface/huggingface_utils.py | 82 +++++++++++++++++-- requirements/requirements_lightning.txt | 2 +- 2 files changed, 75 insertions(+), 9 deletions(-) diff --git a/nemo/collections/nlp/modules/common/huggingface/huggingface_utils.py b/nemo/collections/nlp/modules/common/huggingface/huggingface_utils.py index cf692e07749d..d8f6936f7126 100644 --- a/nemo/collections/nlp/modules/common/huggingface/huggingface_utils.py +++ b/nemo/collections/nlp/modules/common/huggingface/huggingface_utils.py @@ -16,12 +16,6 @@ from typing import List, Optional from transformers import ( - ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, - BERT_PRETRAINED_MODEL_ARCHIVE_LIST, - CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST, - DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, - GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, - ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, AlbertConfig, AutoModel, BertConfig, @@ -41,6 +35,74 @@ __all__ = ["get_huggingface_lm_model", "get_huggingface_pretrained_lm_models_list", "VOCAB_FILE_NAME"] +# Manually specify the model archive lists since these are now removed in HF +# https://github.com/huggingface/transformers/blob/v4.40-release/src/transformers/models/deprecated/_archive_maps.py +ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "albert/albert-base-v1", + "albert/albert-large-v1", + "albert/albert-xlarge-v1", + "albert/albert-xxlarge-v1", + "albert/albert-base-v2", + "albert/albert-large-v2", + "albert/albert-xlarge-v2", + "albert/albert-xxlarge-v2", +] + +BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google-bert/bert-base-uncased", + "google-bert/bert-large-uncased", + "google-bert/bert-base-cased", + "google-bert/bert-large-cased", + "google-bert/bert-base-multilingual-uncased", + "google-bert/bert-base-multilingual-cased", + "google-bert/bert-base-chinese", + "google-bert/bert-base-german-cased", + "google-bert/bert-large-uncased-whole-word-masking", + "google-bert/bert-large-cased-whole-word-masking", + "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad", + "google-bert/bert-large-cased-whole-word-masking-finetuned-squad", + "google-bert/bert-base-cased-finetuned-mrpc", + "google-bert/bert-base-german-dbmdz-cased", + "google-bert/bert-base-german-dbmdz-uncased", + "cl-tohoku/bert-base-japanese", + "cl-tohoku/bert-base-japanese-whole-word-masking", + "cl-tohoku/bert-base-japanese-char", + "cl-tohoku/bert-base-japanese-char-whole-word-masking", + "TurkuNLP/bert-base-finnish-cased-v1", + "TurkuNLP/bert-base-finnish-uncased-v1", + "wietsedv/bert-base-dutch-cased", +] +CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "almanach/camembert-base", + "Musixmatch/umberto-commoncrawl-cased-v1", + "Musixmatch/umberto-wikipedia-uncased-v1", +] + +DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "distilbert-base-uncased", + "distilbert-base-uncased-distilled-squad", + "distilbert-base-cased", + "distilbert-base-cased-distilled-squad", + "distilbert-base-german-cased", + "distilbert-base-multilingual-cased", + "distilbert-base-uncased-finetuned-sst-2-english", +] +GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "openai-community/gpt2", + "openai-community/gpt2-medium", + "openai-community/gpt2-large", + "openai-community/gpt2-xl", + "distilbert/distilgpt2", +] +ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "FacebookAI/roberta-base", + "FacebookAI/roberta-large", + "FacebookAI/roberta-large-mnli", + "distilbert/distilroberta-base", + "openai-community/roberta-base-openai-detector", + "openai-community/roberta-large-openai-detector", +] + HUGGINGFACE_MODELS = { "BertModel": { @@ -94,7 +156,9 @@ def get_huggingface_lm_model( - pretrained_model_name: str, config_dict: Optional[dict] = None, config_file: Optional[str] = None, + pretrained_model_name: str, + config_dict: Optional[dict] = None, + config_file: Optional[str] = None, ): """ Returns lm model instantiated with Huggingface @@ -135,7 +199,9 @@ def get_huggingface_lm_model( raise ValueError(f"Use HuggingFace API directly in NeMo for {pretrained_model_name}") -def get_huggingface_pretrained_lm_models_list(include_external: bool = False,) -> List[str]: +def get_huggingface_pretrained_lm_models_list( + include_external: bool = False, +) -> List[str]: """ Returns the list of pretrained HuggingFace language models diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index c7e67d21a693..1b3397f69033 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -4,6 +4,6 @@ hydra-core>1.3,<=1.3.2 omegaconf<=2.3 pytorch-lightning>2.2.1 torchmetrics>=0.11.0 -transformers>=4.36.0,<=4.40.2 +transformers wandb webdataset>=0.2.86