From 7eac53c217d8f399231f9f390602a7478a5e07c5 Mon Sep 17 00:00:00 2001 From: Dong Hyuk Chang Date: Thu, 25 Jul 2024 14:15:24 -0400 Subject: [PATCH 1/8] Set default Torch version if YY.MM format is not met (#9776) * Torch major and minor versions set to current year and month if YY.MM formatting is not met Signed-off-by: Dong Hyuk Chang * Update nvidia torch version check Signed-off-by: Dong Hyuk Chang * Remove redundant import Signed-off-by: Dong Hyuk Chang * Formatting fix Signed-off-by: Dong Hyuk Chang --------- Signed-off-by: Dong Hyuk Chang Co-authored-by: Dong Hyuk Chang --- .../nlp/models/language_modeling/megatron_base_model.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 20d532d4764a..d0d239b21637 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -379,8 +379,11 @@ def _enable_nvidia_optimizations(self): # NVIDIA container version check nvidia_torch_version = os.getenv('NVIDIA_PYTORCH_VERSION', None) - # Support DLFW master container - if nvidia_torch_version == 'master': + def is_official_release_version(nvidia_torch_version): + return re.fullmatch("[0-9][0-9]\.[0-9][0-9].*", nvidia_torch_version) # "YY.MM.*" + + # Support DLFW dev container + if not is_official_release_version(nvidia_torch_version): nvidia_torch_version = datetime.now().strftime('%y.%m') if nvidia_torch_version is not None: @@ -389,7 +392,7 @@ def _enable_nvidia_optimizations(self): except Exception: NVIDIA_TORCH_MAJOR = 0 try: - NVIDIA_TORCH_MINOR = int(nvidia_torch_version.split('.')[1]) + NVIDIA_TORCH_MINOR = int(nvidia_torch_version.split('.')[1][:2]) except Exception: NVIDIA_TORCH_MINOR = 0 From 54916428c28f45394b6b3714934bcb6138109b16 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Thu, 25 Jul 2024 15:45:19 -0700 Subject: [PATCH 2/8] fix arg name (#9848) * fix arg name Signed-off-by: Sangkug Lym * cleanup Signed-off-by: Sangkug Lym * cleanup Signed-off-by: Sangkug Lym --------- Signed-off-by: Sangkug Lym Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> --- .../nlp/modules/common/megatron/adapters/parallel_adapters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 8d2d77c55cf2..7167eefda637 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -257,7 +257,7 @@ def __init__( te_version = packaging.version.Version(version("transformer-engine")) if te_version >= packaging.version.Version("1.5.0dev") and ( - not self.input_is_parallel and model_parallel_config.tp_comm_disable_qkv + not self.input_is_parallel and getattr(model_parallel_config, "tp_comm_overlap_disable_qkv", False) ): # TE 1.5 introduces the option `return_layernorm_output_gathered`, so the all gather # in the forward method is not needed, so set self._sequence_parallel to False From bd185cbb738f01103aa4652b8cca34784df5e2cc Mon Sep 17 00:00:00 2001 From: Selvaraj Anandaraj Date: Thu, 25 Jul 2024 16:32:59 -0700 Subject: [PATCH 3/8] Added defer wgrad support with mcore optim (#9896) * Added defer wgrad support with mcore optim Signed-off-by: 
Selvaraj Anandaraj * Apply isort and black reformatting Signed-off-by: sanandaraj5597 --------- Signed-off-by: Selvaraj Anandaraj Signed-off-by: sanandaraj5597 Co-authored-by: Selvaraj Anandaraj Co-authored-by: sanandaraj5597 --- .../nlp/models/language_modeling/megatron_gpt_model.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 1cdee42f580e..41d85d48e497 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -816,7 +816,9 @@ def training_step(self, dataloader_iter): ignore_virtual=True ): if ( - self.cfg.get('defer_embedding_wgrad_compute', False) and self.mcore_gpt + self.cfg.get('defer_embedding_wgrad_compute', False) + and self.mcore_gpt + and not self.use_mcore_dist_optim ): # Silently ignore the optimization if MCORE is not used module_list = self.get_model_module_list() if len(module_list) > 1: @@ -839,7 +841,9 @@ def training_step(self, dataloader_iter): ignore_virtual=True ): if ( - self.cfg.get('defer_embedding_wgrad_compute', False) and self.mcore_gpt + self.cfg.get('defer_embedding_wgrad_compute', False) + and self.mcore_gpt + and not self.use_mcore_dist_optim ): # Silently ignore the optimization if MCORE is not used module_list = self.get_model_module_list() if len(module_list) > 1: From fe1625947fbda50b32b914c12aafff86b1cb534f Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Thu, 25 Jul 2024 21:54:35 -0500 Subject: [PATCH 4/8] tutorial fixes (#9907) --- tutorials/llm/mamba/mamba.rst | 61 ++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/tutorials/llm/mamba/mamba.rst b/tutorials/llm/mamba/mamba.rst index 2ce5ee5f616b..e619e56c3747 100644 --- a/tutorials/llm/mamba/mamba.rst +++ b/tutorials/llm/mamba/mamba.rst @@ -37,18 +37,36 @@ Step-by-step Guide for Fine-Tuning Checkpoints from HuggingFace ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Obtain the desired checkpoint from HuggigFace. +Obtain the desired checkpoint from HuggigFace. The checkpoints below have different arrangement and there are a few preprocessing step for each. + +1. `Repository `__ for the Mamba2 and Mamba2-Hybrid models by `NVIDIA `__. + The checkpoint from this repository is located in files tab under ``release/mp_rank_00/model_optim_rng.pt``. The tokenizer is under files tab and is named ``mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model``. You need both of these for conversion to ``.nemo`` checkpoint. + +2. `Repository `__ for the Mamba2 models from the `Transformers are SSMs paper `__. + For checkpoints from this repository, run the following python script to convert the pytorch checkpoint (`pytorch_model.bin` in the HuggingFace model card) to a format similar to the 8b models: + + .. code:: python + + import torch + import os + + ckpt_path = "/path/to/pytorch_model.bin" + pyt_checkpoint = torch.load(ckpt_path) + new_ckpt_path = os.path.join(os.path.dirname(ckpt_path), f"wrapped_{os.path.basename(ckpt_path)}") + + # Save the new checkpoint which will be used as the input to the conversion script + torch.save({"model": pyt_checkpoint}, new_ckpt_path) + + You will use this ``wrapped_pytorch_model.bin`` for the conversion to ``.nemo`` in the next step. -* `Repository `__ for the Mamba2 models from the `Transformers are SSMs paper `__. 
-* `Repository `__ for the Mamba2 and Mamba2-Hybrid models by `NVIDIA `__. Convert the Pytorch Checkpoint to a NeMo Checkpoint ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -1. Get into NVIDIA Container +1. Get into the NVIDIA dev container from `NGC `_, or the 24.07 container (once released). -2. Run the conversion script from . For this conversion script, you should provide the PyTorch state dictionary of the model for ``input_name_or_path``, i.e. this argument only accepts a single ``state_dict``. +2. Run the conversion script from . For this conversion script, you should provide the checkpoint (and tokenizer in the case of 8b models) from the previous step for ``input_name_or_path``. .. code:: bash @@ -56,7 +74,9 @@ Convert the Pytorch Checkpoint to a NeMo Checkpoint --input_name_or_path \ --output_path \ --mamba_ssm_ngroups 8 \ - --precision bf16 + --precision bf16 \ + --tokenizer_path= + * Note: the ``mamba_ssm_ngroups`` parameter should be 1 for the Mamba2 models from the `Transformers are SSMs paper `__ (130m, 370m, 780m, 1.3b, and 2.7b) and 8 for the Mamba2 and Mamba2-Hybrid models by `NVIDIA `__ (both 8b). @@ -69,7 +89,7 @@ The HuggingFace checkpoint for the 8b model is for TP of size 1, and so is the ` .. code:: bash - python /opt/NeMo/examples/nlp/language_modeling/mamba_change_num_partition.py \ + CUDA_VISIBLE_DEVICES="0" python /opt/NeMo/examples/nlp/language_modeling/mamba_change_num_partition.py \ --model_file= \ --target_file= \ --tensor_model_parallel_size=1 \ @@ -79,7 +99,7 @@ The HuggingFace checkpoint for the 8b model is for TP of size 1, and so is the ` After running this script, a ``.nemo`` model along with the TP-size number of folders (4 in this example) will be generated in the target path. The folders for each rank will be displayed as ``mp_rank_00`` to ``mp_rank_03`` in this example. -* Note: You can only use Tensor Parallelism for the 8b models by `NVIDIA `__ (Mamba2 8b and Mamba2-Hybrid 8b). This is due to the fact that the ``nroups`` parameter in the model architecture should be divisible by TP size. ``nroups`` parameter is 8 for NVIDIA models and 1 for other models in the list. +* Note: You can only use Tensor Parallelism for the 8b models by `NVIDIA `__ (Mamba2 8b and Mamba2-Hybrid 8b). This is due to the fact that the ``mamba_ssm_ngroups`` parameter in the model architecture should be divisible by TP size. ``mamba_ssm_ngroups`` parameter is 8 for NVIDIA models and 1 for other models in the list. 
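Before launching the conversion, it can help to double-check this divisibility constraint for your target TP size. The snippet below is only an illustrative sketch (it is not part of any NeMo script); pass the ``mamba_ssm_ngroups`` value described above (8 for the NVIDIA 8b models, 1 for the other Mamba2 models):

.. code:: python

   def supported_tp_sizes(mamba_ssm_ngroups: int, max_tp: int = 8):
       # A TP size is usable only if it evenly divides mamba_ssm_ngroups.
       return [tp for tp in range(1, max_tp + 1) if mamba_ssm_ngroups % tp == 0]

   print(supported_tp_sizes(8))  # [1, 2, 4, 8] -> NVIDIA Mamba2/Mamba2-Hybrid 8b models
   print(supported_tp_sizes(1))  # [1] -> TP cannot be used for the smaller models
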
Run Fine-Tuning ^^^^^^^^^^^^^^^ @@ -93,21 +113,21 @@ Run Fine-Tuning MBS=4 GBS=128 - TP=2 # According to the saved checkpoint + TP=4 # According to the saved checkpoint SP=True # True only if TP>1 otherwise False SEQ_LEN=2048 - NUM_DEVICES=2 + NUM_DEVICES=8 PATH_TO_NEMO_MODEL= TRAIN_DATASET_PATH= VAL_DATASET_PATH= - CONFIG_PATH="/opt/NeMo/examples/nlp/language_modeling/conf/" + CONFIG_PATH="/opt/NeMo/examples/nlp/language_modeling/tuning/conf/" CONFIG_NAME="megatron_mamba_finetuning_config" SAVE_DIR= export NVTE_FUSED_ATTN=1 export NVTE_FLASH_ATTN=0 - torchrun --nproc_per_node=${NUM_DEVICES} + torchrun --nproc_per_node=${NUM_DEVICES} \ /opt/NeMo/examples/nlp/language_modeling/tuning/megatron_mamba_finetuning.py \ --config-path=${CONFIG_PATH} \ --config-name=${CONFIG_NAME} \ @@ -135,7 +155,6 @@ Run Fine-Tuning model.optim.name="distributed_fused_adam" \ model.data.train_ds.max_seq_length=${SEQ_LEN} \ model.data.validation_ds.max_seq_length=${SEQ_LEN} \ - model.mcore_gpt=True \ model.micro_batch_size=${MBS} \ model.global_batch_size=${GBS} \ model.restore_from_path=${PATH_TO_NEMO_MODEL} \ @@ -144,8 +163,6 @@ Run Fine-Tuning model.optim.lr=5e-6 \ model.optim.sched.min_lr=1e-7 -* Note: The tokenizer for 8b models (Mamba2 8b and MAmba2-Hybrid 8b) can be found in the `HuggingFace repository `__. Download it a set its path to ``TOKENIZER_MODEL`` (the tokenizer model file is under the name of ```mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model```). For other models, set ``TOKENIZER_MODEL=null`` since it will be downloaded from HuggingFace at the time of run. - Evaluating the Fine-Tuned Model ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -156,13 +173,12 @@ Evaluating the Fine-Tuned Model MBS=32 GBS=64 - TP=2 # According to the fine-tuned checkpoint + TP=4 # According to the fine-tuned checkpoint SP=True # True only if TP>1 otherwise False SEQ_LEN=2048 - NUM_DEVICES=2 + NUM_DEVICES=8 PATH_TO_NEMO_MODEL= - TRAIN_DATASET_PATH= - VAL_DATASET_PATH= + TEST_DATASET="[]" CONFIG_PATH="/opt/NeMo/examples/nlp/language_modeling/tuning/conf/" CONFIG_NAME="megatron_mamba_finetuning_config" SAVE_DIR= @@ -170,12 +186,11 @@ Evaluating the Fine-Tuned Model export NVTE_FUSED_ATTN=1 export NVTE_FLASH_ATTN=0 - TEST_DATASET="[]" CONFIG_PATH="/opt/NeMo/examples/nlp/language_modeling/tuning/conf/" CONFIG_NAME="megatron_mamba_generate_config" - MASTER_PORT=15008 torchrun --nproc_per_node=${NUM_DEVICES} /opt/NeMo/examples/nlp/language_modeling/tuning/megatron_mamba_generate.py \ + torchrun --nproc_per_node=${NUM_DEVICES} /opt/NeMo/examples/nlp/language_modeling/tuning/megatron_mamba_generate.py \ --config-path=${CONFIG_PATH} \ --config-name=${CONFIG_NAME} \ trainer.devices=${NUM_DEVICES} \ @@ -196,11 +211,11 @@ Evaluating the Fine-Tuned Model +model.peft.restore_from_ckpt.checkpoint_dir=False \ +model.peft.restore_from_ckpt.checkpoint_name=False \ model.tensor_model_parallel_size=${TP} \ - model.sequence_parallel=$SP \ model.micro_batch_size=${MBS} \ model.global_batch_size=${GBS} \ model.restore_from_path=${PATH_TO_NEMO_MODEL} \ model.data.test_ds.file_names=${TEST_DATASET} \ + model.data.test_ds.names=["squad"] \ model.data.test_ds.global_batch_size=${GBS} \ model.data.test_ds.micro_batch_size=${MBS} \ model.data.test_ds.tokens_to_generate=30 \ @@ -219,7 +234,7 @@ Evaluating the Fine-Tuned Model Inference ^^^^^^^^^ -For running inference on a Mamba model, one should use ``megatron_mamba_eval.py`` script. For example: +For running inference on a Mamba model, one should use ``megatron_mamba_eval.py`` script. 
This evaluation script currently requires tensor/model parallel (TP1) of size one. If your checkpoint has TP>1, use the TP conversion step from above and set ``target_tensor_model_parallel_size=1``. The following is an example for using evaluation script: .. code:: bash From 74c2caffdc81e83d67ce2aa5889c6c7c0aab6f72 Mon Sep 17 00:00:00 2001 From: Huu Tuong Tu <83907151+huutuongtu@users.noreply.github.com> Date: Fri, 26 Jul 2024 15:24:48 +0700 Subject: [PATCH 5/8] [TTS][Vietnamese] Add VietnameseCharsTokenizer (#9665) * Update tts_tokenizers.py * Update tokenizer_utils.py * Update test_tts_tokenizers.py * Apply isort and black reformatting Signed-off-by: huutuongtu * Signed-off-by: Tu [huutu12312vn@gmail.com](mailto:huutu12312vn@gmail.com) * Update ipa_lexicon.py - Signed-off-by: Tu [huutu12312vn@gmail.com](mailto:huutu12312vn@gmail.com) Signed-off-by: XuesongYang --------- Signed-off-by: huutuongtu Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: XuesongYang Co-authored-by: huutuongtu Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: XuesongYang --- .../tokenizers/text_to_speech/ipa_lexicon.py | 29 +++++++++++++- .../text_to_speech/tokenizer_utils.py | 5 +++ .../text_to_speech/tts_tokenizers.py | 38 +++++++++++++++++++ .../text_to_speech/test_tts_tokenizers.py | 13 +++++++ 4 files changed, 84 insertions(+), 1 deletion(-) diff --git a/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py b/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py index f4081735eb71..6b7677431fc7 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py +++ b/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py @@ -15,7 +15,9 @@ # fmt: off -SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR"] + +SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN"] + DEFAULT_PUNCTUATION = ( ',', '.', '!', '?', '-', @@ -48,6 +50,19 @@ 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ü', 'ẞ', ), + # ref: https://en.wikipedia.org/wiki/Vietnamese_alphabet + "vi-VN": ( + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', + 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', + 'U', 'V', 'W', 'X', 'Y', 'Z', 'Đ', 'Á', 'À', 'Ã', + 'Ả', 'Ạ', 'Ă', 'Ắ', 'Ằ', 'Ẵ', 'Ẳ', 'Ặ', 'Â', 'Ấ', + 'Ầ', 'Ẫ', 'Ẩ', 'Ậ', 'Ó', 'Ò', 'Õ', 'Ỏ', 'Ọ', 'Ô', + 'Ố', 'Ồ', 'Ỗ', 'Ổ', 'Ộ', 'Ơ', 'Ớ', 'Ờ', 'Ỡ', 'Ở', + 'Ợ', 'É', 'È', 'Ẽ', 'Ẻ', 'Ẹ', 'Ê', 'Ế', 'Ề', 'Ễ', + 'Ể', 'Ệ', 'Ú', 'Ù', 'Ũ', 'Ủ', 'Ụ', 'Ư', 'Ứ', 'Ừ', + 'Ữ', 'Ử', 'Ự', 'Í', 'Ì', 'Ĩ', 'Ỉ', 'Ị', 'Ý', 'Ỳ', + 'Ỹ', 'Ỷ', 'Ỵ', + ), "fr-FR": ( 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', @@ -104,6 +119,18 @@ 'ɽ','ʂ','ʈ','ʧ','ʉ','ʋ','ⱱ','ɤ','ʍ','χ','ʏ','ʑ','ʐ', 'ʔ','ʡ','ʕ','ʢ','ǀ','ǁ','ǂ','ᵻ', 'ʃ','ː', ), + "vi-VN": ( + 'a', 'ə', 'ɛ', 'e', 'i', 'o', 'ɔ', 'u', 'ɨ', + 'b', 'c', 'z', 'j', 'd', 'g', 'h', 'x', 'l', + 'm', 'n', 'ŋ', 'ɲ', 'p', 'f', 'w', 'r', 's', + 'ʃ', 't', 'ʈ', 'ʂ', 'v', 'ʔ', 'ɓ', 'ɗ', 'ɣ', + 'k', 'ʰ', 'ʷ', 'ɕ', 'ʑ', 'ʝ', '̚', '̟', 't͡', + '˧', 'ː', 'ɯ', '̀', '̄', '̌', '̂', 'ˀ', '͡', '˥', + '˩', '̤', '˨', 'ɹ', 'ʲ', '̯', 'ă', 'ə̆', 'ǐ', + '˦', 'æ', 'ɐ', + 'ɜ', 'ɡ', 'ɪ', 'ɬ' 'ɾ', 'ʊ', 'ʌ', 'ʒ', '̃', + '̩', 'θ', 'ᵻ', + ), } GRAPHEME_CHARACTER_CASES = ["upper", "lower", "mixed"] diff --git a/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py b/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py index 
542b18186846..c82d4f3cce19 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py +++ b/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py @@ -24,6 +24,7 @@ "english_text_preprocessing", "any_locale_text_preprocessing", "spanish_text_preprocessing", + "vietnamese_text_preprocessing", "italian_text_preprocessing", "any_locale_word_tokenize", "english_word_tokenize", @@ -201,3 +202,7 @@ def chinese_text_preprocessing(text: str) -> str: def french_text_preprocessing(text: str) -> str: return text.lower() + + +def vietnamese_text_preprocessing(text: str) -> str: + return text.lower() diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py index 4998fbba1ac9..6332c91cad46 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py +++ b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py @@ -31,6 +31,7 @@ french_text_preprocessing, italian_text_preprocessing, spanish_text_preprocessing, + vietnamese_text_preprocessing, ) from nemo.utils import logging from nemo.utils.decorators import experimental @@ -202,6 +203,43 @@ def __init__( ) +class VietnameseCharsTokenizer(BaseCharsTokenizer): + + _LOCALE = "vi-VN" + _CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="mixed") + + def __init__( + self, + chars=_CHARSET_STR, + punct=True, + apostrophe=True, + add_blank_at=None, + pad_with_space=False, + non_default_punct_list=None, + text_preprocessing_func=vietnamese_text_preprocessing, + ): + """Vietnamese grapheme tokenizer. + Args: + punct: Whether to reserve grapheme for basic punctuation or not. + apostrophe: Whether to use apostrophe or not. + add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None), + if None then no blank in labels. + pad_with_space: Whether to pad text with spaces at the beginning and at the end or not. + non_default_punct_list: List of punctuation marks which will be used instead default. + text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer. By default, it + would keep any word lowercase. + """ + super().__init__( + chars=chars, + punct=punct, + apostrophe=apostrophe, + add_blank_at=add_blank_at, + pad_with_space=pad_with_space, + non_default_punct_list=non_default_punct_list, + text_preprocessing_func=vietnamese_text_preprocessing, + ) + + class GermanCharsTokenizer(BaseCharsTokenizer): _LOCALE = "de-DE" diff --git a/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py b/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py index 2e2f9bdaaf36..2023d31696b1 100644 --- a/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py +++ b/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py @@ -21,6 +21,7 @@ IPATokenizer, ItalianCharsTokenizer, SpanishCharsTokenizer, + VietnameseCharsTokenizer, ) from nemo.collections.tts.g2p.models.i18n_ipa import IpaG2p @@ -124,6 +125,18 @@ def test_spanish_chars_tokenizer(self): assert chars == expected_output assert len(tokens) == len(input_text) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_vietnamese_chars_tokenizer(self): + input_text = "Xin chào các bạn." + expected_output = "xin chào các bạn." 
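+        # vietnamese_text_preprocessing lowercases the input, hence the lowercased expected output.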
+ + tokenizer = VietnameseCharsTokenizer() + chars, tokens = self._parse_text(tokenizer, input_text) + + assert chars == expected_output + assert len(tokens) == len(input_text) + @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_french_chars_tokenizer(self): From c81f7cf6cb1234bf51843f8fd192f72c52389407 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Fri, 26 Jul 2024 09:46:30 -0400 Subject: [PATCH 6/8] Integrate TRT-LLM v0.11 (#9705) * fix minor import bug Signed-off-by: Onur Yilmaz * Change imports to catch import level errros Signed-off-by: Onur Yilmaz * Update changed trt-llm apis Signed-off-by: Onur Yilmaz * Gemma working version Signed-off-by: Onur Yilmaz * llama working version Signed-off-by: Onur Yilmaz * gpt support Signed-off-by: Onur Yilmaz * remove old tests Signed-off-by: Onur Yilmaz * add new tests Signed-off-by: Onur Yilmaz * lora fix Signed-off-by: Onur Yilmaz * Add a few more params for trt-llm Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * Add params to the load Signed-off-by: Onur Yilmaz --------- Signed-off-by: Onur Yilmaz Signed-off-by: oyilmaz-nvidia Co-authored-by: oyilmaz-nvidia --- nemo/export/tensorrt_llm.py | 51 ++++++- .../trt_llm/converter/model_converter.py | 21 ++- nemo/export/trt_llm/converter/utils.py | 2 +- nemo/export/trt_llm/tensorrt_llm_build.py | 29 ++-- nemo/export/trt_llm/tensorrt_llm_run.py | 56 +++++++- scripts/deploy/nlp/deploy_triton.py | 12 +- tests/export/run.sh | 11 +- tests/infer_data_path.py | 136 +++--------------- 8 files changed, 169 insertions(+), 149 deletions(-) mode change 100644 => 100755 nemo/export/trt_llm/converter/model_converter.py mode change 100644 => 100755 nemo/export/trt_llm/converter/utils.py mode change 100644 => 100755 nemo/export/trt_llm/tensorrt_llm_build.py diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index b4299dfd8945..08f1e4fe74e6 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -38,16 +38,24 @@ is_nemo_file, load_nemo_model, ) -from nemo.export.trt_llm.qnemo import qnemo_to_tensorrt_llm -from nemo.export.trt_llm.qnemo.tokenizer_utils import get_nmt_tokenizer -from nemo.export.trt_llm.qnemo.utils import is_qnemo_checkpoint from nemo.export.trt_llm.tensorrt_llm_build import build_and_save_engine from nemo.export.trt_llm.tensorrt_llm_run import generate, generate_streaming, load, load_distributed, refit +LOGGER = logging.getLogger("NeMo") + +use_model_opt = True +try: + from nemo.export.trt_llm.qnemo import qnemo_to_tensorrt_llm + from nemo.export.trt_llm.qnemo.tokenizer_utils import get_nmt_tokenizer + from nemo.export.trt_llm.qnemo.utils import is_qnemo_checkpoint +except Exception as e: + LOGGER.warning(f"Cannot import the Model Optimizer, it will not be available. {type(e).__name__}: {e}") + use_model_opt = False + use_deploy = True try: from nemo.deploy.utils import cast_output, str_ndarray2list -except Exception: +except Exception as e: use_deploy = False @@ -67,8 +75,6 @@ def wrapper(*args, **kwargs): except Exception: use_pytriton = False -LOGGER = logging.getLogger("NeMo") - class TensorRTLLM(ITritonDeployable): """ @@ -95,6 +101,8 @@ def __init__( lora_ckpt_list: List[str] = None, load_model: bool = True, use_python_runtime: bool = True, + enable_chunked_context: bool = None, + max_tokens_in_paged_kv_cache: int = None, ): """ Args: @@ -104,9 +112,19 @@ def __init__( use_python_runtime (bool): whether to use python or c++ runtime. 
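+            enable_chunked_context (bool): enables chunked context processing; per the check above, only supported with the TensorRT-LLM C++ runtime (use_python_runtime=False).
+            max_tokens_in_paged_kv_cache (int): maximum number of tokens kept in the paged KV cache; only supported with the C++ runtime.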
""" + if use_python_runtime: + if enable_chunked_context is not None or max_tokens_in_paged_kv_cache is not None: + raise Exception( + "enable_chunked_context and max_tokens_in_paged_kv_cache options " + "work only with the TensorRT-LLM C++ runtime. Please set " + "use_python_runtime=False to use these options." + ) + self.model_dir = model_dir self.lora_ckpt_list = lora_ckpt_list self.use_python_runtime = use_python_runtime + self.enable_chunked_context = enable_chunked_context if enable_chunked_context is not None else False + self.max_tokens_in_paged_kv_cache = max_tokens_in_paged_kv_cache self.model = None self.tokenizer = None self.n_gpus = None @@ -148,6 +166,10 @@ def export( max_lora_rank: int = 64, max_num_tokens: int = None, opt_num_tokens: int = None, + max_seq_len: int = None, + multiple_profiles: bool = False, + gpt_attention_plugin: str = "auto", + gemm_plugin: str = "auto", ): """ Exports nemo checkpoints to TensorRT-LLM. @@ -179,6 +201,10 @@ def export( max_lora_rank (int): maximum lora rank. max_num_tokens (int): opt_num_tokens (int): + max_seq_len (int): + multiple_profiles: (bool): enables multiple profiles feature of TRT-LLM. Default = False + gpt_attention_plugin (str): enable the gpt attention plugin. Default = "auto" + gemm_plugin (str): enable the gpt plugin. Default = "auto" """ if n_gpus is not None: @@ -233,7 +259,12 @@ def export( tmp_dir = tempfile.TemporaryDirectory() nemo_export_dir = Path(tmp_dir.name) - if is_qnemo_checkpoint(nemo_checkpoint_path): + is_qnemo_ckpt = False + if use_model_opt: + if is_qnemo_checkpoint(nemo_checkpoint_path): + is_qnemo_ckpt = True + + if is_qnemo_ckpt: if os.path.isdir(nemo_checkpoint_path): nemo_export_dir = nemo_checkpoint_path else: @@ -310,6 +341,10 @@ def export( paged_context_fmha=paged_context_fmha, max_num_tokens=max_num_tokens, opt_num_tokens=opt_num_tokens, + max_seq_len=max_seq_len, + multiple_profiles=multiple_profiles, + gpt_attention_plugin=gpt_attention_plugin, + gemm_plugin=gemm_plugin, ) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") @@ -838,6 +873,8 @@ def _load(self): engine_dir=self.model_dir, lora_ckpt_list=self.lora_ckpt_list, use_python_runtime=self.use_python_runtime, + enable_chunked_context=self.enable_chunked_context, + max_tokens_in_paged_kv_cache=self.max_tokens_in_paged_kv_cache, ) self._load_prompt_tables() except Exception as error: diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py old mode 100644 new mode 100755 index 2a78f6833782..60d50316e9ed --- a/nemo/export/trt_llm/converter/model_converter.py +++ b/nemo/export/trt_llm/converter/model_converter.py @@ -22,6 +22,8 @@ from tensorrt_llm._utils import pad_vocab_size from tensorrt_llm.functional import non_gated_version from tensorrt_llm.layers import MoeConfig +from tensorrt_llm.models.gpt.config import GPTConfig +from tensorrt_llm.models.llama.config import LLaMAConfig from tensorrt_llm.models.modeling_utils import PretrainedConfig from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import ( @@ -33,6 +35,15 @@ LOGGER = logging.getLogger("NeMo") +def get_config(decoder_type, config): + if decoder_type == "llama": + return LLaMAConfig(**config) + elif decoder_type == "gpt" or decoder_type == "gptnext": + return GPTConfig(**config) + else: + return PretrainedConfig(**config) + + def prompt_convert(prompt_config, prompt_weights): if "task_templates" in prompt_config: prompt_templates = prompt_config["task_templates"] @@ -156,11 +167,13 @@ def 
model_to_trtllm_ckpt( 'rotary_pct': nemo_model_config.get('rotary_percentage', 1.0), 'rotary_base': nemo_model_config.get('rotary_base', 10000), 'moe_num_experts': nemo_model_config.get('num_moe_experts', 0), - 'moe_top_k': nemo_model_config.get('moe_router_topk'), + 'moe_top_k': nemo_model_config.get('moe_router_topk', 0), 'moe_normalization_mode': nemo_model_config.get( 'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE ), - 'moe_tp_mode': nemo_model_config.get('moe_tp_mode', MoeConfig.ParallelismMode.TENSOR_PARALLEL), + 'moe_tp_mode': nemo_model_config.get( + 'moe_tp_mode', 2 + ), # change MoeConfig.ParallelismMode.TENSOR_PARALLEL to 2 'logits_dtype': 'float32', 'world_size': world_size, 'tp_size': tensor_parallel_size, @@ -179,7 +192,7 @@ def model_to_trtllm_ckpt( if use_distributed_convert: config["gpus_per_node"] = gpus_per_node - model_configs.append(PretrainedConfig(**config)) + model_configs.append(get_config(decoder_type, config)) model_configs[0].mapping = tensorrt_llm.Mapping( world_size=world_size, rank=model_parallel_rank, @@ -258,7 +271,7 @@ def model_to_trtllm_ckpt( weights_dict_local["transformer.ln_f.bias"] = ln_f_bias config["gpus_per_node"] = gpus_per_node - model_config = PretrainedConfig(**config) + model_config = get_config(decoder_type, config) model_config.mapping = mapping model_configs.append(model_config) weights_dicts.append(weights_dict_local) diff --git a/nemo/export/trt_llm/converter/utils.py b/nemo/export/trt_llm/converter/utils.py old mode 100644 new mode 100755 index 3768ff4b2844..eab17167cbd5 --- a/nemo/export/trt_llm/converter/utils.py +++ b/nemo/export/trt_llm/converter/utils.py @@ -26,7 +26,7 @@ DECODER_MODEL_TYPE = { "gptj": 'GPTForCausalLM', "gptnext": 'GPTForCausalLM', - "llama": 'LLaMAForCausalLM', + "llama": 'LlamaForCausalLM', "gemma": 'GemmaForCausalLM', "falcon": 'FalconForCausalLM', } diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py old mode 100644 new mode 100755 index b329de2a3b18..d04698c318bf --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -45,41 +45,51 @@ def build_and_save_engine( paged_kv_cache: bool = True, remove_input_padding: bool = True, paged_context_fmha: bool = False, - custom_all_reduce: bool = True, + use_custom_all_reduce: bool = True, use_refit: bool = False, max_num_tokens: int = None, + max_seq_len: int = None, opt_num_tokens: int = None, max_beam_width: int = 1, tokens_per_block: int = 128, + multiple_profiles: bool = False, + gpt_attention_plugin: str = "auto", + gemm_plugin: str = "auto", ): + architecture = "LLaMAForCausalLM" if model_config.architecture == "LlamaForCausalLM" else model_config.architecture try: - model_cls = getattr(tensorrt_llm.models, model_config.architecture) + model_cls = getattr(tensorrt_llm.models, architecture) except: raise AttributeError(f"Could not find TRTLLM model type: {model_type}!") logger.set_level("info") - str_dtype = model_config.dtype plugin_config = PluginConfig() - plugin_config.set_gpt_attention_plugin(dtype=str_dtype) - plugin_config.set_gemm_plugin(dtype=str_dtype) - plugin_config.use_custom_all_reduce = custom_all_reduce - plugin_config.set_plugin("multi_block_mode", enable_multi_block_mode) + plugin_config.gpt_attention_plugin = gpt_attention_plugin + plugin_config.gemm_plugin = gemm_plugin + plugin_config.set_nccl_plugin(use_custom_all_reduce=use_custom_all_reduce) + plugin_config.multi_block_mode = enable_multi_block_mode if paged_kv_cache: 
plugin_config.enable_paged_kv_cache(tokens_per_block=tokens_per_block) else: plugin_config.paged_kv_cache = False plugin_config.remove_input_padding = remove_input_padding plugin_config.use_paged_context_fmha = paged_context_fmha + plugin_config.multiple_profiles = multiple_profiles + + if max_seq_len is None: + max_seq_len = max_input_len + max_output_len max_num_tokens, opt_num_tokens = check_max_num_tokens( max_num_tokens=max_num_tokens, opt_num_tokens=opt_num_tokens, + max_seq_len=max_seq_len, max_batch_size=max_batch_size, max_input_len=max_input_len, max_beam_width=max_beam_width, remove_input_padding=remove_input_padding, enable_context_fmha=plugin_config.context_fmha, tokens_per_block=tokens_per_block, + multiple_profiles=multiple_profiles, ) build_dict = { @@ -87,6 +97,7 @@ def build_and_save_engine( 'max_output_len': max_output_len, 'max_batch_size': max_batch_size, 'max_beam_width': max_beam_width, + 'max_seq_len': max_seq_len, 'max_num_tokens': max_num_tokens, 'opt_num_tokens': opt_num_tokens, 'max_prompt_embedding_table_size': max_prompt_embedding_table_size, @@ -95,11 +106,13 @@ def build_and_save_engine( 'strongly_typed': False, 'builder_opt': None, 'use_refit': use_refit, + 'multiple_profiles': multiple_profiles, } build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config) if use_lora_plugin is not None: - build_config.plugin_config.set_lora_plugin(use_lora_plugin) + # build_config.plugin_config.set_lora_plugin(use_lora_plugin) + # build_config.plugin_config._lora_plugin = use_lora_plugin lora_config = LoraConfig( lora_dir=lora_ckpt_list, lora_ckpt_source='nemo', diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index dbbf40cc3cf1..14ad0be699bb 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -26,15 +26,26 @@ import tensorrt_llm import torch from mpi4py.futures import MPIPoolExecutor -from tensorrt_llm.bindings import GptJsonConfig, GptSession, GptSessionConfig, KvCacheConfig, WorldConfig from tensorrt_llm.lora_manager import LoraManager from tensorrt_llm.quantization import QuantMode from tensorrt_llm.runtime import ModelConfig, ModelRunner, ModelRunnerCpp, SamplingConfig -from tensorrt_llm.runtime.model_runner_cpp import ModelRunnerCppGptSession + from transformers import PreTrainedTokenizer LOGGER = logging.getLogger("NeMo") +use_trtllm_bindings = True +try: + from tensorrt_llm.bindings import GptJsonConfig, GptSession, GptSessionConfig, KvCacheConfig, WorldConfig +except Exception as e: + use_trtllm_bindings = False + +use_cpp_gpt_session = True +try: + from tensorrt_llm.runtime.model_runner_cpp import ModelRunnerCppGptSession +except Exception as e: + use_cpp_gpt_session = False + @dataclass class TensorrtLLMHostContext: @@ -131,6 +142,8 @@ def _load( lora_ckpt_list=None, num_beams=1, use_python_runtime: bool = True, + enable_chunked_context: bool = False, + max_tokens_in_paged_kv_cache: int = None, ): """The impl of `load` API for on a single GPU worker.""" try: @@ -145,7 +158,7 @@ def _load( max_batch_size = config["build_config"]["max_batch_size"] max_input_len = config["build_config"]["max_input_len"] - max_output_len = config["build_config"]["max_output_len"] + # max_output_len = config["build_config"]["max_output_len"] max_beam_width = config["build_config"]["max_beam_width"] runtime_rank = tensorrt_llm.mpi_rank() @@ -166,8 +179,10 @@ def _load( rank=runtime_rank, max_batch_size=max_batch_size, max_input_len=max_input_len, - 
max_output_len=max_output_len, + # max_output_len=max_output_len, max_beam_width=max_beam_width, + enable_chunked_context=enable_chunked_context, + max_tokens_in_paged_kv_cache=max_tokens_in_paged_kv_cache, debug_mode=False, ) @@ -279,6 +294,8 @@ def load( lora_ckpt_list: List[str] = None, num_beams: int = 1, use_python_runtime: bool = True, + enable_chunked_context: bool = False, + max_tokens_in_paged_kv_cache: int = None, ) -> TensorrtLLMHostContext: """Loaded the compiled LLM model and run it. @@ -290,17 +307,42 @@ def load( config = json.load(f) world_size = config["pretrained_config"]["mapping"]["world_size"] if world_size == 1: - _load(tokenizer, engine_dir, lora_ckpt_list, num_beams, use_python_runtime) + _load( + tokenizer, + engine_dir, + lora_ckpt_list, + num_beams, + use_python_runtime, + enable_chunked_context, + max_tokens_in_paged_kv_cache, + ) executor = None elif tensorrt_llm.mpi_world_size() > 1: - _load(tokenizer, engine_dir, lora_ckpt_list, num_beams, use_python_runtime) + _load( + tokenizer, + engine_dir, + lora_ckpt_list, + num_beams, + use_python_runtime, + enable_chunked_context, + max_tokens_in_paged_kv_cache, + ) executor = None tensorrt_llm.mpi_barrier() else: executor = MPIPoolExecutor(max_workers=world_size) futures = [] for _ in range(world_size): - future = executor.submit(_load, tokenizer, engine_dir, lora_ckpt_list, num_beams, use_python_runtime) + future = executor.submit( + _load, + tokenizer, + engine_dir, + lora_ckpt_list, + num_beams, + use_python_runtime, + enable_chunked_context, + max_tokens_in_paged_kv_cache, + ) futures.append(future) for future in futures: future.result() diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index 9d9f0fa200f0..01be9ff63a0d 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -80,7 +80,7 @@ def get_args(argv): parser.add_argument( "-tmr", "--triton_model_repository", default=None, type=str, help="Folder for the trt-llm conversion" ) - parser.add_argument("-ng", "--num_gpus", default=1, type=int, help="Number of GPUs for the deployment") + parser.add_argument("-ng", "--num_gpus", default=None, type=int, help="Number of GPUs for the deployment") parser.add_argument("-tps", "--tensor_parallelism_size", default=1, type=int, help="Tensor parallelism size") parser.add_argument("-pps", "--pipeline_parallelism_size", default=1, type=int, help="Pipeline parallelism size") parser.add_argument( @@ -95,7 +95,13 @@ def get_args(argv): parser.add_argument("-mol", "--max_output_len", default=256, type=int, help="Max output length of the model") parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model") parser.add_argument("-mnt", "--max_num_tokens", default=None, type=int, help="Max number of tokens") + parser.add_argument("-msl", "--max_seq_len", default=None, type=int, help="Maximum number of sequence length") + parser.add_argument("-mp", "--multiple_profiles", default=False, action='store_true', help="Multiple profiles") parser.add_argument("-ont", "--opt_num_tokens", default=None, type=int, help="Optimum number of tokens") + parser.add_argument( + "-gap", "--gpt_attention_plugin", default="auto", type=str, help="dtype of gpt attention plugin" + ) + parser.add_argument("-gp", "--gemm_plugin", default="auto", type=str, help="dtype of gpt plugin") parser.add_argument( "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size" ) @@ -284,6 +290,7 @@ def 
get_trtllm_deployable(args): max_batch_size=args.max_batch_size, max_num_tokens=args.max_num_tokens, opt_num_tokens=args.opt_num_tokens, + max_seq_len=args.max_seq_len, use_parallel_embedding=args.use_parallel_embedding, max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, paged_kv_cache=(not args.no_paged_kv_cache), @@ -293,6 +300,9 @@ def get_trtllm_deployable(args): use_lora_plugin=args.use_lora_plugin, lora_target_modules=args.lora_target_modules, max_lora_rank=args.max_lora_rank, + multiple_profiles=args.multiple_profiles, + gpt_attention_plugin=args.gpt_attention_plugin, + gemm_plugin=args.gemm_plugin, ) except Exception as error: raise RuntimeError("An error has occurred during the model export. Error message: " + str(error)) diff --git a/tests/export/run.sh b/tests/export/run.sh index e534e4e87ee9..a2366f0634ea 100644 --- a/tests/export/run.sh +++ b/tests/export/run.sh @@ -36,12 +36,9 @@ python tests/export/nemo_export.py --model_name LLAMA2-13B-base-int4 --existing_ python tests/export/nemo_export.py --model_name LLAMA2-70B-base --existing_test_models --min_tps 2 --max_tps 8 python tests/export/nemo_export.py --model_name LLAMA2-70B-base-fp8 --existing_test_models --min_tps 8 --max_tps 8 python tests/export/nemo_export.py --model_name LLAMA2-70B-base-int4 --existing_test_models --min_tps 8 --max_tps 8 -python tests/export/nemo_export.py --model_name NV-GPT-8B-Base-4k --existing_test_models --min_tps 1 --max_tps 8 -python tests/export/nemo_export.py --model_name NV-GPT-8B-QA-4k --existing_test_models --min_tps 1 --max_tps 8 -python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-SFT --existing_test_models --min_tps 1 --max_tps 8 -python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-RLHF --existing_test_models --min_tps 1 --max_tps 8 -python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-SteerLM --existing_test_models --min_tps 1 --max_tps 8 -python tests/export/nemo_export.py --model_name FALCON-7B-base --existing_test_models --min_tps 1 --max_tps 2 +python tests/export/nemo_export.py --model_name FALCON-7B-base --existing_test_models --min_tps 1 --max_tps 1 python tests/export/nemo_export.py --model_name FALCON-40B-base --existing_test_models --min_tps 2 --max_tps 8 python tests/export/nemo_export.py --model_name STARCODER1-15B-base --existing_test_models --min_tps 1 --max_tps 1 -python tests/export/nemo_export.py --model_name GEMMA-base --existing_test_models --min_tps 1 --max_tps 1 \ No newline at end of file +python tests/export/nemo_export.py --model_name STARCODER2-15B-base --existing_test_models --min_tps 1 --max_tps 1 +python tests/export/nemo_export.py --model_name GEMMA-base --existing_test_models --min_tps 1 --max_tps 1 +python tests/export/nemo_export.py --model_name Nemotron3-22B-base-32k --existing_test_models --min_tps 2 \ No newline at end of file diff --git a/tests/infer_data_path.py b/tests/infer_data_path.py index 45850dcb366a..4125e77c0a1b 100644 --- a/tests/infer_data_path.py +++ b/tests/infer_data_path.py @@ -19,125 +19,22 @@ def get_infer_test_data(): test_data = {} - test_data["NV-GPT-8B-Base-4k"] = {} - test_data["NV-GPT-8B-Base-4k"]["model_type"] = "gptnext" - test_data["NV-GPT-8B-Base-4k"]["min_tps"] = 1 - test_data["NV-GPT-8B-Base-4k"]["location"] = "Local" - test_data["NV-GPT-8B-Base-4k"]["model_dir"] = "/tmp/NV-GPT-8B-Base-4k/nv-gpt-8b-base-4k_v1.0/" - test_data["NV-GPT-8B-Base-4k"][ + test_data["Nemotron3-22B-base-32k"] = {} + test_data["Nemotron3-22B-base-32k"]["model_type"] = "gptnext" + 
test_data["Nemotron3-22B-base-32k"]["min_tps"] = 2 + test_data["Nemotron3-22B-base-32k"]["location"] = "Local" + test_data["Nemotron3-22B-base-32k"]["model_dir"] = "/tmp/Nemotron3-22B-base-32k/" + test_data["Nemotron3-22B-base-32k"][ "checkpoint" - ] = "/opt/checkpoints/NV-GPT-8B-Base-4k/nv-gpt-8b-base-4k_v1.0/NV-GPT-8B-Base-4k.nemo" - test_data["NV-GPT-8B-Base-4k"]["p_tuning_checkpoint"] = "/opt/checkpoints/NV-GPT-8B-PTuning/nv-gpt-8B-ptuning.nemo" - test_data["NV-GPT-8B-Base-4k"]["prompt_template"] = [ + ] = "/opt/checkpoints/nemotron-3-22b-base-32k_v1.0/mcore-gpt3-22b-3_8T-pi32k-3_5T-cont-10k.nemo" + test_data["Nemotron3-22B-base-32k"]["prompt_template"] = [ "The capital of France is", "Largest animal in the sea is", "Fastest animal in the world is", ] - test_data["NV-GPT-8B-Base-4k"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] - test_data["NV-GPT-8B-Base-4k"]["max_output_len"] = 128 - test_data["NV-GPT-8B-Base-4k"]["max_batch_size"] = 10 - - test_data["NV-GPT-8B-Base-16k"] = {} - test_data["NV-GPT-8B-Base-16k"]["model_type"] = "gptnext" - test_data["NV-GPT-8B-Base-16k"]["min_tps"] = 1 - test_data["NV-GPT-8B-Base-16k"]["location"] = "Local" - test_data["NV-GPT-8B-Base-16k"]["model_dir"] = "/tmp/NV-GPT-8B-Base-16k/nv-gpt-8b-base-16k_v1.0/" - test_data["NV-GPT-8B-Base-16k"][ - "checkpoint" - ] = "/opt/checkpoints/NV-GPT-8B-Base-16k/nv-gpt-8b-base-16k_v1.0/NV-GPT-8B-Base-16k.nemo" - test_data["NV-GPT-8B-Base-16k"]["prompt_template"] = [ - "The capital of France is", - "Largest animal in the sea is", - "Fastest animal in the world is", - ] - test_data["NV-GPT-8B-Base-16k"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] - test_data["NV-GPT-8B-Base-16k"]["max_output_len"] = 128 - test_data["NV-GPT-8B-Base-16k"]["max_batch_size"] = 20 - - test_data["NV-GPT-8B-QA-4k"] = {} - test_data["NV-GPT-8B-QA-4k"]["model_type"] = "gptnext" - test_data["NV-GPT-8B-QA-4k"]["min_tps"] = 1 - test_data["NV-GPT-8B-QA-4k"]["location"] = "Local" - test_data["NV-GPT-8B-QA-4k"]["model_dir"] = "/tmp/NV-GPT-8B-QA-4k/nv-gpt-8b-qa-4k_v1.0/" - test_data["NV-GPT-8B-QA-4k"][ - "checkpoint" - ] = "/opt/checkpoints/NV-GPT-8B-QA-4k/nv-gpt-8b-qa-4k_v1.0/NV-GPT-8B-QA-4k.nemo" - test_data["NV-GPT-8B-QA-4k"]["prompt_template"] = [ - "What is the capital of France?", - "What is the largest animal in the sea?", - "What is the fastest animal in the world?", - ] - test_data["NV-GPT-8B-QA-4k"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] - test_data["NV-GPT-8B-QA-4k"]["max_output_len"] = 96 - test_data["NV-GPT-8B-QA-4k"]["max_batch_size"] = 20 - - test_data["NV-GPT-8B-Chat-4k-SFT"] = {} - test_data["NV-GPT-8B-Chat-4k-SFT"]["model_type"] = "gptnext" - test_data["NV-GPT-8B-Chat-4k-SFT"]["min_tps"] = 1 - test_data["NV-GPT-8B-Chat-4k-SFT"]["location"] = "Local" - test_data["NV-GPT-8B-Chat-4k-SFT"]["model_dir"] = "/tmp/NV-GPT-8B-Chat-4k-SFT/nv-gpt-8b-chat-4k-sft_v1.0/" - test_data["NV-GPT-8B-Chat-4k-SFT"][ - "checkpoint" - ] = "/opt/checkpoints/NV-GPT-8B-Chat-4k-SFT/nv-gpt-8b-chat-4k-sft_v1.0/NV-GPT-8B-Chat-4k-SFT.nemo" - test_data["NV-GPT-8B-Chat-4k-SFT"]["prompt_template"] = [ - "What is the capital of France?", - "What is the largest animal in the sea?", - "What is the fastest animal in the world?", - ] - test_data["NV-GPT-8B-Chat-4k-SFT"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] - test_data["NV-GPT-8B-Chat-4k-SFT"]["max_output_len"] = 256 - test_data["NV-GPT-8B-Chat-4k-SFT"]["max_batch_size"] = 5 - - test_data["NV-GPT-8B-Chat-4k-RLHF"] = {} - test_data["NV-GPT-8B-Chat-4k-RLHF"]["model_type"] = "gptnext" - 
test_data["NV-GPT-8B-Chat-4k-RLHF"]["min_tps"] = 1 - test_data["NV-GPT-8B-Chat-4k-RLHF"]["location"] = "Local" - test_data["NV-GPT-8B-Chat-4k-RLHF"]["model_dir"] = "/tmp/NV-GPT-8B-Chat-4k-RLHF/nv-gpt-8b-chat-4k-rlhf_v1.0/" - test_data["NV-GPT-8B-Chat-4k-RLHF"][ - "checkpoint" - ] = "/opt/checkpoints/NV-GPT-8B-Chat-4k-RLHF/nv-gpt-8b-chat-4k-rlhf_v1.0/NV-GPT-8B-Chat-4k-RLHF.nemo" - test_data["NV-GPT-8B-Chat-4k-RLHF"]["prompt_template"] = [ - "What is the capital of France?", - "What is the largest animal in the sea?", - "What is the fastest animal in the world?", - ] - test_data["NV-GPT-8B-Chat-4k-RLHF"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] - test_data["NV-GPT-8B-Chat-4k-RLHF"]["max_output_len"] = 128 - test_data["NV-GPT-8B-Chat-4k-RLHF"]["max_batch_size"] = 10 - - test_data["NV-GPT-8B-Chat-4k-SteerLM"] = {} - test_data["NV-GPT-8B-Chat-4k-SteerLM"]["model_type"] = "gptnext" - test_data["NV-GPT-8B-Chat-4k-SteerLM"]["min_tps"] = 1 - test_data["NV-GPT-8B-Chat-4k-SteerLM"]["location"] = "Local" - test_data["NV-GPT-8B-Chat-4k-SteerLM"][ - "model_dir" - ] = "/tmp/NV-GPT-8B-Chat-4k-SteerLM/nv-gpt-8b-chat-4k-steerlm_v1.0/" - test_data["NV-GPT-8B-Chat-4k-SteerLM"][ - "checkpoint" - ] = "/opt/checkpoints/NV-GPT-8B-Chat-4k-SteerLM/nv-gpt-8b-chat-4k-steerlm_v1.0/NV-GPT-8B-Chat-4k-SteerLM.nemo" - test_data["NV-GPT-8B-Chat-4k-SteerLM"]["prompt_template"] = [ - "What is the capital of France?", - "What is the largest animal in the sea?", - "What is the fastest animal in the world?", - ] - test_data["NV-GPT-8B-Chat-4k-SteerLM"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] - test_data["NV-GPT-8B-Chat-4k-SteerLM"]["max_output_len"] = 128 - test_data["NV-GPT-8B-Chat-4k-SteerLM"]["max_batch_size"] = 10 - - test_data["GPT-43B-Base"] = {} - test_data["GPT-43B-Base"]["model_type"] = "gptnext" - test_data["GPT-43B-Base"]["min_tps"] = 2 - test_data["GPT-43B-Base"]["location"] = "Local" - test_data["GPT-43B-Base"]["model_dir"] = "/tmp/GPT-43B-Base/gpt-43B-base/" - test_data["GPT-43B-Base"]["checkpoint"] = "/opt/checkpoints/GPT-43B-Base/gpt-43B-base.nemo" - test_data["GPT-43B-Base"]["prompt_template"] = [ - "The capital of France is", - "Largest animal in the sea is", - "Fastest animal in the world is", - ] - test_data["GPT-43B-Base"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] - test_data["GPT-43B-Base"]["max_output_len"] = 128 - test_data["GPT-43B-Base"]["max_batch_size"] = 10 + test_data["Nemotron3-22B-base-32k"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] + test_data["Nemotron3-22B-base-32k"]["max_output_len"] = 128 + test_data["Nemotron3-22B-base-32k"]["max_batch_size"] = 10 test_data["LLAMA2-7B-base"] = {} test_data["LLAMA2-7B-base"]["model_type"] = "llama" @@ -367,6 +264,17 @@ def get_infer_test_data(): test_data["STARCODER1-15B-base"]["max_output_len"] = 128 test_data["STARCODER1-15B-base"]["max_batch_size"] = 5 + test_data["STARCODER2-15B-base"] = {} + test_data["STARCODER2-15B-base"]["model_type"] = "starcoder" + test_data["STARCODER2-15B-base"]["min_tps"] = 1 + test_data["STARCODER2-15B-base"]["location"] = "Local" + test_data["STARCODER2-15B-base"]["model_dir"] = "/tmp/STARCODER2-15B-base/trt_llm_model-1/" + test_data["STARCODER2-15B-base"]["checkpoint"] = "/opt/checkpoints/starcoder-2_15b_4k_vfinal/4194b.nemo" + test_data["STARCODER2-15B-base"]["prompt_template"] = ["def fibonnaci(n"] + test_data["STARCODER2-15B-base"]["expected_keyword"] = ["fibonnaci"] + test_data["STARCODER2-15B-base"]["max_output_len"] = 128 + test_data["STARCODER2-15B-base"]["max_batch_size"] = 5 
+ test_data["GEMMA-base"] = {} test_data["GEMMA-base"]["model_type"] = "gemma" test_data["GEMMA-base"]["min_tps"] = 1 From fc0e4ab09025a8584343c8c9818f748a62597c1a Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 26 Jul 2024 08:27:45 -0700 Subject: [PATCH 7/8] add code owner (#9917) --- .github/CODEOWNERS | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .github/CODEOWNERS diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 000000000000..ef7434efe377 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,2 @@ +.github/ @pablo-garay @ko3n1g +Dockerfile.ci @pablo-garay @ko3n1g From 67aee7fb975e44bdebe1840527725a095b22580c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 26 Jul 2024 19:39:58 +0400 Subject: [PATCH 8/8] Fix Docker build. Make Dockerfile consistent with CI (#9784) (#9915) * Fix Docker build. Make Dockerfile consistent with CI --------- Signed-off-by: Vladimir Bataev Co-authored-by: Vladimir Bataev --- Dockerfile.speech | 24 ++++++++++++++++++------ scripts/installers/install_k2.sh | 2 +- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/Dockerfile.speech b/Dockerfile.speech index cfe7d9eb5fdc..e7cc670a132d 100644 --- a/Dockerfile.speech +++ b/Dockerfile.speech @@ -62,23 +62,28 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* WORKDIR /workspace/ + +ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea +ARG MCORE_TAG=338af51452a53982d202e8386db6233adad1ce86 +ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c # Install megatron core, this can be removed once 0.3 pip package is released # We leave it here in case we need to work off of a specific commit in main RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout c7a1f82d761577e6ca0338d3521eac82f2aa0904 && \ + git checkout ${MCORE_TAG} && \ pip install . # Performance optimizations for distributed optimizer: https://github.com/NVIDIA/apex/pull/1771 RUN git clone https://github.com/NVIDIA/apex.git && \ cd apex && \ - git checkout f058162b215791b15507bb542f22ccfde49c872d && \ - pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./ + git checkout ${APEX_TAG} && \ + pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir \ + --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./ # Transformer Engine 1.2.0 RUN git clone https://github.com/NVIDIA/TransformerEngine.git && \ cd TransformerEngine && \ - git fetch origin da30634a6c9ccdbb6c587b6c93b1860e4b038204 && \ + git fetch origin ${TE_TAG} && \ git checkout FETCH_HEAD && \ git submodule init && git submodule update && \ NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install . @@ -126,7 +131,9 @@ RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_k2.sh); INSTALL WORKDIR /tmp/nemo ENV LHOTSE_REQUIRE_TORCHAUDIO=0 COPY requirements . 
-RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done +# exclude requirements_vllm.txt, since `vllm==0.5.x` breaks the container due to hardcoded requirements `torch==2.3.0` +RUN for f in $(ls requirements*.txt | grep -v 'requirements_vllm.txt'); do \ + pip3 install --disable-pip-version-check --no-cache-dir -r $f; done # install flash attention RUN pip install flash-attn @@ -151,7 +158,12 @@ RUN /usr/bin/test -n "$NEMO_VERSION" && \ RUN --mount=from=nemo-src,target=/tmp/nemo,rw cd /tmp/nemo && pip install ".[all]" # Check install -RUN python -c "import nemo.collections.nlp as nemo_nlp" && \ +# NB: adjusting LD_LIBRARY_PATH (only here, should not be persistent!) is a temporary hack +# to avoid failure if CUDA is unavailable (`docker build` does not expose GPUs) +# The error is raised in NeMo Core, and the main reason is reinstalled Transformer-Engine; +RUN export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${CUDA_HOME}/compat/lib.real && \ + python -c "import nemo.collections.asr as nemo_asr" && \ + python -c "import nemo.collections.nlp as nemo_nlp" && \ python -c "import nemo.collections.tts as nemo_tts" && \ python -c "import nemo_text_processing.text_normalization as text_normalization" diff --git a/scripts/installers/install_k2.sh b/scripts/installers/install_k2.sh index 18d948209ab8..6de80ecae3eb 100755 --- a/scripts/installers/install_k2.sh +++ b/scripts/installers/install_k2.sh @@ -15,7 +15,7 @@ # limitations under the License. K2_REPO=https://github.com/k2-fsa/k2 -LATEST_RELEASE=525cfa5 # fix for PyTorch 2.2.0 +LATEST_RELEASE=5735fa7 # fix for PyTorch 2.4.0 # uncomment the following line after the next k2 version is released (>1.24.4) #LATEST_RELEASE=$(git -c 'versionsort.suffix=-' \ # ls-remote --exit-code --refs --sort='version:refname' --tags ${K2_REPO} '*.*' \