From da140eb408b69501c2b6b57a59d68ba6af82ab4f Mon Sep 17 00:00:00 2001 From: Taejin Park Date: Sun, 7 Jul 2024 19:38:57 -0700 Subject: [PATCH 01/13] Fix the arguments of forward_for_export function in msdd_models (#9624) * Fix the arguments of forward_for_export function Signed-off-by: Taejin Park * Apply isort and black reformatting Signed-off-by: tango4j --------- Signed-off-by: Taejin Park Signed-off-by: tango4j Co-authored-by: tango4j --- nemo/collections/asr/models/msdd_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/asr/models/msdd_models.py b/nemo/collections/asr/models/msdd_models.py index 60aae8d1a4b1..c88275dcacd3 100644 --- a/nemo/collections/asr/models/msdd_models.py +++ b/nemo/collections/asr/models/msdd_models.py @@ -565,7 +565,7 @@ def forward( self.msdd._speaker_model.train() if len(detach_ids[0]) > 1: logits, embs_a = self.msdd._speaker_model.forward_for_export( - processed_signal=audio_signal[detach_ids[0]], processed_signal_len=audio_signal_len[detach_ids[0]] + audio_signal=audio_signal[detach_ids[0]], length=audio_signal_len[detach_ids[0]] ) embs[detach_ids[0], :] = embs_a From ab1d72235fb5c3add05169719e0572c8ac186aaa Mon Sep 17 00:00:00 2001 From: mikolajblaz Date: Mon, 8 Jul 2024 12:16:26 +0200 Subject: [PATCH 02/13] Change default parallel_save to False (#9632) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mikołaj Błaż --- nemo/utils/callbacks/dist_ckpt_io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/utils/callbacks/dist_ckpt_io.py b/nemo/utils/callbacks/dist_ckpt_io.py index 65eea827e851..144c07addaa8 100644 --- a/nemo/utils/callbacks/dist_ckpt_io.py +++ b/nemo/utils/callbacks/dist_ckpt_io.py @@ -205,7 +205,7 @@ def __init__( async_save: bool = False, torch_dist_multiproc: Optional[int] = None, assume_constant_structure: bool = False, - parallel_save: bool = True, + parallel_save: bool = False, parallel_load: bool = False, ): super().__init__() @@ -238,7 +238,7 @@ def from_config(cls, model_cfg: dict, async_save: bool = False): load_directly_on_device=model_cfg.get('dist_ckpt_load_on_device', True), async_save=async_save, torch_dist_multiproc=model_cfg.get('dist_ckpt_torch_dist_multiproc', None), - parallel_save=model_cfg.get('dist_ckpt_parallel_save', True), + parallel_save=model_cfg.get('dist_ckpt_parallel_save', False), parallel_load=model_cfg.get('dist_ckpt_parallel_load', False), ) From c0cd8d4567a6360b28f51751eabedd4bd1a76177 Mon Sep 17 00:00:00 2001 From: mikolajblaz Date: Mon, 8 Jul 2024 12:16:54 +0200 Subject: [PATCH 03/13] Unwrap ckpt_io for model opt (async save) (#9622) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mikołaj Błaż --- nemo/collections/nlp/parts/nlp_overrides.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 0b89bfda8dbd..e251690831cb 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -395,7 +395,7 @@ def save_checkpoint( save_sharded_modelopt_state( self.lightning_module.get_model_module_list(), ckpt_to_dir(filepath), - self.checkpoint_io.save_sharded_strategy, + self.unwrapped_checkpoint_io.save_sharded_strategy, prefix="model.", ) else: @@ -595,10 +595,7 @@ def remove_checkpoint(self, filepath: Union[str, Path]) -> None: @property def 
use_distributed_checkpointing(self): - checkpoint_io = self.checkpoint_io - while isinstance(checkpoint_io, _WrappingCheckpointIO): - checkpoint_io = checkpoint_io.checkpoint_io - has_dist_ckpt_io = HAVE_MEGATRON_CORE and isinstance(checkpoint_io, DistributedCheckpointIO) + has_dist_ckpt_io = HAVE_MEGATRON_CORE and isinstance(self.unwrapped_checkpoint_io, DistributedCheckpointIO) has_sharded_state_dict = ( hasattr(self.lightning_module, 'sharded_state_dict') and self.lightning_module.sharded_state_dict() is not None @@ -638,6 +635,14 @@ def restore_checkpoint_after_setup(self) -> bool: """ return True + @property + def unwrapped_checkpoint_io(self) -> CheckpointIO: + """Returns CheckpointIO unwrapped from any _WrappedCheckpointIO wrappers.""" + checkpoint_io = self.checkpoint_io + while isinstance(checkpoint_io, _WrappingCheckpointIO): + checkpoint_io = checkpoint_io.checkpoint_io + return checkpoint_io + class NLPDDPStrategyNotebook(NLPDDPStrategy): """Version of NLPDDPStrategy to be used in a Jupyter Notebook @@ -1011,6 +1016,8 @@ def dummy(): checkpoint_io.save_checkpoint(sharded_state_dict, dist_ckpt_dir) if HAVE_MODELOPT and hasattr(model, "get_model_module_list"): + while isinstance(checkpoint_io, _WrappingCheckpointIO): + checkpoint_io = checkpoint_io.checkpoint_io save_sharded_modelopt_state( model.get_model_module_list(), dist_ckpt_dir, From 575283a9d60037bab88baf675c27f21361bec933 Mon Sep 17 00:00:00 2001 From: huvunvidia <86480512+huvunvidia@users.noreply.github.com> Date: Mon, 8 Jul 2024 09:06:32 -0400 Subject: [PATCH 04/13] MCore T5 support for NeMo - Training (#9432) * huvu/mcore_t5 first commit from local * removing DEBUGGING prints * cleaning megatron_lm_encoder_decoder_model.py code * cleaning code * adding Github action test * only run mcore T5 test * only run mcore T5 test * only run mcore T5 test * only run mcore T5 test * reset .github/workflows/cicd-main.yml * reset .github/workflows/cicd-main.yml * adding condition self.mcore_t5 when running self.build_transformer_config() * refractor megatron_lm_encoder_decoder_model.py to not use self.model * only run T5-related tests * remove all self.model * reset cicd file * reset cicd file * updating codes remove duplicate if/else; adding mcore/transformer_engine to config file * adjust +model.mcore_t5=True * Apply isort and black reformatting Signed-off-by: huvunvidia --------- Signed-off-by: huvunvidia Co-authored-by: Huy Vu2 Co-authored-by: huvunvidia --- .github/workflows/cicd-main.yml | 75 ++++ .../conf/megatron_t5_config.yaml | 4 + .../language_modeling/megatron_base_model.py | 34 +- .../megatron_lm_encoder_decoder_model.py | 369 +++++++++++++++--- 4 files changed, 425 insertions(+), 57 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 44ecb03acc7b..d225ee3ab429 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3488,6 +3488,80 @@ jobs: rm -rf examples/nlp/language_modeling/t5_pretrain_results rm -rf examples/nlp/language_modeling/t5_index_mappings + L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.max_epochs=null \ + trainer.max_steps=10 \ + trainer.val_check_interval=10 \ + trainer.accumulate_grad_batches=1 \ + trainer.precision=bf16 
\ + model.megatron_amp_O2=True \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.mcore_t5=True \ + model.transformer_engine=True \ + model.tensor_model_parallel_size=2 \ + model.micro_batch_size=4 \ + model.global_batch_size=4 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.encoder.transformer_block_type='pre_ln' \ + model.decoder.transformer_block_type='pre_ln' \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False + + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.max_epochs=null \ + trainer.max_steps=10 \ + trainer.val_check_interval=10 \ + trainer.accumulate_grad_batches=1 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.mcore_t5=True \ + model.transformer_engine=True \ + model.tensor_model_parallel_size=2 \ + model.micro_batch_size=4 \ + model.global_batch_size=4 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.encoder.transformer_block_type='pre_ln' \ + model.decoder.transformer_block_type='pre_ln' \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings + L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4433,6 +4507,7 @@ jobs: - L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2 - L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2 - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2 + - L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2 - L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2 - L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2 - L2_Megatron_T5_Pretraining_and_Resume_Training_PP2 diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config.yaml index e51cfff420a3..439a0f1533bd 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config.yaml @@ -43,6 
+43,10 @@ exp_manager: model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} model: + # use T5 model from megatron.core + mcore_t5: False + transformer_engine: False + # model parallelism micro_batch_size: 4 global_batch_size: 8 # will use more micro batches to reach global batch size diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index f7b53a95c19a..7308d3db3f91 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -290,7 +290,11 @@ def _wrap_model_for_O2(self): Returns: The wrapped model. Returns a list of wrapped modules or a single wrapped module. """ - is_mcore_model = self.__dict__.get('mcore_gpt', False) or self.__dict__.get('mcore_bert', False) + is_mcore_model = ( + self.__dict__.get('mcore_gpt', False) + or self.__dict__.get('mcore_bert', False) + or self.__dict__.get('mcore_t5', False) + ) Float16Wrapper = MCoreFloat16Module if is_mcore_model else Float16Module @@ -305,15 +309,21 @@ def _wrap_model_for_O2(self): args = mcore_args if is_mcore_model else nemo_args # Model wrapper to convert both model and inputs to half precision - if isinstance(self.model, list): + if isinstance((self.enc_dec_model if hasattr(self, "enc_dec_model") else self.model), list): converted_model = [] - for module in self.model: + for module in self.enc_dec_model if hasattr(self, "enc_dec_model") else self.model: args['module'] = module converted_model.append(Float16Wrapper(**args)) - self.model = converted_model + if hasattr(self, "enc_dec_model"): + self.enc_dec_model = converted_model + else: + self.model = converted_model else: - args['module'] = self.model - self.model = Float16Wrapper(**args) + args['module'] = self.enc_dec_model if hasattr(self, "enc_dec_model") else self.model + if hasattr(self, "enc_dec_model"): + self.enc_dec_model = Float16Wrapper(**args) + else: + self.model = Float16Wrapper(**args) args.pop('module') def get_model_module_list(self): @@ -323,10 +333,10 @@ def extract_module(model): else: return model - if isinstance(self.model, list): - return list(map(extract_module, self.model)) + if isinstance((self.enc_dec_model if hasattr(self, "enc_dec_model") else self.model), list): + return list(map(extract_module, (self.enc_dec_model if hasattr(self, "enc_dec_model") else self.model))) else: - return [extract_module(self.model)] + return [extract_module(self.enc_dec_model if hasattr(self, "enc_dec_model") else self.model)] def _reconfigure_limit_batches(self, limit_batches, dataloader, mode): """ @@ -1022,7 +1032,11 @@ def is_data_parallel_rank_zero(self): def _get_total_params_across_model_parallel_groups_gpt_bert(self): """Returns the total number of parameters across all model parallel groups.""" - is_mcore_model = self.__dict__.get('mcore_gpt', False) or self.__dict__.get('mcore_bert', False) + is_mcore_model = ( + self.__dict__.get('mcore_gpt', False) + or self.__dict__.get('mcore_bert', False) + or self.__dict__.get('mcore_t5', False) + ) # log number of parameters model = self.get_model_module_list() if isinstance(model, list): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 8fe215bcc9af..6609b1aff303 100644 --- 
a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -32,11 +32,13 @@ from nemo.collections.nlp.modules.common.megatron.build_model import build_model from nemo.collections.nlp.modules.common.megatron.module import Float16Module from nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder import ( + AttnMaskType, MegatronTokenLevelEncoderDecoderModule, ) from nemo.collections.nlp.modules.common.megatron.utils import ( ApexGuardDefaults, average_losses_across_data_parallel_group, + build_attention_mask_3d, get_params_for_weight_decay_optimization, ) from nemo.collections.nlp.modules.common.text_generation_utils import ( @@ -62,7 +64,16 @@ try: from megatron.core import parallel_state, tensor_parallel from megatron.core.enums import ModelType + from megatron.core.models.T5 import T5Model as MCoreT5Model + from megatron.core.models.T5.t5_spec import ( + get_t5_decoder_with_local_block_spec, + get_t5_decoder_with_transformer_engine_block_spec, + get_t5_encoder_with_local_block_spec, + get_t5_encoder_with_transformer_engine_block_spec, + ) from megatron.core.pipeline_parallel.schedules import get_forward_backward_func + from megatron.core.transformer.module import Float16Module as MCoreFloat16Module + from megatron.core.transformer.transformer_config import TransformerConfig HAVE_MEGATRON_CORE = True @@ -96,6 +107,13 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): # Make sure trainer.accumulate_grad_batches is 1. self._validate_trainer() + self.mcore_t5 = cfg.get('mcore_t5', False) + + if self.mcore_t5: + self.transformer_config = self.build_transformer_config() + + self.megatron_amp_O2 = cfg.get('megatron_amp_O2', False) + # TODO: Currently does not support interleaved pipeline parallelism. # This means we can only use pipeline parallelism without the interleaved schedule. if isinstance(self.trainer.accelerator, CPUAccelerator): @@ -116,18 +134,18 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): # We don't need to call it explicitly? 
Since it is a pytorch lightning hook function # self.setup_optimizer_param_groups() - self.megatron_amp_O2 = cfg.get('megatron_amp_O2', False) - if self.megatron_amp_O2: if not self.with_distributed_adam: # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type - self.enc_dec_model.cuda(torch.cuda.current_device()) + if isinstance(self.enc_dec_model, list): + for module in self.enc_dec_model: + module.cuda(torch.cuda.current_device()) + else: + self.enc_dec_model.cuda(torch.cuda.current_device()) # Model wrapper to convert both model and inputs to half precision - self.enc_dec_model = Float16Module( - config=self.model_parallel_config, module=self.enc_dec_model, precision=self.cfg.precision - ) + self._wrap_model_for_O2() self.enable_autocast = ( True if (not self.megatron_amp_O2) and (self.autocast_dtype in [torch.float16, torch.bfloat16]) else False @@ -250,38 +268,74 @@ def model_provider_func(self, pre_process, post_process, add_encoder, add_decode if parallel_state.get_pipeline_model_parallel_world_size() > 1 and self.cfg.encoder.arch == 'perceiver': raise ValueError(f"Perceivers with pipeline parallel > 1 is not supported yet.") - if not hasattr(self.cfg, 'embedding_init_method_std'): - embedding_init_method_std = self.cfg.encoder.init_method_std - else: - embedding_init_method_std = self.cfg.embedding_init_method_std + if hasattr(self, 'mcore_t5') and self.mcore_t5: + assert HAVE_MEGATRON_CORE, "Cannot use MCore T5 since Megatron Core is not found" + assert self.cfg.get( + 'share_token_embeddings', True + ), "share_token_embeddings must be True if using MCore T5 model" + if self.cfg.get('transformer_engine', False): + enc_dec_spec_fns = ( + get_t5_encoder_with_transformer_engine_block_spec, + get_t5_decoder_with_transformer_engine_block_spec, + ) + else: + enc_dec_spec_fns = ( + get_t5_encoder_with_local_block_spec, + get_t5_decoder_with_local_block_spec, + ) + + en_block_spec = enc_dec_spec_fns[0](self.cfg.encoder.num_layers) + de_block_spec = enc_dec_spec_fns[1](self.cfg.decoder.num_layers) + model = MCoreT5Model( + config=self.transformer_config, + transformer_encoder_layer_spec=en_block_spec, + transformer_decoder_layer_spec=de_block_spec, + vocab_size=self.padded_vocab_size, + max_sequence_length=self.cfg.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + fp16_lm_cross_entropy=self.cfg.get('fp16_lm_cross_entropy', False), + parallel_output=True, + share_embeddings_and_output_weights=self.cfg.get('share_decoder_tokens_head_embeddings', True), + position_embedding_type=self.cfg.get('position_embedding_type', 'learned_absolute'), + rotary_percent=self.cfg.get('rotary_percentage', 1.0), + seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), + ) - if not hasattr(self.cfg, 'embedding_dropout'): - embedding_dropout = self.cfg.encoder.hidden_dropout else: - embedding_dropout = self.cfg.embedding_dropout - - model = MegatronTokenLevelEncoderDecoderModule( - config=self.model_parallel_config, - encoder_cfg=self.cfg.encoder, - decoder_cfg=self.cfg.decoder, - vocab_size=self.padded_vocab_size, - max_position_embeddings=self.cfg.max_position_embeddings, - num_tokentypes=0, - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - fp16_cross_entropy=self.cfg.get('fp16_lm_cross_entropy', False), - precision=self.cfg.get('precision', 16), - embedding_init_method_std=embedding_init_method_std, - embedding_dropout=embedding_dropout, - 
label_smoothing=self.cfg.get('label_smoothing', 0.0), - add_encoder=add_encoder, - add_decoder=add_decoder, - share_token_embeddings=self.cfg.get('share_token_embeddings', True), - share_decoder_tokens_head_embeddings=self.cfg.get('share_decoder_tokens_head_embeddings', True), - tokens_head_bias=self.cfg.get('tokens_head_bias', True), - hiddens_cfg=self.cfg.get('hiddens', None), - ) + if not hasattr(self.cfg, 'embedding_init_method_std'): + embedding_init_method_std = self.cfg.encoder.init_method_std + else: + embedding_init_method_std = self.cfg.embedding_init_method_std + + if not hasattr(self.cfg, 'embedding_dropout'): + embedding_dropout = self.cfg.encoder.hidden_dropout + else: + embedding_dropout = self.cfg.embedding_dropout + + model = MegatronTokenLevelEncoderDecoderModule( + config=self.model_parallel_config, + encoder_cfg=self.cfg.encoder, + decoder_cfg=self.cfg.decoder, + vocab_size=self.padded_vocab_size, + max_position_embeddings=self.cfg.max_position_embeddings, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + fp16_cross_entropy=self.cfg.get('fp16_lm_cross_entropy', False), + precision=self.cfg.get('precision', 16), + embedding_init_method_std=embedding_init_method_std, + embedding_dropout=embedding_dropout, + label_smoothing=self.cfg.get('label_smoothing', 0.0), + add_encoder=add_encoder, + add_decoder=add_decoder, + share_token_embeddings=self.cfg.get('share_token_embeddings', True), + share_decoder_tokens_head_embeddings=self.cfg.get('share_decoder_tokens_head_embeddings', True), + tokens_head_bias=self.cfg.get('tokens_head_bias', True), + hiddens_cfg=self.cfg.get('hiddens', None), + ) + return model def forward( @@ -372,6 +426,25 @@ def training_step(self, dataloader_iter): # we zero grads here because we also call backward in the megatron fwd/bwd functions self._optimizer.zero_grad() + if self.with_distributed_adam: + # hack to enable overlapping param sync and forward compute + # note: the distributed optimizer monkey-patches each + # parameter's __getattribute__ function so that it can + # launch parameter all-gathers the first time the + # parameter is accessed after the optimizer step. However, + # PyTorch directly passes embedding parameters into a C++, + # bypassing this process. A quick-and-dirty hack is to + # manually interact with the parameter. 
+ modules = self.enc_dec_model if isinstance(self.enc_dec_model, list) else [self.enc_dec_model] + for module in modules: + if isinstance(module, (Float16Module, MCoreFloat16Module)): + module = module.module + if not self.mcore_t5: + module = module.language_model + if hasattr(module, 'embedding'): + for param in module.embedding.parameters(): + param.data_ptr() + loss_dict = self.fwd_bwd_step(dataloader_iter, False) if self.with_distributed_adam: @@ -380,8 +453,12 @@ def training_step(self, dataloader_iter): # from multiple simultaneous NCCL calls self._optimizer._finish_bucket_grad_sync() elif self.megatron_amp_O2: - # when using pipeline parallelism grads must be reduced after the pipeline (not asynchronously) - if self.cfg.get('pipeline_model_parallel_size', 1) > 1: + # when using pipeline parallelism grads must be all-reduced after the pipeline (not asynchronously) + if ( + self.cfg.get('pipeline_model_parallel_size', 1) > 1 + or self.cfg.get('sequence_parallel', False) + or not self.cfg.get('async_grad_allreduce', True) + ): # main grads are stored in the MainParamsOptimizer wrapper self._optimizer.allreduce_main_grads() else: @@ -596,15 +673,37 @@ def fwd_output_and_loss_func(dataloader_iter, model): batch_data, ) = batch - output = model( - encoder_input_ids, # enc_input_ids - encoder_attn_mask, # enc_attn_mask - decoder_input_ids, # dec_input_ids - decoder_attn_mask, # dec_attn_mask - None, # token_type_ids - lm_labels, # labels - batch_data, # batch_data - ) + if self.mcore_t5: + # attn mask logic follows megatron.data.t5_dataset.py in Megatron-LM + encoder_attn_mask_3d = build_attention_mask_3d( + encoder_attn_mask, encoder_attn_mask, AttnMaskType.padding + ) + decoder_attn_mask_3d = build_attention_mask_3d( + decoder_attn_mask, decoder_attn_mask, AttnMaskType.causal + ) + enc_dec_attn_mask_3d = build_attention_mask_3d( + decoder_attn_mask, encoder_attn_mask, AttnMaskType.padding + ) + + output = model( # model is MCoreT5Model + encoder_input_ids, # encoder_input_ids + decoder_input_ids, # decoder_input_ids + encoder_attn_mask_3d, # encoder_attn_mask + decoder_attn_mask_3d, # decoder_attn_mask + enc_dec_attn_mask_3d, # encoder_decoder_attn_mask + lm_labels, # lm_labels + ) + + else: + output = model( + encoder_input_ids, # enc_input_ids + encoder_attn_mask, # enc_attn_mask + decoder_input_ids, # dec_input_ids + decoder_attn_mask, # dec_attn_mask + None, # token_type_ids + lm_labels, # labels + batch_data, # batch_data + ) def loss_func(output_tensor): if isinstance(output_tensor, dict): @@ -983,6 +1082,36 @@ def setup(self, stage=None): ) == 'relative' and not self.cfg.decoder.get('relative_position_bias_self_attention_only', True): self.enc_dec_model.sync_initial_decoder_cross_attention_relative_position_embeddings() + if self.cfg.get('transformer_engine', False) or self.cfg.get('mcore_t5', False): + self.setup_transformer_engine_tp_groups() + + def setup_transformer_engine_tp_groups(self): + """This should be called after model parallel groups have been initialized + and only needs to be called when using Transformer Engine. + """ + for module in self.get_t5_module_list(): + """Set TP group + Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398 + """ + # Deep iterate but skip self to avoid infinite recursion. 
+ for index, child in enumerate(module.modules()): + if index == 0: + continue + if hasattr(child, "set_tensor_parallel_group"): + tp_group = parallel_state.get_tensor_model_parallel_group() + child.set_tensor_parallel_group(tp_group) + + def get_t5_module_list(self): + if isinstance(self.enc_dec_model, list): + return [ + model.module if isinstance(model, (Float16Module, MCoreFloat16Module)) else model + for model in self.enc_dec_model + ] + elif isinstance(self.enc_dec_model, (Float16Module, MCoreFloat16Module)): + return [self.enc_dec_model.module] + else: + return [self.enc_dec_model] + def setup_training_data(self, cfg): if hasattr(self, '_train_ds'): consumed_samples = self.compute_consumed_samples(0) @@ -1536,3 +1665,149 @@ def build_model_parallel_config(self): f'encoder.hidden_size not found in {self.cfg}. Set this in model_parallel_config if using pipeline parallelism.' ) return model_parallel_config + + def sharded_state_dict(self, prefix: str = '') -> Dict[str, Any]: + """ + Creates the sharded state dict which is used by dist_checkpoint to save the sharded tensors to disk. + When given the sharded_stated_dict, dist_checkpoint.load will load the tensors corresponding to + self.state_dict(). + The sharded tensor mapping is defined in the GPTModel class from mcore. + """ + if self.mcore_t5: + module_prefix = f'{prefix}model.' + sharded_state_dict = {} + for index, module in enumerate(self.get_model_module_list()): + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + # virtual pipline rank must be set so that GPTModel returns the correct sharded state dict + parallel_state.set_virtual_pipeline_model_parallel_rank(index) + module_sharded_state_dict = module.sharded_state_dict(prefix=module_prefix) + sharded_state_dict[f'model_{index}'] = module_sharded_state_dict + else: + module_sharded_state_dict = module.sharded_state_dict(prefix=module_prefix) + sharded_state_dict.update(module_sharded_state_dict) + + # reset vp rank + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + parallel_state.set_virtual_pipeline_model_parallel_rank(0) + + return sharded_state_dict + + def on_save_checkpoint(self, checkpoint) -> None: + """LightningModule hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-save-checkpoint + """ + if self.mcore_t5: + checkpoint['sharded_state_dict'] = self.sharded_state_dict() + else: + if isinstance(self.enc_dec_model, list): + for i in range(len(self.enc_dec_model)): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + checkpoint[f'model{i}'] = self.enc_dec_model[i].module.state_dict_for_save_checkpoint() + parallel_state.set_virtual_pipeline_model_parallel_rank(0) + + def on_load_checkpoint(self, checkpoint) -> None: + """LightningModule hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-load-checkpoint + """ + if self.mcore_t5: + if 'state_dict' in checkpoint and checkpoint['state_dict']: + for index, module in enumerate(self.get_model_module_list()): + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + checkpoint_state_dict = checkpoint['state_dict'][f'model_{index}'] + else: + checkpoint_state_dict = checkpoint['state_dict'] + # checkpoint_state_dict has "model." 
but module does not so we need to remove it when loading + checkpoint_state_dict = { + key.replace('model.', ''): checkpoint_state_dict.pop(key) + for key in list(checkpoint_state_dict.keys()) + } + + # addressing the current T5 mcore version's implementation of sharded_state_dict + checkpoint_state_dict['lm_head.output_layer.bias'] = checkpoint_state_dict['output_layer.bias'] + + module.load_state_dict(checkpoint_state_dict, strict=True) + else: + checkpoint['state_dict'] = {} + else: + if isinstance(self.enc_dec_model, list): + for i in range(len(self.enc_dec_model)): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + self.enc_dec_model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) + parallel_state.set_virtual_pipeline_model_parallel_rank(0) + + def build_transformer_config(self) -> TransformerConfig: + """Builds the megatron core gpt transformer config for the model. + For attributes in the nemo model config that are the same + as the megatron core TransformerConfig, we will use the value from the nemo model config. + For attributes in TransformerConfig that are not in the nemo model config, we add custom logic. + """ + + # for T5 model, transformers hyperparameters are stored in self.cfg.encoder/self.cfg.decoder + with open_dict(self.cfg): + for key in self.cfg.encoder: + print("{}: {}".format(key, self.cfg.encoder.get(key))) + OmegaConf.update(self.cfg, key, self.cfg.encoder.get(key)) + + normalization = self.cfg.get('normalization', 'layernorm') + + layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' + if normalization == 'layernorm': + normalization = 'LayerNorm' + elif normalization == 'rmsnorm': + normalization = 'RMSNorm' + elif normalization == 'layernorm1p': + normalization = 'LayerNorm' + layernorm_zero_centered_gamma = True + else: + logging.warning( + f"The normalization type: {normalization} might not be supported in megatron core." + f"Supported types are LayerNorm and RMSNorm." 
+ ) + + # any configs that are not in the nemo model config will be added here + model_specific_configs = { + 'layernorm_zero_centered_gamma': layernorm_zero_centered_gamma, + 'normalization': normalization, + } + + transformer_config = super().build_transformer_config() + + for key, value in model_specific_configs.items(): + setattr(transformer_config, key, value) + + # pass mcore customization configs directly to mcore + mcore_customization_config_dict = self.cfg.get('mcore_customization_config', {}) + for key, value in mcore_customization_config_dict.items(): + setattr(transformer_config, key, value) + + return transformer_config + + def setup_mcore_distributed_parallel(self): + """Set up mcore distributed data parallel""" + if self.with_distributed_adam and self.use_mcore_dist_optim: + config = get_model_config(self.enc_dec_model[0]) + ddp_config = DistributedDataParallelConfig( + grad_reduce_in_fp32=(self.cfg.optim.get('grad_sync_dtype', 'fp32') == 'fp32'), + overlap_grad_reduce=self.cfg.optim.get('overlap_grad_sync', False), + use_distributed_optimizer=True, + check_for_nan_in_grad=self.cfg.optim.get('check_for_nan_in_grad', False), + # mcore bucket_size is based on num of parameters, therefore not + # using bucket_cap_mb to configure bucket_size here + bucket_size=self.cfg.optim.get('ddp_bucket_size', None), + ) + self.enc_dec_model = [ + McoreDDP( + config, + ddp_config, + model_chunk, + data_parallel_group=parallel_state.get_data_parallel_group(with_context_parallel=True), + expert_data_parallel_group=parallel_state.get_data_modulo_expert_parallel_group(), + # Turn off bucketing for model_chunk 2 onwards, since communication for these + # model chunks is overlapped with compute anyway. + disable_bucketing=(model_chunk_idx > 0), + ) + for (model_chunk_idx, model_chunk) in enumerate(self.enc_dec_model) + ] + + # (TODO) Broadcast params from data parallel src rank to other data parallel ranks. + # by calling model_module.broadcast_params() if the model is randomly initialized. 
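
For reference, the MCore T5 forward path added in this patch expands the 2D padding masks into 3D masks before calling the core model: padding-style masks for encoder self-attention and for encoder-decoder cross-attention, and a causal mask for decoder self-attention. The standalone PyTorch sketch below illustrates that construction under the assumption that True marks blocked positions; the helper names (make_padding_mask_3d, make_causal_mask_3d) are illustrative only and are not the NeMo/Megatron build_attention_mask_3d API used in the diff.

import torch


def make_padding_mask_3d(source_mask: torch.Tensor, target_mask: torch.Tensor) -> torch.Tensor:
    # source_mask: [batch, src_len], target_mask: [batch, tgt_len]; 1 = real token, 0 = padding.
    # Returns [batch, tgt_len, src_len] with True where attention should be blocked.
    visible = target_mask.unsqueeze(2) * source_mask.unsqueeze(1)
    return visible == 0


def make_causal_mask_3d(mask: torch.Tensor) -> torch.Tensor:
    # Decoder self-attention: combine the padding mask with a strictly upper-triangular
    # causal constraint so position i cannot attend to positions after i.
    padding = make_padding_mask_3d(mask, mask)
    seq_len = mask.size(1)
    causal = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool, device=mask.device), diagonal=1)
    return padding | causal.unsqueeze(0)


if __name__ == "__main__":
    enc_mask = torch.tensor([[1, 1, 1, 0]])  # encoder tokens, last position is padding
    dec_mask = torch.tensor([[1, 1, 0]])     # decoder tokens, last position is padding
    enc_self_attn = make_padding_mask_3d(enc_mask, enc_mask)   # [1, 4, 4]
    dec_self_attn = make_causal_mask_3d(dec_mask)              # [1, 3, 3]
    cross_attn = make_padding_mask_3d(enc_mask, dec_mask)      # [1, 3, 4]
    print(enc_self_attn.shape, dec_self_attn.shape, cross_attn.shape)

In the patch itself the equivalent masks are produced by build_attention_mask_3d with AttnMaskType.padding for the encoder and cross-attention masks and AttnMaskType.causal for the decoder mask, so they match what megatron.core's T5Model expects.
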
From 17f295beb207a31c3f4dea40e311ccef3cbc08ff Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Mon, 8 Jul 2024 17:50:37 +0200 Subject: [PATCH 05/13] [Nemo-UX] Expose transformer_layer_spec inside GPTConfig (#9592) * Expose transformer_layer_spec inside GPTConfig * Apply isort and black reformatting Signed-off-by: marcromeyn * Expose layer-specs * Apply isort and black reformatting Signed-off-by: marcromeyn --------- Signed-off-by: marcromeyn Co-authored-by: marcromeyn --- nemo/collections/llm/gpt/model/__init__.py | 4 +++ nemo/collections/llm/gpt/model/base.py | 33 +++++++++++++++++++--- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 1dac811f91ef..4391a41293ee 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -4,6 +4,8 @@ MaskedTokenLossReduction, gpt_data_step, gpt_forward_step, + local_layer_spec, + transformer_engine_layer_spec, ) from nemo.collections.llm.gpt.model.gemma import ( CodeGemmaConfig2B, @@ -56,4 +58,6 @@ "MaskedTokenLossReduction", "gpt_data_step", "gpt_forward_step", + "transformer_engine_layer_spec", + "local_layer_spec", ] diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 28a0eed52a5f..4c1f425d7f99 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -1,10 +1,12 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional +from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional, Union import pytorch_lightning as L import torch import torch.distributed +from megatron.core.models.gpt import gpt_layer_specs from megatron.core.optimizer import OptimizerConfig +from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_config import TransformerConfig from torch import nn @@ -63,6 +65,18 @@ def gpt_forward_step(model, batch) -> torch.Tensor: return model(**forward_args) +def transformer_engine_layer_spec(config: "GPTConfig") -> ModuleSpec: + return gpt_layer_specs.get_gpt_layer_with_transformer_engine_spec( + num_experts=config.num_moe_experts, moe_grouped_gemm=config.moe_grouped_gemm, qk_layernorm=config.qk_layernorm + ) + + +def local_layer_spec(config: "GPTConfig") -> ModuleSpec: + return gpt_layer_specs.get_gpt_layer_local_spec( + num_experts=config.num_moe_experts, moe_grouped_gemm=config.moe_grouped_gemm, qk_layernorm=config.qk_layernorm + ) + + @dataclass class GPTConfig(TransformerConfig, io.IOMixin): # From megatron.core.models.gpt.gpt_model.GPTModel @@ -79,6 +93,7 @@ class GPTConfig(TransformerConfig, io.IOMixin): # TODO: Move this to better places? get_attention_mask_from_fusion: bool = False + transformer_layer_spec: Union[ModuleSpec, Callable[["GPTConfig"], ModuleSpec]] = transformer_engine_layer_spec forward_step_fn: Callable = gpt_forward_step data_step_fn: Callable = gpt_data_step @@ -91,12 +106,15 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel": ) % vp_size == 0, "Make sure the number of model chunks is the same across all pipeline stages." 
from megatron.core import parallel_state - from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel + transformer_layer_spec = self.transformer_layer_spec + if not isinstance(transformer_layer_spec, ModuleSpec): + transformer_layer_spec = transformer_layer_spec(self) + return MCoreGPTModel( self, - transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(self.num_moe_experts), + transformer_layer_spec=transformer_layer_spec, vocab_size=get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by), max_sequence_length=self.seq_length, fp16_lm_cross_entropy=self.fp16_lm_cross_entropy, @@ -225,4 +243,11 @@ def get_packed_seq_params(batch): ) -__all__ = ["GPTModel", "GPTConfig", "gpt_data_step", "gpt_forward_step"] +__all__ = [ + "GPTModel", + "GPTConfig", + "gpt_data_step", + "gpt_forward_step", + "transformer_engine_layer_spec", + "local_layer_spec", +] From a70349316552f1e5ee975fd03010152a17e1982e Mon Sep 17 00:00:00 2001 From: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Date: Mon, 8 Jul 2024 09:20:33 -0700 Subject: [PATCH 06/13] Update NeMo Clip to Use MCore Modules (#9594) * update clip model and config file Signed-off-by: yaoyu-33 * update clip for mcore Signed-off-by: yaoyu-33 * MCore CLIP Fix Signed-off-by: yaoyu-33 * fix no mask Signed-off-by: yaoyu-33 * few neva fixes Signed-off-by: yaoyu-33 * update siglip module Signed-off-by: yaoyu-33 * add siglip loss Signed-off-by: yaoyu-33 * fix Signed-off-by: yaoyu-33 * fix collate fn Signed-off-by: yaoyu-33 * update siglip conversion script Signed-off-by: yaoyu-33 * update siglip convert Signed-off-by: yaoyu-33 * clip fixes Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * clean up script Signed-off-by: yaoyu-33 * clip fixes Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix code styles Signed-off-by: yaoyu-33 * Update siglip_loss.py Signed-off-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: yaoyu-33 --- examples/multimodal/convert_ckpt_to_nemo.py | 8 - .../clip/conf/megatron_clip_VIT-L-14.yaml | 51 +- .../clip/conf/megatron_clip_config.yaml | 3 +- .../clip/conf/megatron_clip_infer.yaml | 2 +- .../conf/megatron_siglip_so400m_14_384.yaml | 251 +++++ .../clip/convert_external_clip_to_nemo.py | 1 + .../clip/megatron_clip_pretrain.py | 7 +- .../multimodal/data/clip/clip_dataset.py | 33 +- .../multimodal/losses/siglip_loss.py | 220 +++++ .../clip/megatron_clip_models.py | 921 +++++++++++++++--- .../language_modeling/megatron_base_model.py | 2 +- nemo/collections/nlp/parts/utils_funcs.py | 15 +- .../convert_clip_hf_to_nemo.py | 248 +++++ .../convert_siglip_hf_to_nemo.py | 380 ++++++++ 14 files changed, 1996 insertions(+), 146 deletions(-) create mode 100644 examples/multimodal/vision_language_foundation/clip/conf/megatron_siglip_so400m_14_384.yaml create mode 100644 nemo/collections/multimodal/losses/siglip_loss.py create mode 100644 scripts/checkpoint_converters/convert_clip_hf_to_nemo.py create mode 100644 scripts/checkpoint_converters/convert_siglip_hf_to_nemo.py diff --git a/examples/multimodal/convert_ckpt_to_nemo.py b/examples/multimodal/convert_ckpt_to_nemo.py index 2bc0f5d7ab62..573bdc0bc040 100644 --- a/examples/multimodal/convert_ckpt_to_nemo.py +++ 
b/examples/multimodal/convert_ckpt_to_nemo.py @@ -165,14 +165,6 @@ def convert(local_rank, rank, world_size, args): model = MegatronControlNet.load_from_checkpoint( checkpoint_path, hparams_file=args.hparams_file, trainer=trainer ) - elif args.model_type == 'kosmos': - model = MegatronKosmosModel.load_from_checkpoint( - checkpoint_path, hparams_file=args.hparams_file, trainer=trainer - ) - elif args.model_type == 'neva': - model = MegatronNevaModel.load_from_checkpoint( - checkpoint_path, hparams_file=args.hparams_file, trainer=trainer - ) else: raise ValueError(f"Unrecognized model_type {args.model_type}.") diff --git a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml index d8740bb98eb2..bfee36b6c099 100644 --- a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml +++ b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml @@ -1,3 +1,50 @@ +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: 32 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. + max_steps: 375000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + check_val_every_n_epoch: null + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_clip + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_clip--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + ema: + enable: False + decay: 0.9999 + validate_original_weights: False + every_n_steps: 1 + cpu_offload: False + model: precision: 32 # specify micro_batch_size, global_batch_size, and model parallelism @@ -19,6 +66,9 @@ model: local_loss: False # calculate loss w/ local features @ global (instead of realizing full global @ global matrix) gather_with_grad: True # enable full distributed gradient for feature gather, set this to False may cause convergence issue + mcore_gpt: False + transformer_engine: False + vision: precision: 32 # vision configs @@ -135,7 +185,6 @@ model: bias_activation_fusion: False megatron_legacy: True - transformer_engine: False fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID diff --git a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_config.yaml 
b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_config.yaml index a6b1928ef13f..f75a163a5ed2 100644 --- a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_config.yaml +++ b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_config.yaml @@ -68,6 +68,8 @@ model: # numerical results as the naïve method. local_loss: False # calculate loss w/ local features @ global (instead of realizing full global @ global matrix) gather_with_grad: True # enable full distributed gradient for feature gather, set this to False may cause convergence issue + mcore_gpt: True + transformer_engine: True vision: precision: ${trainer.precision} @@ -183,7 +185,6 @@ model: bias_activation_fusion: False megatron_legacy: False - transformer_engine: False fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID diff --git a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_infer.yaml b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_infer.yaml index 215cd17841ae..3e127aa6d86a 100755 --- a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_infer.yaml +++ b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_infer.yaml @@ -6,7 +6,7 @@ trainer: num_nodes: 1 accelerator: gpu logger: False # logger provided by exp_manager - precision: 16 # 16, 32, or bf16 + precision: 32 # 16, 32, or bf16 model: restore_from_path: null # Path to a trained ViT .nemo file diff --git a/examples/multimodal/vision_language_foundation/clip/conf/megatron_siglip_so400m_14_384.yaml b/examples/multimodal/vision_language_foundation/clip/conf/megatron_siglip_so400m_14_384.yaml new file mode 100644 index 000000000000..6c5be3a2bcd6 --- /dev/null +++ b/examples/multimodal/vision_language_foundation/clip/conf/megatron_siglip_so400m_14_384.yaml @@ -0,0 +1,251 @@ +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: 32 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
+ max_steps: 375000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + check_val_every_n_epoch: null + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_clip + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_clip--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + ema: + enable: False + decay: 0.9999 + validate_original_weights: False + every_n_steps: 1 + cpu_offload: False + +model: + precision: 32 + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + micro_batch_size: 1 # limited by GPU memory + global_batch_size: 1 # will use more micro batches to reach global batch size + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + virtual_pipeline_model_parallel_size: null # interleaved pipeline + + restore_from_pretrained: null # used in fine-tuning + # multimodal configs + output_dim: 1152 + # As the number of devices used to train increases, so does the space complexity of + # the logit matrix. Using a naïve all-gather scheme, space complexity will be + # `O(n^2)`. Instead, complexity may become effectively linear if the flags + # `--gather-with-grad` and `--local-loss` are used. This alteration results in one-to-one + # numerical results as the naïve method. + + use_siglip: True + mcore_gpt: True + transformer_engine: True + + vision: + precision: 32 + # vision configs + patch_dim: 14 + img_h: 378 + img_w: 378 + image_mean: null + image_std: null + num_channels: 3 + drop_patch_rate: 0.0 + drop_path_rate: 0.0 + global_average_pool: False + output_dim: ${model.output_dim} + class_token_length: 0 + preprocess_layernorm: True # apply layer norm to embedded tokens + + # model architecture + encoder_seq_length: 196 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: learned_absolute + num_layers: 27 + hidden_size: 1152 + ffn_hidden_size: 4304 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 16 + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0. # Dropout probability for hidden state transformer. + attention_dropout: 0. + kv_channels: null # Projection weights dimension in multi-head attention. 
Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: layernorm # Type of normalization layers + layernorm_epsilon: 1e-5 + do_layer_norm_weight_decay: False # True means weight decay on all params + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. + + ## Activation Checkpointing + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + activations_checkpoint_num_layers: null # not used with 'selective' + sequence_parallel: False + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # model fusions + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism. + openai_gelu: True + bias_activation_fusion: False + megatron_legacy: True + activation: approx-gelu + + + + text: + precision: 32 + # text configs + output_dim: ${model.output_dim} + + # model architecture + encoder_seq_length: 64 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: learned_absolute + num_layers: 27 + hidden_size: 1152 + ffn_hidden_size: 4304 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 16 + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0. # Dropout probability for hidden state transformer. + attention_dropout: 0. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: layernorm # Type of normalization layers + layernorm_epsilon: 1e-5 + do_layer_norm_weight_decay: False # True means weight decay on all params + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. 
+ + ## Activation Checkpointing + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + activations_checkpoint_num_layers: null # not used with 'selective' + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: False + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # model fusions + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism. + openai_gelu: True + bias_activation_fusion: False + megatron_legacy: True + + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + activation: approx-gelu + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce + + # miscellaneous + seed: 1234 + resume_from_checkpoint: null # manually set the checkpoint file to load from + apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + tokenizer: + library: 'huggingface' + type: 'google/siglip-so400m-patch14-384' + model: null + vocab_file: null + merge_file: null + delimiter: null # only used for tabular tokenizer + sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + + data: + num_workers: 8 + train: + dataset_path: # List of paths to pkl files or tar files + - /datasets/coyo/test.pkl + validation: # List of paths to pkl files or tar files + dataset_path: + - /datasets/coyo/test.pkl + webdataset: + infinite_sampler: False + local_root_path: /datasets/coyo + + imagenet_val: null # Path to imagenet val set for conducting zero shot evaluation. 
+ + # Nsys profiling options + nsys_profile: + enabled: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [ 0 ] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + + optim: + name: fused_adam + lr: 1e-3 + weight_decay: 0.2 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 2000 + constant_steps: 0 + min_lr: 1e-5 \ No newline at end of file diff --git a/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py b/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py index b9b9ab917173..9af25181d07e 100644 --- a/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py +++ b/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py @@ -283,6 +283,7 @@ def convert(local_rank, rank, world_size, args): if __name__ == '__main__': + logging.warning("This script is going to be deprecated soon. Please use ") args = get_args() local_rank, rank, world_size = initialize_distributed(args) convert(local_rank, rank, world_size, args) diff --git a/examples/multimodal/vision_language_foundation/clip/megatron_clip_pretrain.py b/examples/multimodal/vision_language_foundation/clip/megatron_clip_pretrain.py index 4462649a5861..abca470e5843 100644 --- a/examples/multimodal/vision_language_foundation/clip/megatron_clip_pretrain.py +++ b/examples/multimodal/vision_language_foundation/clip/megatron_clip_pretrain.py @@ -22,8 +22,6 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager -mp.set_start_method("spawn", force=True) - @hydra_runner(config_path="conf", config_name="megatron_clip_config") def main(cfg) -> None: @@ -31,7 +29,10 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') assert ( - cfg.trainer.devices * cfg.trainer.num_nodes + cfg.trainer.devices + * cfg.trainer.num_nodes + // cfg.model.tensor_model_parallel_size + // cfg.model.pipeline_model_parallel_size ) * cfg.model.micro_batch_size == cfg.model.global_batch_size, ( "Gradient accumulation is not supported in CLIP yet." 
) diff --git a/nemo/collections/multimodal/data/clip/clip_dataset.py b/nemo/collections/multimodal/data/clip/clip_dataset.py index 7e263e19dcc9..6b63d546194a 100644 --- a/nemo/collections/multimodal/data/clip/clip_dataset.py +++ b/nemo/collections/multimodal/data/clip/clip_dataset.py @@ -76,11 +76,18 @@ def get_preprocess_fns(model_cfg, tokenizer=None, is_train=True): img_size = (model_cfg.vision.get("img_h"), model_cfg.vision.get("img_w")) img_mean = model_cfg.vision.get("img_mean") img_std = model_cfg.vision.get("img_std") - img_transform = image_transform(img_size, is_train=is_train, mean=img_mean, std=img_std,) + img_transform = image_transform( + img_size, + is_train=is_train, + mean=img_mean, + std=img_std, + ) text_transform = lambda x: x if tokenizer is not None: text_transform = partial( - tokenize, tokenizer=tokenizer, context_length=model_cfg.text.get("max_position_embeddings"), + tokenize, + tokenizer=tokenizer, + context_length=model_cfg.text.get("max_position_embeddings"), ) return img_transform, text_transform @@ -100,7 +107,9 @@ def transform_fn(sample, img_transform, text_transform): def build_train_valid_datasets( - model_cfg, consumed_samples, tokenizer=None, + model_cfg, + consumed_samples, + tokenizer=None, ): data_cfg = model_cfg.data @@ -127,6 +136,13 @@ def build_train_valid_datasets( return train_data, val_data +def custom_collate(batch): + if len(batch) == 0: + return None, None + else: + return default_collate(batch) + + # For zero-shot imagenet validation def build_imagenet_validation_dataloader(model_cfg, tokenizer=None): val_image_transform, text_transform = get_preprocess_fns(model_cfg, tokenizer, is_train=False) @@ -138,7 +154,10 @@ def build_imagenet_validation_dataloader(model_cfg, tokenizer=None): if imagenet_path is None: return None - image_dataset = ImageFolder(root=imagenet_path, transform=val_image_transform,) + image_dataset = ImageFolder( + root=imagenet_path, + transform=val_image_transform, + ) image_batch_sampler = MegatronPretrainingSampler( total_samples=len(image_dataset), @@ -150,12 +169,6 @@ def build_imagenet_validation_dataloader(model_cfg, tokenizer=None): drop_last=False, ) - def custom_collate(batch): - if len(batch) == 0: - return None, None - else: - return default_collate(batch) - imagenet_val["images"] = torch.utils.data.DataLoader( image_dataset, batch_sampler=image_batch_sampler, diff --git a/nemo/collections/multimodal/losses/siglip_loss.py b/nemo/collections/multimodal/losses/siglip_loss.py new file mode 100644 index 000000000000..a7d2ec9b46ce --- /dev/null +++ b/nemo/collections/multimodal/losses/siglip_loss.py @@ -0,0 +1,220 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
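
For concreteness, the batch-size assertion rewritten in megatron_clip_pretrain.py above now divides the world size by tensor and pipeline parallelism before checking against the global batch size. A small worked example with made-up numbers (gradient accumulation remains unsupported in CLIP, so the relation must hold exactly):

    # Hypothetical launch: 2 nodes x 8 GPUs, TP=4, PP=1.
    devices, num_nodes = 8, 2
    tensor_model_parallel_size, pipeline_model_parallel_size = 4, 1
    micro_batch_size = 32

    data_parallel_size = devices * num_nodes // tensor_model_parallel_size // pipeline_model_parallel_size
    global_batch_size = micro_batch_size * data_parallel_size  # 32 * 4 = 128

    # Mirrors the updated assert: (devices * num_nodes // TP // PP) * micro_batch_size == global_batch_size
    assert data_parallel_size * micro_batch_size == global_batch_size
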
+ +# This file contains code artifacts adapted from the original implementation: +# https://github.com/mlfoundations/open_clip/blob/main/src/open_clip/loss.py + +import torch +import torch.nn.functional as F + + +def neighbour_exchange(from_rank, to_rank, tensor, group=None): + tensor_recv = torch.zeros_like(tensor) + send_op = torch.distributed.P2POp( + torch.distributed.isend, + tensor, + to_rank, + group=group, + ) + recv_op = torch.distributed.P2POp( + torch.distributed.irecv, + tensor_recv, + from_rank, + group=group, + ) + reqs = torch.distributed.batch_isend_irecv([send_op, recv_op]) + for req in reqs: + req.wait() + return tensor_recv + + +def neighbour_exchange_bidir(left_rank, right_rank, tensor_to_left, tensor_to_right, group=None): + tensor_from_left = torch.zeros_like(tensor_to_right) + tensor_from_right = torch.zeros_like(tensor_to_left) + send_op_left = torch.distributed.P2POp( + torch.distributed.isend, + tensor_to_left, + left_rank, + group=group, + ) + send_op_right = torch.distributed.P2POp( + torch.distributed.isend, + tensor_to_right, + right_rank, + group=group, + ) + recv_op_left = torch.distributed.P2POp( + torch.distributed.irecv, + tensor_from_left, + left_rank, + group=group, + ) + recv_op_right = torch.distributed.P2POp( + torch.distributed.irecv, + tensor_from_right, + right_rank, + group=group, + ) + reqs = torch.distributed.batch_isend_irecv([send_op_right, send_op_left, recv_op_right, recv_op_left]) + for req in reqs: + req.wait() + return tensor_from_right, tensor_from_left + + +class NeighbourExchange(torch.autograd.Function): + @staticmethod + def forward(ctx, from_rank, to_rank, group, tensor): + ctx.group = group + ctx.from_rank = from_rank + ctx.to_rank = to_rank + return neighbour_exchange(from_rank, to_rank, tensor, group=group) + + @staticmethod + def backward(ctx, grad_output): + return (None, None, None) + (NeighbourExchange.apply(ctx.to_rank, ctx.from_rank, ctx.group, grad_output),) + + +def neighbour_exchange_with_grad(from_rank, to_rank, tensor, group=None): + return NeighbourExchange.apply(from_rank, to_rank, group, tensor) + + +class NeighbourExchangeBidir(torch.autograd.Function): + @staticmethod + def forward(ctx, left_rank, right_rank, group, tensor_to_left, tensor_to_right): + ctx.group = group + ctx.left_rank = left_rank + ctx.right_rank = right_rank + return neighbour_exchange_bidir(left_rank, right_rank, tensor_to_left, tensor_to_right, group=group) + + @staticmethod + def backward(ctx, *grad_outputs): + return (None, None, None) + NeighbourExchangeBidir.apply( + ctx.right_rank, ctx.left_rank, ctx.group, *grad_outputs + ) + + +def neighbour_exchange_bidir_with_grad(left_rank, right_rank, tensor_to_left, tensor_to_right, group=None): + return NeighbourExchangeBidir.apply(left_rank, right_rank, group, tensor_to_left, tensor_to_right) + + +class SigLipLoss(torch.nn.Module): + """Sigmoid Loss for Language Image Pre-Training (SigLIP) - https://arxiv.org/abs/2303.15343 + + @article{zhai2023sigmoid, + title={Sigmoid loss for language image pre-training}, + author={Zhai, Xiaohua and Mustafa, Basil and Kolesnikov, Alexander and Beyer, Lucas}, + journal={arXiv preprint arXiv:2303.15343}, + year={2023} + } + """ + + def __init__( + self, + cache_labels=False, + rank=0, + world_size=1, + group=None, + bidir=True, + ): + super().__init__() + self.cache_labels = cache_labels + self.rank = rank + self.world_size = world_size + self.group = group + self.bidir = bidir + + def get_ground_truth(self, device, dtype, num_logits, negative_only=False) -> 
torch.Tensor: + labels = -torch.ones((num_logits, num_logits), device=device, dtype=dtype) + if not negative_only: + labels = 2 * torch.eye(num_logits, device=device, dtype=dtype) + labels + return labels + + def get_logits(self, image_features, text_features, logit_scale, logit_bias=None): + logits = logit_scale * image_features @ text_features.T + if logit_bias is not None: + logits += logit_bias + return logits + + def _loss(self, image_features, text_features, logit_scale, logit_bias=None, negative_only=False): + logits = self.get_logits(image_features, text_features, logit_scale, logit_bias) + labels = self.get_ground_truth( + image_features.device, + image_features.dtype, + image_features.shape[0], + negative_only=negative_only, + ) + loss = -F.logsigmoid(labels * logits).sum() / image_features.shape[0] + return loss + + def forward( + self, + output_tensor, + ): + image_features, text_features, logit_scale, logit_bias = output_tensor + loss = self._loss(image_features, text_features, logit_scale, logit_bias) + + if self.world_size > 1: + # exchange text features w/ neighbour world_size - 1 times + right_rank = (self.rank + 1) % self.world_size + left_rank = (self.rank - 1 + self.world_size) % self.world_size + if self.bidir: + text_features_to_right = text_features_to_left = text_features + num_bidir, remainder = divmod(self.world_size - 1, 2) + for i in range(num_bidir): + text_features_recv = neighbour_exchange_bidir_with_grad( + left_rank, + right_rank, + text_features_to_left, + text_features_to_right, + group=self.group, + ) + + for f in text_features_recv: + loss += self._loss( + image_features, + f, + logit_scale, + logit_bias, + negative_only=True, + ) + text_features_to_left, text_features_to_right = text_features_recv + + if remainder: + text_features_recv = neighbour_exchange_with_grad( + left_rank, right_rank, text_features_to_right, group=self.group + ) + + loss += self._loss( + image_features, + text_features_recv, + logit_scale, + logit_bias, + negative_only=True, + ) + else: + text_features_to_right = text_features + for i in range(self.world_size - 1): + text_features_from_left = neighbour_exchange_with_grad( + left_rank, right_rank, text_features_to_right, group=self.group + ) + + loss += self._loss( + image_features, + text_features_from_left, + logit_scale, + logit_bias, + negative_only=True, + ) + text_features_to_right = text_features_from_left + return loss, {"loss": loss} diff --git a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py index 7be7407b98ae..a83960307672 100644 --- a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py +++ b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py @@ -13,12 +13,17 @@ # limitations under the License. 
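
To make the objective implemented in SigLipLoss above easier to follow, here is a minimal single-process sketch of the pairwise sigmoid loss; the function and tensor names are illustrative, the distributed neighbour-exchange path is deliberately omitted, and features are assumed to be L2-normalized as in the loss class:

    import torch
    import torch.nn.functional as F

    def siglip_loss_reference(image_features, text_features, logit_scale, logit_bias):
        # image_features, text_features: [N, D], already L2-normalized.
        logits = logit_scale * image_features @ text_features.T + logit_bias
        # +1 on matched image/text pairs (the diagonal), -1 everywhere else.
        labels = 2 * torch.eye(logits.shape[0], device=logits.device, dtype=logits.dtype) - 1
        return -F.logsigmoid(labels * logits).sum() / logits.shape[0]

    img = F.normalize(torch.randn(8, 64), dim=-1)
    txt = F.normalize(torch.randn(8, 64), dim=-1)
    # The scale/bias values roughly match the exp(log(10)) and -10 initializations used later in this patch.
    loss = siglip_loss_reference(img, txt, torch.tensor(10.0), torch.tensor(-10.0))

In the multi-rank version above, each rank additionally accumulates negative-only terms against text features exchanged with neighbouring data-parallel ranks, which is what the neighbour_exchange helpers implement.
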
import itertools -from functools import partial +import os +import warnings +from contextlib import nullcontext +from dataclasses import fields +from functools import cache, partial from typing import Any, Optional import numpy as np import torch import torch.nn.functional as F +from omegaconf import OmegaConf from omegaconf.dictconfig import DictConfig from pytorch_lightning.accelerators import CPUAccelerator from pytorch_lightning.trainer.trainer import Trainer @@ -29,7 +34,9 @@ build_train_valid_datasets, ) from nemo.collections.multimodal.losses.clip_loss import ClipLoss +from nemo.collections.multimodal.losses.siglip_loss import SigLipLoss from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import get_specs, mcore_supports_moe from nemo.collections.nlp.modules.common.megatron.build_model import build_model from nemo.collections.nlp.modules.common.megatron.language_model import get_language_model from nemo.collections.nlp.modules.common.megatron.module import Float16Module, MegatronModule @@ -40,7 +47,7 @@ init_method_normal, scaled_init_method_normal, ) -from nemo.collections.nlp.parts.utils_funcs import get_last_rank, torch_dtype_from_precision +from nemo.collections.nlp.parts.utils_funcs import activation_to_func, get_last_rank from nemo.collections.vision.modules.vit.vit_backbone import VitBackbone from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging @@ -55,7 +62,33 @@ try: from megatron.core import parallel_state + from megatron.core.distributed import DistributedDataParallel as McoreDDP + from megatron.core.distributed import DistributedDataParallelConfig + from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add + from megatron.core.models.gpt import GPTModel as MCoreGPTModel + from megatron.core.models.vision.clip_vit_model import CLIPViTModel from megatron.core.pipeline_parallel.schedules import get_forward_backward_func + from megatron.core.transformer.attention import CrossAttention, CrossAttentionSubmodules + from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TENorm, + TERowParallelLinear, + ) + from megatron.core.transformer.enums import AttnMaskType as MCoreAttnMaskType + from megatron.core.transformer.identity_op import IdentityOp + from megatron.core.transformer.mlp import MLP, MLPSubmodules + from megatron.core.transformer.module import Float16Module as MCoreFloat16Module + from megatron.core.transformer.spec_utils import ModuleSpec + from megatron.core.transformer.transformer_config import TransformerConfig + from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + from megatron.core.utils import ( + drain_embedding_wgrad_compute, + get_model_config, + init_method_normal, + scaled_init_method_normal, + ) HAVE_MEGATRON_CORE = True @@ -63,6 +96,28 @@ HAVE_MEGATRON_CORE = False +try: + import transformer_engine + from transformer_engine.pytorch import module as te_module + + HAVE_TE = True + +except (ImportError, ModuleNotFoundError): + HAVE_TE = False + + +@cache +def mcore_supports_moe() -> bool: + global HAVE_MEGATRON_CORE + if not HAVE_MEGATRON_CORE: + return False + try: + from megatron.core.transformer.moe.router import TopKRouter + + return True + except ImportError: + return False + class CLIPVisionTransformer(MegatronModule): """Vision 
Transformer Model.""" @@ -100,7 +155,11 @@ def __init__(self, model_cfg, model_parallel_config, pre_process=True, post_proc if self.post_process and not skip_head: self.output_dim = model_cfg.output_dim - self.head = torch.nn.Linear(self.hidden_size, self.output_dim, bias=False,) + self.head = torch.nn.Linear( + self.hidden_size, + self.output_dim, + bias=False, + ) def set_input_tensor(self, input_tensor): """See megatron.model.transformer.set_input_tensor()""" @@ -129,7 +188,6 @@ def __init__(self, model_cfg, model_parallel_config, padded_vocab_size, pre_proc self.pre_process = pre_process self.post_process = post_process self.fp16_lm_cross_entropy = model_cfg.fp16_lm_cross_entropy - self.sequence_parallel = model_cfg.sequence_parallel self.gradient_accumulation_fusion = model_cfg.gradient_accumulation_fusion scaled_init_method = ( @@ -173,7 +231,7 @@ def __init__(self, model_cfg, model_parallel_config, padded_vocab_size, pre_proc openai_gelu=model_cfg.openai_gelu, onnx_safe=model_cfg.onnx_safe, megatron_legacy=model_cfg.megatron_legacy, - transformer_engine=model_cfg.transformer_engine, + transformer_engine=False, fp8=model_cfg.fp8, fp8_e4m3=model_cfg.fp8_e4m3, fp8_hybrid=model_cfg.fp8_hybrid, @@ -193,14 +251,17 @@ def __init__(self, model_cfg, model_parallel_config, padded_vocab_size, pre_proc hidden_size=model_cfg.hidden_size, ) - # TODO (yuya): check this position id self.position_ids = None if self.pre_process: self.position_ids = torch.arange(model_cfg.max_position_embeddings).expand(1, -1).cuda() if self.post_process: self.output_dim = model_cfg.output_dim - self.head = torch.nn.Linear(model_cfg.hidden_size, self.output_dim, bias=False,) + self.head = torch.nn.Linear( + model_cfg.hidden_size, + self.output_dim, + bias=False, + ) self.attn_mask = self.build_attention_mask(model_cfg.max_position_embeddings) @@ -217,7 +278,8 @@ def build_attention_mask(self, max_position_embeddings): return mask def forward( - self, input_ids, + self, + input_ids, ): # input_ids: [b, s] # position_ids: [b, s] @@ -245,27 +307,263 @@ def forward( return hidden_states +class SiglipMHAPoolingHead(TransformerLayer): + """Multihead Attention Pooling.""" + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + ): + super().__init__(config, submodules) + + self.probe = torch.nn.Parameter(torch.randn(1, 1, config.hidden_size)) + + def forward(self, hidden_state): + batch_size = hidden_state.shape[0] + # [s, b, h] + probe = self.probe.repeat(1, batch_size, 1) + hidden_state = hidden_state.transpose(0, 1) + hidden_state, context = super().forward( + probe, + attention_mask=None, + context=hidden_state, + ) + + return hidden_state[0] + + +class MCoreSiglipViTModel(CLIPViTModel): + def __init__(self, *args, **kwargs): + # TODO (yuya): need to handle post_process correctly in order to enable PP + self.output_dim = kwargs.pop('output_dim') + kwargs['ln_pre_impl'] = IdentityOp + super().__init__(*args, **kwargs) + assert self.output_dim == self.config.hidden_size, "Siglip output_dim needs to be the same as hidden_size." 
+ self.conv1 = torch.nn.Conv2d( + in_channels=3, + out_channels=self.visual_hidden_size, + kernel_size=self.patch_dim, + stride=self.patch_dim, + bias=True, + ) + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + + self.head = SiglipMHAPoolingHead( + self.config, + submodules=TransformerLayerSubmodules( + cross_attention=ModuleSpec( + module=CrossAttention, + params={"attn_mask_type": MCoreAttnMaskType.no_mask}, + submodules=CrossAttentionSubmodules( + linear_q=TEColumnParallelLinear, + linear_kv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + cross_attn_bda=get_bias_dropout_add, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + mlp_bda=get_bias_dropout_add, + ), + ) + + def forward(self, x): + x = super().forward( + x, + ) + x = self.final_layernorm(x) + x = self.head(x) + return x + + +class MCoreSiglipTextModel(MCoreGPTModel): + def __init__(self, *args, **kwargs): + # TODO (yuya): need to handle post_process correctly in order to enable PP + self.output_dim = kwargs.pop('output_dim') + kwargs['transformer_layer_spec'].submodules.self_attention.params['attn_mask_type'] = MCoreAttnMaskType.no_mask + + super().__init__(*args, **kwargs) + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + self.head = torch.nn.Linear( + self.config.hidden_size, + self.output_dim, + bias=True, + ) + + self.position_ids = None + if self.pre_process: + self.position_ids = torch.arange(kwargs['max_sequence_length']).expand(1, -1).cuda() + + def forward(self, input_ids): + + x = super().forward(input_ids, position_ids=self.position_ids, attention_mask=None) + x = self.final_layernorm(x) + x = x[-1] + x = self.head(x) + return x + + +class MCoreCLIPViTModel(CLIPViTModel): + def __init__(self, *args, **kwargs): + # TODO (yuya): need to handle post_process correctly in order to enable PP + self.output_dim = kwargs.pop('output_dim') + super().__init__(*args, **kwargs) + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + self.head = torch.nn.Linear( + self.config.hidden_size, + self.output_dim, + bias=False, + ) + + def forward(self, x): + x = super().forward( + x, + ) + x = self.final_layernorm(x) + x = x[:, 0] + x = self.head(x) + return x + + +class MCoreCLIPTextModel(MCoreGPTModel): + def __init__(self, *args, **kwargs): + # TODO (yuya): need to handle post_process correctly in order to enable PP + self.output_dim = kwargs.pop('output_dim') + + super().__init__(*args, **kwargs) + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + self.head = torch.nn.Linear( + self.config.hidden_size, + self.output_dim, + bias=False, + ) + self.position_ids = None + if self.pre_process: + self.position_ids = torch.arange(kwargs['max_sequence_length']).expand(1, -1).cuda() + + def forward(self, input_ids): + x = super().forward(input_ids, position_ids=self.position_ids, attention_mask=None) + x = self.final_layernorm(x) + x = x[input_ids.argmax(dim=-1), torch.arange(x.shape[1])] + x = self.head(x) + return x + + class CLIPModel(MegatronModule): """CLIP Model""" - def __init__(self, model_cfg, model_parallel_config, padded_vocab_size, 
pre_process=True, post_process=True): + def __init__( + self, + model_cfg, + model_parallel_config, + vision_transformer_config, + text_transformer_config, + padded_vocab_size, + pre_process=True, + post_process=True, + ): super(CLIPModel, self).__init__() self.config = model_parallel_config + self.use_siglip = model_cfg.get("use_siglip", False) self.pre_process = pre_process self.post_process = post_process - self.vision_encoder = CLIPVisionTransformer( - model_cfg.vision, model_parallel_config, pre_process=self.pre_process, post_process=self.post_process, - ) - self.text_encoder = CLIPTextTransformer( - model_cfg.text, - model_parallel_config, - padded_vocab_size, - pre_process=self.pre_process, - post_process=self.post_process, - ) + self.output_dim = model_cfg.output_dim + self.get_attention_mask_from_fusion = model_cfg.get('get_attention_mask_from_fusion', True) - self.logit_scale = torch.nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + if model_cfg.get("mcore_gpt", False): + if model_cfg.vision.get("class_token_length") is None or model_cfg.vision.get("class_token_length") <= 0: + add_class_token = False + else: + add_class_token = True + vision_layer_spec = get_specs( + model_cfg.text.get('name', ''), + vision_transformer_config.num_moe_experts, + vision_transformer_config.moe_grouped_gemm, + model_cfg.get('transformer_engine', True), + ) + vision_layer_spec.submodules.self_attention.params['attn_mask_type'] = MCoreAttnMaskType.no_mask + + if model_cfg.get("use_siglip", False): + vision_module = MCoreSiglipViTModel + text_module = MCoreSiglipTextModel + else: + vision_module = MCoreCLIPViTModel + text_module = MCoreCLIPTextModel + self.vision_encoder = vision_module( + transformer_config=vision_transformer_config, + transformer_layer_spec=vision_layer_spec, + patch_dim=model_cfg.vision.get('patch_dim', 16), + img_h=model_cfg.vision.get('img_h', 224), + img_w=model_cfg.vision.get('img_w', 224), + add_class_token=add_class_token, + class_token_len=model_cfg.vision.get('class_token_length'), + output_dim=model_cfg.output_dim, + ) + self.text_encoder = text_module( + config=text_transformer_config, + transformer_layer_spec=get_specs( + model_cfg.text.get('name', ''), + text_transformer_config.num_moe_experts, + text_transformer_config.moe_grouped_gemm, + model_cfg.get('transformer_engine', True), + ), + vocab_size=model_cfg.text.get('override_vocab_size', padded_vocab_size), + max_sequence_length=model_cfg.text.get('encoder_seq_length', 512), + pre_process=pre_process, + post_process=False, + parallel_output=True, + share_embeddings_and_output_weights=False, + position_embedding_type=model_cfg.text.get('position_embedding_type', 'learned_absolute'), + rotary_percent=model_cfg.text.get('rotary_percentage', 1.0), + seq_len_interpolation_factor=model_cfg.text.get('seq_len_interpolation_factor', None), + rotary_base=model_cfg.text.get('rotary_base', 10000), + output_dim=model_cfg.output_dim, + ) + + else: + self.vision_encoder = CLIPVisionTransformer( + model_cfg.vision, + model_parallel_config, + pre_process=self.pre_process, + post_process=self.post_process, + ) + self.text_encoder = CLIPTextTransformer( + model_cfg.text, + model_parallel_config, + padded_vocab_size, + pre_process=self.pre_process, + post_process=self.post_process, + ) + + if self.use_siglip: + self.logit_scale = torch.nn.Parameter(torch.ones([]) * np.log(10)) + self.logit_bias = torch.nn.Parameter(torch.ones([]) * (-10)) + else: + self.logit_scale = torch.nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) def 
set_input_tensor(self, input_tensor): """See megatron.model.transformer.set_input_tensor()""" @@ -277,10 +575,89 @@ def forward(self, images, captions): text_features = self.text_encoder(captions) if self.post_process: + if self.use_siglip: + return ( + F.normalize(image_features, dim=-1), + F.normalize(text_features, dim=-1), + self.logit_scale.exp(), + self.logit_bias, + ) return F.normalize(image_features, dim=-1), F.normalize(text_features, dim=-1), self.logit_scale.exp() return image_features, text_features + def build_transformer_config(self) -> TransformerConfig: + """Builds the megatron core gpt transformer config for the model. + For attributes in the nemo model config that are the same + as the megatron core TransformerConfig, we will use the value from the nemo model config. + For attributes in TransformerConfig that are not in the nemo model config, we add custom logic. + """ + + normalization = self.cfg.get('normalization', 'layernorm').lower() + layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' + if normalization == 'layernorm': + normalization = 'LayerNorm' + elif normalization == 'rmsnorm': + normalization = 'RMSNorm' + elif normalization == 'layernorm1p': + normalization = 'LayerNorm' + layernorm_zero_centered_gamma = True + else: + logging.warning( + f"The normalization type: {normalization} might not be supported in megatron core." + f"Supported types are LayerNorm and RMSNorm." + ) + + ub_tp_comm_overlap = self.cfg.get('ub_tp_comm_overlap', False) + + if not self.cfg.get('fp8', False): + fp8 = None + elif self.cfg.get('fp8_e4m3', False): + fp8 = 'e4m3' + elif self.cfg.get('fp8_hybrid', False): + fp8 = 'hybrid' + else: + raise ValueError(f"fp8 enabled but fp8_format (fp8_e4m3 | fp8_hybrid) is not set.") + + # any configs that are not in the nemo model config will be added here + model_specific_configs = { + 'layernorm_zero_centered_gamma': layernorm_zero_centered_gamma, + 'normalization': normalization, + 'fp8': fp8, + 'tp_comm_overlap': ub_tp_comm_overlap, + # MoE related + 'num_moe_experts': self.cfg.get('num_moe_experts', None), + 'moe_router_load_balancing_type': self.cfg.get('moe_router_load_balancing_type', 'aux_loss'), + 'moe_router_topk': self.cfg.get('moe_router_topk', 2), + 'moe_grouped_gemm': self.cfg.get('moe_grouped_gemm', False), + 'moe_aux_loss_coeff': self.cfg.get( + 'moe_aux_loss_coeff', 0 + ), # 1e-2 would be a good start value for load balance loss. + 'moe_z_loss_coeff': self.cfg.get('moe_z_loss_coeff', None), # 1e-3 would be a good start value for z-loss + 'moe_input_jitter_eps': self.cfg.get('moe_input_jitter_eps', None), + 'moe_token_dropping': self.cfg.get('moe_token_dropping', False), # TODO: Support token dropping. 
+ } + if model_specific_configs['num_moe_experts'] is not None: + assert mcore_supports_moe(), 'Megatron-core >= v0.5.0 is required for MoE' + elif not mcore_supports_moe(): + if 'num_moe_experts' in model_specific_configs: + del model_specific_configs['num_moe_experts'] + moe_keys = list(filter(lambda x: x.startswith('moe_'), model_specific_configs.keys())) + for k in moe_keys: + del model_specific_configs[k] + + transformer_config = super().build_transformer_config() + + for key, value in model_specific_configs.items(): + setattr(transformer_config, key, value) + + # pass mcore customization configs directly to mcore + mcore_customization_config_dict = self.cfg.get('mcore_customization_config', {}) + for key, value in mcore_customization_config_dict.items(): + setattr(transformer_config, key, value) + + return transformer_config + class MegatronCLIPModel(MegatronBaseModel): """Megatron CLIP Model.""" @@ -302,11 +679,21 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self._validate_trainer() + # placeholder for O2 wrapper + self.transformer_config = self.build_transformer_config(self.cfg.text) + self.megatron_amp_O2 = cfg.get('megatron_amp_O2', False) + self.mcore_gpt = cfg.get('mcore_gpt', False) + if cfg.get('fp8', False): + self.prev_step_training = True if not self.megatron_amp_O2 and self.cfg.get('virtual_pipeline_model_parallel_size', None): raise ValueError('Virtual pipeline model parallel is only supported when using megatron_amp_O2') + self.transformer_engine = cfg.get('transformer_engine', False) + if self.megatron_amp_O2 and not self.transformer_engine: + logging.warning('megatron_amp_O2 is enabled but transformer-engine is not.') + # build_model returns a list of modules which are used for interleaved pipeline parallelism if isinstance(self.trainer.accelerator, CPUAccelerator): self.model = build_model( @@ -316,19 +703,24 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): virtual_pipeline_model_parallel_size=self.cfg.get('virtual_pipeline_model_parallel_size', None), ) else: - self.model = build_model( - model_provider_func=self.model_provider_func, - wrap_with_ddp=False, - virtual_pipeline_model_parallel_size=self.cfg.get('virtual_pipeline_model_parallel_size', None), - ) + build_model_context = nullcontext + if HAVE_TE and self.cfg.get('fp8', False) and self.cfg.get('fp8_params', False): + build_model_context = transformer_engine.pytorch.fp8_model_init + with build_model_context(): + self.model = build_model( + model_provider_func=self.model_provider_func, + wrap_with_ddp=False, + virtual_pipeline_model_parallel_size=self.cfg.get('virtual_pipeline_model_parallel_size', None), + on_cpu=cfg.get('fsdp', False) and cfg.get('use_cpu_initialization', False), + ) # if we're not using interleaved, then self.model is a module. 
- if self.cfg.get('virtual_pipeline_model_parallel_size', None) is None: + if self.cfg.get('virtual_pipeline_model_parallel_size', None) is None and (not self.use_mcore_dist_optim): self.model = self.model[0] if self.megatron_amp_O2: - if not self.with_distributed_adam: + if not self.with_distributed_adam and not self.cfg.get("use_cpu_initialization", False): # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type if isinstance(self.model, list): for module in self.model: @@ -336,31 +728,17 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): else: self.model.cuda(torch.cuda.current_device()) - # Model wrapper to convert both model and inputs to half precision - # TODO (yuya): check this; FP16 Module might not work; when self.model is a list? - if isinstance(self.model, list): - converted_model = [] - for module in self.model: - converted_model.append( - Float16Module(config=self.model_parallel_config, module=module, precision=cfg.precision) - ) - self.model = converted_model - else: - self.model = Float16Module( - config=self.model_parallel_config, module=self.model, precision=cfg.precision - ) + self._wrap_model_for_O2() - self.autocast_dtype = torch_dtype_from_precision(self.trainer.precision) self.enable_autocast = ( True if (not self.megatron_amp_O2) and (self.autocast_dtype in [torch.float16, torch.bfloat16]) else False ) - self.transformer_engine = cfg.get('transformer_engine', False) - # Convert the global-batch-based profile index to micro-batch index if hasattr(self, '_nsys_profile_enabled') or hasattr(self, '_memory_profile_enabled'): mp_size = cfg.get('tensor_model_parallel_size', 1) * cfg.get('pipeline_model_parallel_size', 1) - data_parallel_world_size = trainer.world_size // mp_size + cp_size = cfg.get('context_parallel_size', 1) + data_parallel_world_size = trainer.world_size // (mp_size * cp_size) grad_accum_steps = cfg.get('global_batch_size') // (cfg.get('micro_batch_size') * data_parallel_world_size) if hasattr(self, '_nsys_profile_enabled'): self._nsys_profile_start_step *= grad_accum_steps @@ -368,22 +746,36 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): if hasattr(self, '_memory_profile_enabled'): self._memory_profile_start_step *= grad_accum_steps self._memory_profile_end_step *= grad_accum_steps - self.get_attention_mask_from_fusion = self.cfg.get('get_attention_mask_from_fusion', True) - self.initialize_ub = self.cfg.get('ub_tp_comm_overlap', False) - def get_module_list(self): - if isinstance(self.model, list): - return [model.module if isinstance(model, Float16Module) else model for model in self.model] - elif isinstance(self.model, Float16Module): - return [self.model.module] - else: - return [self.model] + self.initialize_ub = self.cfg.get('ub_tp_comm_overlap', False) + self.log_train_loss = bool(int(os.getenv("NEMO_LOG_TRAIN_LOSS", 1))) + self.log_memory_usage = bool(int(os.getenv("NEMO_LOG_MEMORY_USAGE", 0))) + self.loss_broadcast_src_rank = None + data_cfg = cfg.get('data', {}) + self.return_output_tensors = data_cfg.get('return_output_tensors', False) + self.validation_drop_last = data_cfg.get('validation_drop_last', True) + self.sample_weight = data_cfg.get('sample_weight', 'token') + self.validation_param_sync_overlap = self.cfg.get('validation_param_sync_overlap', False) def model_provider_func(self, pre_process, post_process): """Model depends on pipeline paralellism.""" + vision_transformer_config = self.build_transformer_config(self.cfg.vision) if self.mcore_gpt else None + 
text_transformer_config = self.build_transformer_config(self.cfg.text) if self.mcore_gpt else None + + if self.mcore_gpt and not parallel_state.is_initialized(): + + def dummy(): + return + + if self.trainer.strategy.launcher is not None: + self.trainer.strategy.launcher.launch(dummy, trainer=self.trainer) + self.trainer.strategy.setup_environment() + model = CLIPModel( model_cfg=self.cfg, model_parallel_config=self.model_parallel_config, + vision_transformer_config=vision_transformer_config, + text_transformer_config=text_transformer_config, padded_vocab_size=self.padded_vocab_size, pre_process=pre_process, post_process=post_process, @@ -401,9 +793,40 @@ def setup_optimizer_param_groups(self): else: self._optimizer_param_groups = get_params_for_weight_decay_optimization(self.model) + def setup_mcore_distributed_parallel(self): + """Set up mcore distributed data parallel""" + if self.with_distributed_adam and self.use_mcore_dist_optim: + config = get_model_config(self.model[0]) + ddp_config = DistributedDataParallelConfig( + grad_reduce_in_fp32=(self.cfg.optim.get('grad_sync_dtype', 'fp32') == 'fp32'), + overlap_grad_reduce=self.cfg.optim.get('overlap_grad_sync', False), + use_distributed_optimizer=True, + check_for_nan_in_grad=self.cfg.optim.get('check_for_nan_in_grad', False), + # mcore bucket_size is based on num of parameters, therefore not + # using bucket_cap_mb to configure bucket_size here + bucket_size=self.cfg.optim.get('ddp_bucket_size', None), + ) + + self.model = [ + McoreDDP( + config, + ddp_config, + model_chunk, + data_parallel_group=parallel_state.get_data_parallel_group(with_context_parallel=True), + expert_data_parallel_group=parallel_state.get_data_modulo_expert_parallel_group(), + # Turn off bucketing for model_chunk 2 onwards, since communication for these + # model chunks is overlapped with compute anyway. + disable_bucketing=(model_chunk_idx > 0), + ) + for (model_chunk_idx, model_chunk) in enumerate(self.model) + ] + + # (TODO) Broadcast params from data parallel src rank to other data parallel ranks. + # by calling model_module.broadcast_params() if the model is randomly initialized. 
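
For readers less familiar with Megatron-Core, the McoreDDP wrapping above plays roughly the same role as wrapping each model chunk in PyTorch's own DistributedDataParallel against an explicit data-parallel process group. The sketch below is an orientation-only analogue (names are illustrative and this is not what NeMo actually calls), assuming torch.distributed has already been initialized:

    import torch
    from torch.nn.parallel import DistributedDataParallel as DDP

    def wrap_chunks_plain_ddp(model_chunks, data_parallel_group):
        # Roughly analogous to the McoreDDP loop above: one wrapper per model chunk,
        # reducing gradients only over the data-parallel group.
        return [
            DDP(chunk, process_group=data_parallel_group, gradient_as_bucket_view=True)
            for chunk in model_chunks
        ]

The real path additionally threads a DistributedDataParallelConfig through, so that grad-reduce dtype, overlap, and bucket sizing come from the optim section of the model config as shown above.
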
+ def configure_optimizers(self): - if self.with_distributed_adam: + if self.with_distributed_adam and not self.use_mcore_dist_optim: # Disable overlapped grad sync for layer norm grads when # sequence parallelism is enabled @@ -462,13 +885,16 @@ def fwd_bwd_step(self, dataloader_iter, forward_only): no_sync_func = None grad_sync_func = None param_sync_func = None - if not forward_only and self.with_distributed_adam: - no_sync_func = partial(self._optimizer.no_sync, greedy_grad_copy=self.megatron_amp_O2,) + if not forward_only and self.with_distributed_adam and not self.use_mcore_dist_optim: + no_sync_func = partial( + self._optimizer.no_sync, + greedy_grad_copy=self.megatron_amp_O2, + ) grad_sync_func = self.reduce_overlap_gradients param_sync_func = self.sync_overlap_parameters # pipeline schedules will get these from self.model.config - for module in self.get_module_list(): + for module in self.get_model_module_list(): module.config.no_sync_func = no_sync_func module.config.grad_sync_func = grad_sync_func module.config.param_sync_func = param_sync_func @@ -515,7 +941,9 @@ def initialize_ub_func(self): ) input_shape = [ - self.cfg.get('encoder_seq_length') * self.cfg.get('micro_batch_size'), + self.cfg.get('encoder_seq_length') + * self.cfg.get('micro_batch_size') + // self.cfg.get('context_parallel_size', 1), self.cfg.get('hidden_size'), ] @@ -529,12 +957,12 @@ def initialize_ub_func(self): def training_step(self, dataloader_iter): """ - Our dataloaders produce a micro-batch and then we fetch - a number of microbatches depending on the global batch size and model parallel size - from the dataloader to produce a list of microbatches. - Batch should be a list of microbatches and those microbatches should on CPU. - Microbatches are then moved to GPU during the pipeline. - The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. + Our dataloaders produce a micro-batch and then we fetch + a number of microbatches depending on the global batch size and model parallel size + from the dataloader to produce a list of microbatches. + Batch should be a list of microbatches and those microbatches should on CPU. + Microbatches are then moved to GPU during the pipeline. + The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. """ # Initialize userbuffer communicators. if self.initialize_ub: @@ -543,7 +971,7 @@ def training_step(self, dataloader_iter): # we zero grads here because we also call backward in the megatron-core fwd/bwd functions self._optimizer.zero_grad() - if self.with_distributed_adam: + if self.with_distributed_adam and not self.use_mcore_dist_optim: # hack to enable overlapping param sync and forward compute # note: the distributed optimizer monkey-patches each # parameter's __getattribute__ function so that it can @@ -554,9 +982,10 @@ def training_step(self, dataloader_iter): # manually interact with the parameter. 
modules = self.model if isinstance(self.model, list) else [self.model] for module in modules: - if isinstance(module, Float16Module): + if isinstance(module, (Float16Module, MCoreFloat16Module)): module = module.module - module = module.text_encoder.language_model + if not self.mcore_gpt: + module = module.language_model if hasattr(module, 'embedding'): for param in module.embedding.parameters(): param.data_ptr() @@ -567,38 +996,115 @@ def training_step(self, dataloader_iter): if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): self.allreduce_sequence_parallel_gradients() - if self.with_distributed_adam: - # synchronize asynchronous grad reductions - # note: not necessary, but reduces performance degradation - # from multiple simultaneous NCCL calls - self._optimizer._finish_bucket_grad_sync() + if self.cfg.get('fp8', False): + self.prev_step_training = self.training + + # Optimization: Defer the embedding GEMM Wgrads of the last PP stage to pipeline flush waiting time + if self.cfg.get('pipeline_model_parallel_size', 1) > 1 and parallel_state.is_pipeline_last_stage( + ignore_virtual=True + ): + if ( + self.cfg.get('defer_embedding_wgrad_compute', False) and self.mcore_gpt + ): # Silently ignore the optimization if MCORE is not used + module_list = self.get_model_module_list() + if len(module_list) > 1: + embedding_module = module_list[-1] + else: + embedding_module = module_list[0] + + embedding_activation_buffer = embedding_module.embedding_activation_buffer + grad_output_buffer = embedding_module.grad_output_buffer + weight = embedding_module.output_layer.weight + + drain_embedding_wgrad_compute( + embedding_module.config, embedding_activation_buffer, grad_output_buffer, weight + ) + + # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced + if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): + self.megatron_timer_start('allreduce_sequence_parallel_gradients', log_level=1) + self.allreduce_sequence_parallel_gradients() + self.megatron_timer_stop('allreduce_sequence_parallel_gradients') + + self.megatron_timer_start('gradient_allreduce', log_level=1) + if self.use_fsdp: + # Reduce the gradients omitted from FSDP-sharding + self.allreduce_fsdp_sharding_omitted_gradients() + elif self.with_distributed_adam: + if not self.use_mcore_dist_optim: + # synchronize asynchronous grad reductions + # note: not necessary, but reduces performance degradation + # from multiple simultaneous NCCL calls + self._optimizer._finish_bucket_grad_sync() + # else: Mcore distributed optim calls finalize_model_grads to finish grad sync elif self.megatron_amp_O2: # when using pipeline parallelism grads must be all-reduced after the pipeline (not asynchronously) - # if self.cfg.get('pipeline_model_parallel_size', 1) > 1 or self.cfg.get('sequence_parallel', False): - # # main grads are stored in the MainParamsOptimizer wrapper - self._optimizer.allreduce_main_grads() + if ( + self.cfg.get('pipeline_model_parallel_size', 1) > 1 + or self.cfg.get('sequence_parallel', False) + or not self.cfg.get('async_grad_allreduce', True) + ): + # main grads are stored in the MainParamsOptimizer wrapper + self._optimizer.allreduce_main_grads() else: # async grad allreduce is not currently implemented for O1/autocasting mixed precision training # so we all-reduce gradients after the pipeline self.allreduce_gradients() # @sangkug we think this is causing memory to blow up (hurts perf) + 
self.megatron_timer_stop('gradient_allreduce') + + if ( + not self.use_mcore_dist_optim + and self.cfg.get('pipeline_model_parallel_size', 1) > 1 + and self.cfg.get('share_embeddings_and_output_weights', True) + ): + self.megatron_timer_start('allreduce_first_last_embeddings', log_level=1) + # when using pipeline parallelism the first and last stage must keep embeddings in sync + self.allreduce_first_last_embeddings() + self.megatron_timer_stop('allreduce_first_last_embeddings') + + if self.log_memory_usage: + mem_reserved = torch.cuda.max_memory_reserved() + self.log( + 'peak_memory_usage', + mem_reserved, + prog_bar=True, + rank_zero_only=True, + batch_size=1, + ) ## logging - # we can only log on one rank if it is rank zero so we broadcast from last rank - # we can avoid this broadcast by updating the PTL log function to accept specific ranks - torch.distributed.broadcast(loss_mean, get_last_rank()) - - if self.cfg.precision in [16, '16', '16-mixed']: - loss_scale = self.trainer.precision_plugin.scaler._scale - if loss_scale is not None: - self.log('loss_scale', loss_scale, batch_size=1) + if self.log_train_loss: + # When using pipeline parallelism, loss is calculated only in the last pipeline stage and + # it should be casted to other pipeline stages for logging. + # we can avoid this broadcast by updating the PTL log function to accept specific ranks + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + if torch.distributed.get_rank() == get_last_rank(): + torch.distributed.send(loss_mean, 0) + elif torch.distributed.get_rank() == 0: + torch.distributed.recv(loss_mean, get_last_rank()) + self.log('reduced_train_loss', loss_mean, prog_bar=True, rank_zero_only=True, batch_size=1) + + # (@adithyare) we need to check for the _scaler attribute to enable pp>1 for adapter training + if self.cfg.precision == 16 and hasattr(self.trainer.precision_plugin.scaler, "_scale"): + loss_scale = self.trainer.precision_plugin.scaler._scale + if loss_scale is not None: + self.log('loss_scale', loss_scale, batch_size=1) - self.log('reduced_train_loss', loss_mean, prog_bar=True, rank_zero_only=True, batch_size=1) lr = self._optimizer.param_groups[0]['lr'] self.log('lr', lr, rank_zero_only=True, batch_size=1) - self.log('global_step', self.trainer.global_step + 1, prog_bar=True, rank_zero_only=True, batch_size=1) + self.log( + 'global_step', + self.trainer.global_step + 1, + prog_bar=True, + rank_zero_only=True, + batch_size=1, + ) + + consumed_samples = self._compute_consumed_samples_after_training_step() + # TODO: make sure compute_consumed_samples works for pipeline parallelism self.log( 'consumed_samples', - self.compute_consumed_samples(self.trainer.global_step + 1 - self.init_global_step), + consumed_samples, prog_bar=True, rank_zero_only=True, batch_size=1, @@ -607,20 +1113,20 @@ def training_step(self, dataloader_iter): return loss_mean def backward(self, *args, **kwargs): - """ LightningModule hook to do backward. - We want this to do nothing since we run backward in the fwd/bwd functions from apex. - No need to call it here. + """LightningModule hook to do backward. + We want this to do nothing since we run backward in the fwd/bwd functions from apex. + No need to call it here. """ pass def optimizer_zero_grad(self, *args, **kwargs): - """ LightningModule hook to zero grad. - We want this to do nothing as we are zeroing grads during the training_step. + """LightningModule hook to zero grad. + We want this to do nothing as we are zeroing grads during the training_step. 
""" pass def _append_sequence_parallel_module_grads(self, module, grads): - """ Helper method for allreduce_sequence_parallel_gradients""" + """Helper method for allreduce_sequence_parallel_gradients""" for param in module.parameters(): sequence_parallel_param = getattr(param, 'sequence_parallel', False) @@ -632,9 +1138,9 @@ def _append_sequence_parallel_module_grads(self, module, grads): grads.append(grad.data) def allreduce_sequence_parallel_gradients(self): - """ All-reduce layernorm parameters across model parallel nodes when sequence parallelism is used. - Modified from megatron-lm: - https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/blob/3f91f09bb2ab32f9904b47f46f19d2fc3f518ed8/megatron/training.py#L425 + """All-reduce layernorm parameters across model parallel nodes when sequence parallelism is used. + Modified from megatron-lm: + https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/blob/3f91f09bb2ab32f9904b47f46f19d2fc3f518ed8/megatron/training.py#L425 """ grads = [] @@ -650,7 +1156,18 @@ def allreduce_sequence_parallel_gradients(self): buf.copy_(synced) def get_forward_output_and_loss_func(self): - loss_func = ClipLoss(local_loss=self.cfg.local_loss, gather_with_grad=self.cfg.gather_with_grad,) + if self.cfg.get("use_siglip", False): + # TODO(yuya): fix rank + loss_func = SigLipLoss( + rank=parallel_state.get_data_parallel_rank(), + world_size=parallel_state.get_data_parallel_world_size(), + group=parallel_state.get_data_parallel_group(), + ) + else: + loss_func = ClipLoss( + local_loss=self.cfg.local_loss, + gather_with_grad=self.cfg.gather_with_grad, + ) def fwd_output_and_loss_func(dataloader_iter, model): batch, _, _ = next(dataloader_iter) @@ -690,7 +1207,8 @@ def zero_shot_classifier(self): texts = texts.cuda(non_blocking=True) # TODO (yuya): distributed not working with torch.cuda.amp.autocast( - enabled=self.autocast_dtype in (torch.half, torch.bfloat16), dtype=self.autocast_dtype, + enabled=self.autocast_dtype in (torch.half, torch.bfloat16), + dtype=self.autocast_dtype, ): class_embeddings = text_encoder(texts) class_embedding = F.normalize(class_embeddings, dim=-1).mean(dim=0) @@ -726,7 +1244,8 @@ def accuracy(output, target, topk=(1,)): target = target.cuda(non_blocking=True) # predict with torch.cuda.amp.autocast( - enabled=self.autocast_dtype in (torch.half, torch.bfloat16), dtype=self.autocast_dtype, + enabled=self.autocast_dtype in (torch.half, torch.bfloat16), + dtype=self.autocast_dtype, ): image_features = vision_encoder(images) image_features = F.normalize(image_features, dim=-1) @@ -745,10 +1264,10 @@ def accuracy(output, target, topk=(1,)): def validation_step(self, dataloader_iter): """ - Our dataloaders produce a micro-batch and then we fetch - a number of microbatches depending on the global batch size and model parallel size - from the dataloader to produce a list of microbatches. - The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. """ + Our dataloaders produce a micro-batch and then we fetch + a number of microbatches depending on the global batch size and model parallel size + from the dataloader to produce a list of microbatches. + The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions.""" # Initialize userbuffer communicators. 
if self.initialize_ub: self.initialize_ub_func() @@ -801,7 +1320,9 @@ def build_train_valid_test_datasets(self): raise ValueError("limit_val_batches must be an integer or float less than or equal to 1.0.") self._train_ds, self._validation_ds = build_train_valid_datasets( - model_cfg=self.cfg, consumed_samples=self.compute_consumed_samples(0), tokenizer=self.tokenizer, + model_cfg=self.cfg, + consumed_samples=self.compute_consumed_samples(0), + tokenizer=self.tokenizer, ) self._test_ds = None @@ -816,7 +1337,7 @@ def build_train_valid_test_datasets(self): return self._train_ds, self._validation_ds, self._test_ds def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. + """PTL hook that is executed after DDP spawns. We setup datasets here as megatron datasets require DDP to instantiate. See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. Args: @@ -909,23 +1430,18 @@ def setup_test_data(self, cfg): f'Setting up test dataloader with len(len(self._test_ds)): {len(self._test_ds)} and consumed samples: {consumed_samples}' ) self._test_dl = torch.utils.data.DataLoader( - self._test_ds, batch_size=self._micro_batch_size, num_workers=cfg.num_workers, pin_memory=True, + self._test_ds, + batch_size=self._micro_batch_size, + num_workers=cfg.num_workers, + pin_memory=True, ) def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = None) -> Any: raise NotImplementedError - def transfer_batch_to_device(self, batch: Any, device: torch.device, dataloader_idx: int) -> Any: - """ PTL hook: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#transfer-batch-to-device - When using pipeline parallelism, we need the global batch to remain on the CPU, - since the memory overhead will be too high when using a large number of microbatches. - Microbatches are transferred from CPU to GPU inside the pipeline. - """ - return batch - def _validate_trainer(self): - """ Certain trainer configurations can break training. - Here we try to catch them and raise an error. + """Certain trainer configurations can break training. + Here we try to catch them and raise an error. """ if self.trainer.accumulate_grad_batches > 1: raise ValueError( @@ -961,3 +1477,178 @@ def parameters(self): return itertools.chain.from_iterable(module.parameters() for module in self.model) else: return self.model.parameters() + + def build_transformer_config(self, model_cfg=None) -> TransformerConfig: + """Builds the megatron core gpt transformer config for the model. + For attributes in the nemo model config that are the same + as the megatron core TransformerConfig, we will use the value from the nemo model config. + For attributes in TransformerConfig that are not in the nemo model config, we add custom logic. + """ + if model_cfg is None: + model_cfg = self.cfg + normalization = model_cfg.get('normalization', 'layernorm').lower() + layernorm_zero_centered_gamma = model_cfg.get('normalization', 'layernorm') == 'layernorm1p' + if normalization == 'layernorm': + normalization = 'LayerNorm' + elif normalization == 'rmsnorm': + normalization = 'RMSNorm' + elif normalization == 'layernorm1p': + normalization = 'LayerNorm' + layernorm_zero_centered_gamma = True + else: + logging.warning( + f"The normalization type: {normalization} might not be supported in megatron core." + f"Supported types are LayerNorm and RMSNorm." 
+ ) + + ub_tp_comm_overlap = model_cfg.get('ub_tp_comm_overlap', False) + + if not model_cfg.get('fp8', False): + fp8 = None + elif model_cfg.get('fp8_e4m3', False): + fp8 = 'e4m3' + elif model_cfg.get('fp8_hybrid', False): + fp8 = 'hybrid' + else: + raise ValueError(f"fp8 enabled but fp8_format (fp8_e4m3 | fp8_hybrid) is not set.") + + # any configs that are not in the nemo model config will be added here + model_specific_configs = { + 'layernorm_zero_centered_gamma': layernorm_zero_centered_gamma, + 'normalization': normalization, + 'fp8': fp8, + 'tp_comm_overlap': ub_tp_comm_overlap, + # MoE related + 'num_moe_experts': model_cfg.get('num_moe_experts', None), + 'moe_router_load_balancing_type': model_cfg.get('moe_router_load_balancing_type', 'aux_loss'), + 'moe_router_topk': model_cfg.get('moe_router_topk', 2), + 'moe_grouped_gemm': model_cfg.get('moe_grouped_gemm', False), + 'moe_aux_loss_coeff': model_cfg.get( + 'moe_aux_loss_coeff', 0 + ), # 1e-2 would be a good start value for load balance loss. + 'moe_z_loss_coeff': model_cfg.get('moe_z_loss_coeff', None), # 1e-3 would be a good start value for z-loss + 'moe_input_jitter_eps': model_cfg.get('moe_input_jitter_eps', None), + 'moe_token_dropping': model_cfg.get('moe_token_dropping', False), # TODO: Support token dropping. + } + if model_specific_configs['num_moe_experts'] is not None: + assert mcore_supports_moe(), 'Megatron-core >= v0.5.0 is required for MoE' + elif not mcore_supports_moe(): + if 'num_moe_experts' in model_specific_configs: + del model_specific_configs['num_moe_experts'] + moe_keys = list(filter(lambda x: x.startswith('moe_'), model_specific_configs.keys())) + for k in moe_keys: + del model_specific_configs[k] + + # create a dictionary copy of the model config + cfg = OmegaConf.to_container(model_cfg, resolve=True) + + # create a dict to store the transformer config arguments + transformer_config_dict = {} + + # get model parallel configs from the base class + model_parallel_config = self.build_model_parallel_config() + + add_bias_linear = model_cfg.get('bias', True) + add_qkv_bias = model_cfg.get('qkv_bias', False) + + activation = model_cfg.get('activation', 'gelu') + gated_linear_unit = activation.endswith('glu') + # TODO: need to check which activation functions are supported in mcore + activation_func = activation_to_func(activation, openai_gelu=model_cfg.get("openai_gelu", False)) + + normalization = model_cfg.get('normalization', 'LayerNorm') + + init_method_std = model_cfg.get('init_method_std', 0.02) + # default used in mcore + init_method = init_method_normal(init_method_std) + + output_layer_init_method = init_method + num_layers = model_cfg.get('num_layers', 1) + use_scaled_init_method = model_cfg.get('use_scaled_init_method', True) + if use_scaled_init_method: + output_layer_init_method = scaled_init_method_normal(init_method_std, num_layers=num_layers) + + attention_softmax_in_fp32 = False # not currently used in NeMo unless apply_query_key_layer_scaling is True + apply_query_key_layer_scaling = model_cfg.get('apply_query_key_layer_scaling', False) + + rotary_interleaved = model_cfg.get('rotary_interleaved', False) + + fp16_enabled = self.trainer.precision in [16, '16', '16-mixed'] + if apply_query_key_layer_scaling: + if fp16_enabled: + os.environ["NVTE_APPLY_QK_LAYER_SCALING"] = "1" + else: + logging.warning( + "apply_query_key_layer_scaling is only enabled when using FP16, setting it to False " + "and setting NVTE_APPLY_QK_LAYER_SCALING=0" + ) + os.environ["NVTE_APPLY_QK_LAYER_SCALING"] = "0" + 
apply_query_key_layer_scaling = False + + if apply_query_key_layer_scaling: + attention_softmax_in_fp32 = True + + bias_activation_fusion = model_cfg.get('bias_activation_fusion', True) + + bias_dropout_fusion = model_cfg.get('bias_dropout_add_fusion', True) + + apply_rope_fusion = model_cfg.get('apply_rope_fusion', False) + + # TODO: need to check if recompute APIs are matching up properly + recompute_granularity = model_cfg.get('activations_checkpoint_granularity', None) + recompute_method = model_cfg.get('activations_checkpoint_method', None) + recompute_num_layers = model_cfg.get('activations_checkpoint_num_layers', None) + + # any configs that are not in the nemo model config will be added here + config_mapping = { + 'apply_query_key_layer_scaling': apply_query_key_layer_scaling, + 'apply_residual_connection_post_layernorm': False, # we don't use this in NeMo + 'layernorm_zero_centered_gamma': False, + 'add_bias_linear': add_bias_linear, + 'add_qkv_bias': add_qkv_bias, + 'gated_linear_unit': gated_linear_unit, + 'activation_func': activation_func, + 'normalization': normalization, + 'init_method': init_method, + 'output_layer_init_method': output_layer_init_method, + 'attention_softmax_in_fp32': attention_softmax_in_fp32, + 'bias_activation_fusion': bias_activation_fusion, + 'bias_dropout_fusion': bias_dropout_fusion, + 'apply_rope_fusion': apply_rope_fusion, + 'recompute_granularity': recompute_granularity, + 'recompute_method': recompute_method, + 'recompute_num_layers': recompute_num_layers, + 'distribute_saved_activations': False, # not currently used in NeMo + 'fp8': None, + 'rotary_interleaved': rotary_interleaved, + 'deallocate_pipeline_outputs': True, + } + + # populate the transformer config dict + for field in fields(TransformerConfig): + # config mapping has second highest priority + if field.name in config_mapping: + transformer_config_dict[field.name] = config_mapping[field.name] + # then config + elif field.name in cfg: + transformer_config_dict[field.name] = cfg[field.name] + # then model parallel config + elif field in fields(model_parallel_config): + transformer_config_dict[field.name] = getattr(model_parallel_config, field.name) + else: + logging.warning( + f"The model: {self} does not have field.name: {field.name} in its cfg. " + f"Add this key to cfg or config_mapping to make to make it configurable." 
+ ) + + transformer_config = TransformerConfig(**transformer_config_dict) + + for key, value in model_specific_configs.items(): + setattr(transformer_config, key, value) + + # pass mcore customization configs directly to mcore + mcore_customization_config_dict = model_cfg.get('mcore_customization_config', {}) + for key, value in mcore_customization_config_dict.items(): + setattr(transformer_config, key, value) + + return transformer_config diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 7308d3db3f91..4ded9a42db4f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -484,7 +484,7 @@ def build_transformer_config(self) -> TransformerConfig: activation = self.cfg.get('activation', 'gelu') gated_linear_unit = activation.endswith('glu') # TODO: need to check which activation functions are supported in mcore - activation_func = activation_to_func(activation) + activation_func = activation_to_func(activation, openai_gelu=self.cfg.get("openai_gelu", False)) normalization = self.cfg.get('normalization', 'LayerNorm') diff --git a/nemo/collections/nlp/parts/utils_funcs.py b/nemo/collections/nlp/parts/utils_funcs.py index c00df5de1a98..a989ff3f606c 100644 --- a/nemo/collections/nlp/parts/utils_funcs.py +++ b/nemo/collections/nlp/parts/utils_funcs.py @@ -34,14 +34,14 @@ from sklearn.metrics import classification_report, confusion_matrix from torch import Tensor -from nemo.collections.nlp.modules.common.megatron.utils import erf_gelu +from nemo.collections.nlp.modules.common.megatron.utils import ApproxGELUActivation, erf_gelu from nemo.collections.nlp.modules.common.megatron.utils import openai_gelu as openai_gelu_func from nemo.collections.nlp.modules.common.megatron.utils import squared_relu from nemo.utils import logging def torch_dtype_from_precision(precision: Union[int, str], megatron_amp_O2: Optional[bool] = None) -> torch.dtype: - """ Mapping from PTL precision types to corresponding PyTorch parameter datatype.""" + """Mapping from PTL precision types to corresponding PyTorch parameter datatype.""" if megatron_amp_O2 is not None and megatron_amp_O2 is False: return torch.float32 @@ -56,12 +56,12 @@ def torch_dtype_from_precision(precision: Union[int, str], megatron_amp_O2: Opti def list2str(l: List[int]) -> str: - """ Converts list to a string""" + """Converts list to a string""" return ' '.join([str(x) for x in l]) def tensor2list(tensor: Tensor) -> List[Union[int, float]]: - """ Converts tensor to a list """ + """Converts tensor to a list""" return tensor.detach().cpu().tolist() @@ -168,13 +168,13 @@ def get_last_rank(): def activation_to_func(activation: str, openai_gelu: bool = False, onnx_safe: bool = False) -> Callable: - """ Converts an activation function represented as a string to a function. + """Converts an activation function represented as a string to a function. Args: activation (str): string representation of an activation function, typically gotten from the model config. openai_gelu (bool): whether to use the OpenAI GELU implementation. Used with HF compatibility. onnx_safe (bool): whether to use the ONNX-compatible implementation of GELU. - + Returns: Callable: the activation function. 
""" @@ -188,6 +188,7 @@ def activation_to_func(activation: str, openai_gelu: bool = False, onnx_safe: bo 'fast-geglu', 'fast-swiglu', 'fast-reglu', + 'approx-gelu', ] if activation not in supported_activations: @@ -208,6 +209,8 @@ def activation_to_func(activation: str, openai_gelu: bool = False, onnx_safe: bo activation_func = F.silu elif activation == 'squared-relu': activation_func = squared_relu + elif activation == 'approx-gelu': + activation_func = ApproxGELUActivation return activation_func diff --git a/scripts/checkpoint_converters/convert_clip_hf_to_nemo.py b/scripts/checkpoint_converters/convert_clip_hf_to_nemo.py new file mode 100644 index 000000000000..690fa74abccd --- /dev/null +++ b/scripts/checkpoint_converters/convert_clip_hf_to_nemo.py @@ -0,0 +1,248 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Usage example: + torchrun --nproc-per-node=1 /opt/NeMo/scripts/checkpoint_converters/convert_clip_hf_to_nemo.py \ + --input_name_or_path=openai/clip-vit-large-patch14 \ + --output_path=openai_clip.nemo \ + --hparams_file=/opt/NeMo/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml + +Additionally, provide a NeMo hparams file with the correct model architecture arguments. Refer to examples/multimodal/foundation/clip/conf/megatron_clip_config.yaml. + +After conversion, you can verify with the following command: + + wget https://upload.wikimedia.org/wikipedia/commons/0/0f/1665_Girl_with_a_Pearl_Earring.jpg + torchrun --nproc-per-node=1 /opt/NeMo/examples/multimodal/vision_language_foundation/clip/megatron_clip_infer.py \ + model.restore_from_path=./openai_clip.nemo \ + image_path=./1665_Girl_with_a_Pearl_Earring.jpg \ + texts='["a dog", "a boy", "a girl"]' + +It should generate a high probability for "a girl" tag, e.g. +Given image's CLIP text probability: [('a dog', 0.0049710185), ('a boy', 0.002258187), ('a girl', 0.99277073)] + +""" + +import os +from argparse import ArgumentParser + +import torch +from omegaconf import OmegaConf +from pytorch_lightning.plugins.environments import TorchElasticEnvironment +from pytorch_lightning.trainer.trainer import Trainer +from transformers import CLIPModel + +from nemo.collections.multimodal.models.vision_language_foundation.clip.megatron_clip_models import MegatronCLIPModel +from nemo.utils import AppState, logging +from nemo.utils.distributed import initialize_distributed + +try: + from megatron.core import parallel_state + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + + +def get_args(): + parser = ArgumentParser() + parser.add_argument("--input_name_or_path", type=str, default="openai/clip-vit-base-patch32") + + parser.add_argument( + "--hparams_file", + type=str, + default=None, + required=True, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. 
Ex: /opt/NeMo/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml", + ) + parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") + + parser.add_argument("--gpus_per_node", type=int, required=False, default=1) + parser.add_argument("--tensor_model_parallel_size", type=int, required=False, default=1) + parser.add_argument("--pipeline_model_parallel_size", type=int, required=False, default=1) + parser.add_argument( + "--pipeline_model_parallel_split_rank", + type=int, + required=False, + default=None, + help="If pipeline parallel size > 1, this is the rank at which the encoder ends and the decoder begins.", + ) + parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) + + args = parser.parse_args() + return args + + +def mapping_hf_state_dict(hf_model): + hf_state_dict = hf_model.state_dict() + hf_config = hf_model.config + key_mapping = { + "text_projection.weight": "text_encoder.head.weight", + "visual_projection.weight": "vision_encoder.head.weight", + } + + layer_mapping = { + ".layer_norm1.weight": ".self_attention.linear_qkv.layer_norm_weight", + ".layer_norm1.bias": ".self_attention.linear_qkv.layer_norm_bias", + ".layer_norm2.weight": ".mlp.linear_fc1.layer_norm_weight", + ".layer_norm2.bias": ".mlp.linear_fc1.layer_norm_bias", + ".self_attn.out_proj.weight": ".self_attention.linear_proj.weight", + ".self_attn.out_proj.bias": ".self_attention.linear_proj.bias", + ".mlp.fc1.weight": ".mlp.linear_fc1.weight", + ".mlp.fc1.bias": ".mlp.linear_fc1.bias", + ".mlp.fc2.weight": ".mlp.linear_fc2.weight", + ".mlp.fc2.bias": ".mlp.linear_fc2.bias", + ".pre_layrnorm.weight": ".ln_pre.weight", + ".pre_layrnorm.bias": ".ln_pre.bias", + ".post_layernorm.weight": ".final_layernorm.weight", + ".post_layernorm.bias": ".final_layernorm.bias", + ".embeddings.patch_embedding.weight": ".conv1.weight", + ".embeddings.class_embedding": ".class_token", + ".final_layer_norm.weight": ".final_layernorm.weight", + ".final_layer_norm.bias": ".final_layernorm.bias", + ".embeddings.token_embedding.weight": ".embedding.word_embeddings.weight", + "vision_encoder.embeddings.position_embedding.weight": "vision_encoder.position_embeddings.weight", + "text_encoder.embeddings.position_embedding.weight": "text_encoder.embedding.position_embeddings.weight", + } + + nemo_state_dict = {} + for key in hf_state_dict.keys(): + if key.startswith("text_model.encoder.layers"): + key_ = key.replace("text_model.encoder.layers", "text_encoder.decoder.layers") + elif key.startswith("vision_model.encoder.layers"): + key_ = key.replace("vision_model.encoder.layers", "vision_encoder.decoder.layers") + elif key.startswith('vision_model.'): + key_ = key.replace("vision_model.", "vision_encoder.") + elif key.startswith('text_model.'): + key_ = key.replace('text_model.', 'text_encoder.') + else: + key_ = key + for pat in key_mapping: + if key_ == pat: + key_ = key_.replace(pat, key_mapping[pat]) + for pat in layer_mapping: + if key_.endswith(pat): + key_ = key_[: -len(pat)] + layer_mapping[pat] + break + if "vision" in key_: + config = hf_config.vision_config + else: + config = hf_config.text_config + head_num = num_query_groups = config.num_attention_heads + hidden_size = config.hidden_size + head_size = hidden_size // head_num + heads_per_group = head_num // num_query_groups + + if 'q_proj.weight' in key_: + key_k = key.replace('q_proj', 'k_proj') + key_v = key.replace('q_proj', 'v_proj') + key_new = 
key_.replace('self_attn.q_proj', 'self_attention.linear_qkv') + q_weight, k_weight, v_weight = hf_state_dict[key], hf_state_dict[key_k], hf_state_dict[key_v] + + q_weight = q_weight.reshape(head_num, head_size, hidden_size) + k_weight = k_weight.reshape(num_query_groups, head_size, hidden_size) + v_weight = v_weight.reshape(num_query_groups, head_size, hidden_size) + qkv_weight = torch.empty((0, head_size, hidden_size), device=q_weight.device) + for i in range(num_query_groups): + qkv_weight = torch.cat((qkv_weight, q_weight[i * heads_per_group : (i + 1) * heads_per_group, :, :])) + qkv_weight = torch.cat((qkv_weight, k_weight[i : i + 1, :, :])) + qkv_weight = torch.cat((qkv_weight, v_weight[i : i + 1, :, :])) + qkv_weight = qkv_weight.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + nemo_state_dict[key_new] = qkv_weight + + elif 'q_proj.bias' in key_: + key_k = key.replace('q_proj', 'k_proj') + key_v = key.replace('q_proj', 'v_proj') + key_new = key_.replace('self_attn.q_proj', 'self_attention.linear_qkv') + q_bias, k_bias, v_bias = hf_state_dict[key], hf_state_dict[key_k], hf_state_dict[key_v] + + q_bias = q_bias.reshape(head_num, head_size) + k_bias = k_bias.reshape(num_query_groups, head_size) + v_bias = v_bias.reshape(num_query_groups, head_size) + qkv_bias = torch.empty((0, head_size), device=q_bias.device) + for i in range(num_query_groups): + qkv_bias = torch.cat((qkv_bias, q_bias[i * heads_per_group : (i + 1) * heads_per_group, :])) + qkv_bias = torch.cat((qkv_bias, k_bias[i : i + 1, :])) + qkv_bias = torch.cat((qkv_bias, v_bias[i : i + 1, :])) + qkv_bias = qkv_bias.reshape([head_size * (head_num + 2 * num_query_groups)]) + nemo_state_dict[key_new] = qkv_bias + elif not ('k_proj' in key_ or 'v_proj' in key_ or 'position_ids' in key_): + nemo_state_dict[key_] = hf_state_dict[key] + + nemo_state_dict["vision_encoder.class_token"] = nemo_state_dict["vision_encoder.class_token"].reshape(1, 1, -1) + + return nemo_state_dict + + +def convert(local_rank, rank, world_size, args): + app_state = AppState() + app_state.data_parallel_rank = 0 + num_nodes = world_size // args.gpus_per_node + trainer = Trainer( + devices=args.gpus_per_node, num_nodes=num_nodes, accelerator='gpu', plugins=[TorchElasticEnvironment()] + ) + + app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size + app_state.tensor_model_parallel_size = args.tensor_model_parallel_size + + # no use atm, use to split ranks in encoder/decoder models. + if args.pipeline_model_parallel_size > 1 and args.model_type in []: + if args.pipeline_model_parallel_split_rank is not None: + app_state.pipeline_model_parallel_split_rank = args.pipeline_model_parallel_split_rank + else: + if args.pipeline_model_parallel_size % 2 != 0: + raise ValueError( + f"Pipeline model parallel size {args.pipeline_model_parallel_size} must be even if split rank is not specified." + ) + else: + # If split rank is not set, then we set it to be pipeline_model_parallel_size // 2 - this is because in most cases we have the same number of enc/dec layers. 
+ app_state.pipeline_model_parallel_split_rank = args.pipeline_model_parallel_size // 2 + else: + app_state.pipeline_model_parallel_split_rank = None + + app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size + + parallel_state.initialize_model_parallel( + tensor_model_parallel_size=app_state.tensor_model_parallel_size, + pipeline_model_parallel_size=app_state.pipeline_model_parallel_size, + pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank, + ) + + app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank() + app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank() + + cfg = OmegaConf.load(args.hparams_file) + cfg.model.mcore_gpt = True + cfg.model.transformer_engine = True + cfg.model.text.position_embedding_type = "learned_absolute" + cfg.model.vision.position_embedding_type = "learned_absolute" + + model = MegatronCLIPModel(cfg.model, trainer) + + hf_model = CLIPModel.from_pretrained(args.input_name_or_path) + state_dict = mapping_hf_state_dict(hf_model) + + model.model.load_state_dict(state_dict, strict=False) + + model.save_to(args.output_path) + + logging.info(f'NeMo model saved to: {args.output_path}') + + +if __name__ == '__main__': + args = get_args() + local_rank, rank, world_size = initialize_distributed(args) + convert(local_rank, rank, world_size, args) diff --git a/scripts/checkpoint_converters/convert_siglip_hf_to_nemo.py b/scripts/checkpoint_converters/convert_siglip_hf_to_nemo.py new file mode 100644 index 000000000000..97a9d557f78b --- /dev/null +++ b/scripts/checkpoint_converters/convert_siglip_hf_to_nemo.py @@ -0,0 +1,380 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Requires HF transformers updated to support Gemma Models + python3 /opt/NeMo/scripts/nlp_language_modeling/convert_gemma_hf_to_nemo.py \ + --input_name_or_path /path/to/gemma/checkpoints/hf/7b \ + --output_path /path/to/gemma-7b.nemo \ + --tokenizer_path /path/to/tokenizer.model +""" + +import os +from argparse import ArgumentParser + +import torch +from omegaconf import OmegaConf +from transformers import AutoModel, AutoProcessor + +from nemo.collections.multimodal.models.vision_language_foundation.clip.megatron_clip_models import MegatronCLIPModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision +from nemo.utils import logging + + +def create_rename_keys(num_hidden_layers): + rename_keys = [] + for i in range(num_hidden_layers): + rename_keys.extend( + [ + ( + f"text_model.encoder.layers.{i}.self_attn.k_proj.weight", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_k.weight", + ), + ( + f"text_model.encoder.layers.{i}.self_attn.k_proj.bias", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_k.bias", + ), + ( + f"text_model.encoder.layers.{i}.self_attn.q_proj.weight", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_q.weight", + ), + ( + f"text_model.encoder.layers.{i}.self_attn.q_proj.bias", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_q.bias", + ), + ( + f"text_model.encoder.layers.{i}.self_attn.v_proj.weight", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_v.weight", + ), + ( + f"text_model.encoder.layers.{i}.self_attn.v_proj.bias", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_v.bias", + ), + ( + f"text_model.encoder.layers.{i}.self_attn.out_proj.weight", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_proj.weight", + ), + ( + f"text_model.encoder.layers.{i}.self_attn.out_proj.bias", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_proj.bias", + ), + ( + f"text_model.encoder.layers.{i}.layer_norm1.weight", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_qkv.layer_norm_weight", + ), + ( + f"text_model.encoder.layers.{i}.layer_norm1.bias", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_qkv.layer_norm_bias", + ), + ( + f"text_model.encoder.layers.{i}.mlp.fc1.weight", + f"model.text_encoder.decoder.layers.{i}.mlp.linear_fc1.weight", + ), + ( + f"text_model.encoder.layers.{i}.mlp.fc1.bias", + f"model.text_encoder.decoder.layers.{i}.mlp.linear_fc1.bias", + ), + ( + f"text_model.encoder.layers.{i}.mlp.fc2.weight", + f"model.text_encoder.decoder.layers.{i}.mlp.linear_fc2.weight", + ), + ( + f"text_model.encoder.layers.{i}.mlp.fc2.bias", + f"model.text_encoder.decoder.layers.{i}.mlp.linear_fc2.bias", + ), + ( + f"text_model.encoder.layers.{i}.layer_norm2.weight", + f"model.text_encoder.decoder.layers.{i}.mlp.linear_fc1.layer_norm_weight", + ), + ( + f"text_model.encoder.layers.{i}.layer_norm2.bias", + f"model.text_encoder.decoder.layers.{i}.mlp.linear_fc1.layer_norm_bias", + ), + ( + f"vision_model.encoder.layers.{i}.self_attn.k_proj.weight", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_k.weight", + ), + ( + f"vision_model.encoder.layers.{i}.self_attn.k_proj.bias", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_k.bias", + ), + ( + f"vision_model.encoder.layers.{i}.self_attn.v_proj.weight", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_v.weight", + ), + 
( + f"vision_model.encoder.layers.{i}.self_attn.v_proj.bias", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_v.bias", + ), + ( + f"vision_model.encoder.layers.{i}.self_attn.q_proj.weight", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_q.weight", + ), + ( + f"vision_model.encoder.layers.{i}.self_attn.q_proj.bias", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_q.bias", + ), + ( + f"vision_model.encoder.layers.{i}.self_attn.out_proj.weight", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_proj.weight", + ), + ( + f"vision_model.encoder.layers.{i}.self_attn.out_proj.bias", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_proj.bias", + ), + ( + f"vision_model.encoder.layers.{i}.layer_norm1.weight", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_qkv.layer_norm_weight", + ), + ( + f"vision_model.encoder.layers.{i}.layer_norm1.bias", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_qkv.layer_norm_bias", + ), + ( + f"vision_model.encoder.layers.{i}.mlp.fc1.weight", + f"model.vision_encoder.decoder.layers.{i}.mlp.linear_fc1.weight", + ), + ( + f"vision_model.encoder.layers.{i}.mlp.fc1.bias", + f"model.vision_encoder.decoder.layers.{i}.mlp.linear_fc1.bias", + ), + ( + f"vision_model.encoder.layers.{i}.mlp.fc2.weight", + f"model.vision_encoder.decoder.layers.{i}.mlp.linear_fc2.weight", + ), + ( + f"vision_model.encoder.layers.{i}.mlp.fc2.bias", + f"model.vision_encoder.decoder.layers.{i}.mlp.linear_fc2.bias", + ), + ( + f"vision_model.encoder.layers.{i}.layer_norm2.weight", + f"model.vision_encoder.decoder.layers.{i}.mlp.linear_fc1.layer_norm_weight", + ), + ( + f"vision_model.encoder.layers.{i}.layer_norm2.bias", + f"model.vision_encoder.decoder.layers.{i}.mlp.linear_fc1.layer_norm_bias", + ), + ] + ) + + rename_keys.extend( + [ + ("logit_scale", "model.logit_scale"), + ("logit_bias", "model.logit_bias"), + ("vision_model.embeddings.patch_embedding.weight", "model.vision_encoder.conv1.weight"), + ("vision_model.embeddings.patch_embedding.bias", "model.vision_encoder.conv1.bias"), + ("vision_model.embeddings.position_embedding.weight", "model.vision_encoder.position_embeddings.weight"), + ("vision_model.post_layernorm.weight", "model.vision_encoder.final_layernorm.weight"), + ("vision_model.post_layernorm.bias", "model.vision_encoder.final_layernorm.bias"), + ("vision_model.head.probe", "model.vision_encoder.head.probe"), + ( + "vision_model.head.attention.in_proj_weight", + "model.vision_encoder.head.cross_attention.linear_qkv.weight", + ), + ("vision_model.head.attention.in_proj_bias", "model.vision_encoder.head.cross_attention.linear_qkv.bias"), + ( + "vision_model.head.attention.out_proj.weight", + "model.vision_encoder.head.cross_attention.linear_proj.weight", + ), + ( + "vision_model.head.attention.out_proj.bias", + "model.vision_encoder.head.cross_attention.linear_proj.bias", + ), + ("vision_model.head.layernorm.weight", "model.vision_encoder.head.mlp.linear_fc1.layer_norm_weight"), + ("vision_model.head.layernorm.bias", "model.vision_encoder.head.mlp.linear_fc1.layer_norm_bias"), + ("vision_model.head.mlp.fc1.weight", "model.vision_encoder.head.mlp.linear_fc1.weight"), + ("vision_model.head.mlp.fc1.bias", "model.vision_encoder.head.mlp.linear_fc1.bias"), + ("vision_model.head.mlp.fc2.weight", "model.vision_encoder.head.mlp.linear_fc2.weight"), + ("vision_model.head.mlp.fc2.bias", "model.vision_encoder.head.mlp.linear_fc2.bias"), + 
("text_model.embeddings.token_embedding.weight", "model.text_encoder.embedding.word_embeddings.weight"), + ( + "text_model.embeddings.position_embedding.weight", + "model.text_encoder.embedding.position_embeddings.weight", + ), + ("text_model.final_layer_norm.weight", "model.text_encoder.final_layernorm.weight"), + ("text_model.final_layer_norm.bias", "model.text_encoder.final_layernorm.bias"), + ("text_model.head.weight", "model.text_encoder.head.weight"), + ("text_model.head.bias", "model.text_encoder.head.bias"), + ] + ) + + return rename_keys + + +def rename_model_keys(model_state_dict, rename_keys): + """ + Rename keys in the model's state dictionary based on the provided mappings. + + Parameters: + model_state_dict (dict): The state dictionary of the model. + rename_keys (list): A list of tuples with the mapping (old_key, new_key). + + Returns: + dict: A new state dictionary with updated key names. + """ + + # Create a new state dictionary with updated key names + new_state_dict = {} + + # Track keys from the original state dict to ensure all are processed + remaining_keys = set(model_state_dict.keys()) + + # Iterate over the rename mappings + for old_key, new_key in rename_keys: + if old_key in model_state_dict: + # Rename the key and remove it from the tracking set + new_state_dict[new_key] = model_state_dict[old_key] + remaining_keys.remove(old_key) + + # Check if any keys were not converted from old to new + for old_key in remaining_keys: + print(f"Warning: Key '{old_key}' was not converted.") + + return new_state_dict + + +def adjust_tensor_shapes(model, nemo_state_dict): + """ + Adapt tensor shapes in the state dictionary to ensure compatibility with a different model structure. + + Parameters: + nemo_state_dict (dict): The state dictionary of the model. + + Returns: + dict: The updated state dictionary with modified tensor shapes for compatibility. + """ + model_config = model.cfg + + # Note: For 'key' and 'value' weight and biases, NeMo uses a consolidated tensor 'query_key_value'. + for key_ in list(nemo_state_dict.keys()): + if "vision" in key_: + config = model_config["vision"] + else: + config = model_config["text"] + num_query_groups = head_num = config["num_attention_heads"] + hidden_size = config["hidden_size"] + head_size = hidden_size // head_num + heads_per_group = head_num // num_query_groups + if "bias" in key_: + hidden_size = 1 + + if 'head.cross_attention.linear_qkv.' in key_: + key_q = key_.replace('linear_qkv', 'linear_q') + key_kv = key_.replace('linear_qkv', 'linear_kv') + q_weight, k_weight, v_weight = nemo_state_dict[key_].chunk(3) + k_weight = k_weight.reshape(num_query_groups, head_size, hidden_size) + v_weight = v_weight.reshape(num_query_groups, head_size, hidden_size) + kv_weight = torch.empty((0, head_size, hidden_size), device=q_weight.device) + for i in range(num_query_groups): + kv_weight = torch.cat((kv_weight, k_weight[i : i + 1, :, :])) + kv_weight = torch.cat((kv_weight, v_weight[i : i + 1, :, :])) + kv_weight = kv_weight.reshape([head_size * 2 * num_query_groups, hidden_size]) + if "bias" in key_: + kv_weight = kv_weight.squeeze(-1) + nemo_state_dict[key_q] = q_weight + nemo_state_dict[key_kv] = kv_weight + del nemo_state_dict[key_] + + if 'self_attention.linear_q.' 
in key_: + key_q = key_ + key_k = key_.replace('linear_q', 'linear_k') + key_v = key_.replace('linear_q', 'linear_v') + key_qkv = key_.replace('linear_q', 'linear_qkv') + + # [(head_num + 2 * num_query_groups) * head_size, hidden_size] + # -> [head_num, head_size, hidden_size], 2 * [num_query_groups, head_size, hidden_size] + q_weight, k_weight, v_weight = nemo_state_dict[key_q], nemo_state_dict[key_k], nemo_state_dict[key_v] + q_weight = q_weight.reshape(head_num, head_size, hidden_size) + k_weight = k_weight.reshape(num_query_groups, head_size, hidden_size) + v_weight = v_weight.reshape(num_query_groups, head_size, hidden_size) + + qkv_weight = torch.empty((0, head_size, hidden_size), device=q_weight.device) + for i in range(num_query_groups): + qkv_weight = torch.cat((qkv_weight, q_weight[i * heads_per_group : (i + 1) * heads_per_group, :, :])) + qkv_weight = torch.cat((qkv_weight, k_weight[i : i + 1, :, :])) + qkv_weight = torch.cat((qkv_weight, v_weight[i : i + 1, :, :])) + qkv_weight = qkv_weight.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + if "bias" in key_: + qkv_weight = qkv_weight.squeeze(-1) + nemo_state_dict[key_qkv] = qkv_weight + del nemo_state_dict[key_q], nemo_state_dict[key_k], nemo_state_dict[key_v] + + return nemo_state_dict + + +def adjust_nemo_config(model_config, ref_config): + model_config["encoder_seq_length"] = ref_config["max_position_embeddings"] + model_config["num_layers"] = ref_config["num_hidden_layers"] + model_config["ffn_hidden_size"] = ref_config["intermediate_size"] + model_config["hidden_size"] = ref_config["hidden_size"] + model_config["num_attention_heads"] = ref_config["num_attention_heads"] + model_config["num_query_groups"] = ref_config["num_key_value_heads"] + model_config["kv_channels"] = ref_config["head_dim"] + model_config["layernorm_epsilon"] = ref_config["rms_norm_eps"] + return model_config + + +def get_args(): + parser = ArgumentParser() + parser.add_argument("--input_name_or_path", type=str) + parser.add_argument("--tokenizer_path", type=str) + parser.add_argument( + "--hparams_file", + type=str, + default=os.path.join( + os.path.dirname(__file__), + '../../examples/multimodal/vision_language_foundation/clip/conf/megatron_siglip_so400m_14_384.yaml', + ), + required=False, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. 
Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", + ) + parser.add_argument("--output_path", type=str, default=None, help="Path to output .nemo file.") + parser.add_argument( + "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weight saved" + ) + + args = parser.parse_args() + return args + + +def convert(args): + logging.info(f"Loading checkpoint from HF: `{args.input_name_or_path}`") + hf_model = AutoModel.from_pretrained(args.input_name_or_path) + # hf_processor = AutoProcessor.from_pretrained(args.input_name_or_path) + logging.info("HF Model loading done.") + + nemo_config = OmegaConf.load(args.hparams_file) + + nemo_config.trainer["precision"] = args.precision + trainer = MegatronTrainerBuilder(nemo_config).create_trainer() + model = MegatronCLIPModel(nemo_config.model, trainer) + + assert nemo_config.model.text.num_layers == nemo_config.model.vision.num_layers + rename_keys = create_rename_keys(nemo_config.model.text.num_layers) + old_state_dict = hf_model.state_dict() + new_state_dict = rename_model_keys(model_state_dict=old_state_dict, rename_keys=rename_keys) + + nemo_state_dict = adjust_tensor_shapes(model, new_state_dict) + model.load_state_dict(nemo_state_dict, strict=False) + + dtype = torch_dtype_from_precision(args.precision) + model = model.to(dtype=dtype) + model.save_to(args.output_path) + logging.info(f'NeMo model saved to: {args.output_path}') + + +if __name__ == '__main__': + args = get_args() + convert(args) From b4fe4a595575614d8c054ea28cecc02c90f946b6 Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Mon, 8 Jul 2024 09:23:28 -0700 Subject: [PATCH 07/13] Add REST API to deploy module (#9539) * Add REST API and FastAPI to deploy module Signed-off-by: Abhishree * Add NemoQuery and requirements Signed-off-by: Abhishree * Edit path for config.json Signed-off-by: Abhishree * Add modifications for REST API for the correct functionality Move service dir under deploy Use NeMoQueryLLM instead of NemoQuery Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Apply isort and black reformatting Signed-off-by: pre-commit-ci[bot] * Change default port for REST Service Change default port for REST service as Triton server also used the same port as default. 
Signed-off-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> * Apply isort and black reformatting Signed-off-by: athitten --------- Signed-off-by: Abhishree Signed-off-by: pre-commit-ci[bot] Signed-off-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Signed-off-by: athitten Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] Co-authored-by: athitten --- nemo/deploy/service/__init__.py | 14 +++++ nemo/deploy/service/config.json | 5 ++ nemo/deploy/service/rest_model_api.py | 87 +++++++++++++++++++++++++++ requirements/requirements_infer.txt | 4 +- scripts/deploy/nlp/deploy_triton.py | 30 ++++++++- 5 files changed, 138 insertions(+), 2 deletions(-) create mode 100644 nemo/deploy/service/__init__.py create mode 100644 nemo/deploy/service/config.json create mode 100644 nemo/deploy/service/rest_model_api.py diff --git a/nemo/deploy/service/__init__.py b/nemo/deploy/service/__init__.py new file mode 100644 index 000000000000..0349454da9e1 --- /dev/null +++ b/nemo/deploy/service/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .rest_model_api import app diff --git a/nemo/deploy/service/config.json b/nemo/deploy/service/config.json new file mode 100644 index 000000000000..d3b3440dd97b --- /dev/null +++ b/nemo/deploy/service/config.json @@ -0,0 +1,5 @@ +{ + "triton_service_port": 8000, + "triton_service_ip": "0.0.0.0", + "triton_request_timeout": 60 + } \ No newline at end of file diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py new file mode 100644 index 000000000000..5c49370fd45f --- /dev/null +++ b/nemo/deploy/service/rest_model_api.py @@ -0,0 +1,87 @@ +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json +import os +from pathlib import Path + +from fastapi import FastAPI +from pydantic import BaseModel +from pydantic_settings import BaseSettings + +from nemo.deploy.nlp import NemoQueryLLM + + +class TritonSettings(BaseSettings): + _triton_service_port: int + _triton_service_ip: str + _triton_request_timeout: str + + def __init__(self): + super(TritonSettings, self).__init__() + try: + with open(os.path.join(Path.cwd(), 'nemo/deploy/service/config.json')) as config: + config_json = json.load(config) + self._triton_service_port = config_json["triton_service_port"] + self._triton_service_ip = config_json["triton_service_ip"] + self._triton_request_timeout = config_json["triton_request_timeout"] + except Exception as error: + print("An exception occurred:", error) + return + + @property + def triton_service_port(self): + return self._triton_service_port + + @property + def triton_service_ip(self): + return self._triton_service_ip + + @property + def triton_request_timeout(self): + return self._triton_request_timeout + + +app = FastAPI() +triton_settings = TritonSettings() + + +class CompletionRequest(BaseModel): + model: str + prompt: str + max_tokens: int = 512 + temperature: float = 1.0 + top_p: float = 0.0 + n: int = 1 + stream: bool = False + stop: str | None = None + frequency_penalty: float = 1.0 + + +@app.post("/v1/completions/") +def completions_v1(request: CompletionRequest): + try: + url = triton_settings.triton_service_ip + ":" + str(triton_settings.triton_service_port) + nq = NemoQueryLLM(url=url, model_name=request.model) + output = nq.query_llm( + prompts=[request.prompt], + max_output_len=request.max_tokens, + top_k=request.n, + top_p=request.top_p, + temperature=request.temperature, + init_timeout=triton_settings.triton_request_timeout, + ) + return { + "output": output[0][0], + } + except Exception as error: + print("An exception occurred:", error) + return {"error": "An exception occurred"} diff --git a/requirements/requirements_infer.txt b/requirements/requirements_infer.txt index c18f4e81ade3..5380398c278b 100644 --- a/requirements/requirements_infer.txt +++ b/requirements/requirements_infer.txt @@ -1,4 +1,6 @@ +fastapi nvidia-pytriton +pydantic-settings tensorstore==0.1.45 +uvicorn zarr - diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index 7173c64c7438..a306231bcd61 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -18,6 +18,8 @@ import sys from pathlib import Path +import uvicorn + from nemo.deploy import DeployPyTriton LOGGER = logging.getLogger("NeMo") @@ -170,6 +172,17 @@ def get_args(argv): choices=['TensorRT-LLM', 'In-Framework'], help="Different options to deploy nemo model.", ) + parser.add_argument( + "-srs", + "--start_rest_service", + default="False", + type=str, + help="Starts the REST service for OpenAI API support", + ) + parser.add_argument( + "-sha", "--service_http_address", default="0.0.0.0", type=str, help="HTTP address for the REST Service" + ) + parser.add_argument("-sp", "--service_port", default=8080, type=int, help="Port for the REST Service") parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode") args = parser.parse_args(argv) return args @@ -224,6 +237,11 @@ def get_trtllm_deployable(args): "There are {0} tables and {1} task ids.".format(len(ptuning_tables_files), len(args.task_ids)) ) + if args.start_rest_service: + if args.service_port == args.triton_port: + logging.error("REST service port and 
Triton server port cannot use the same port.") + return + trt_llm_exporter = TensorRTLLM( model_dir=trt_llm_path, lora_ckpt_list=args.lora_ckpt, @@ -331,11 +349,21 @@ def nemo_deploy(argv): try: LOGGER.info("Model serving on Triton is will be started.") + if args.start_rest_service == "True": + try: + LOGGER.info("REST service will be started.") + uvicorn.run( + 'nemo.deploy.service.rest_model_api:app', + host=args.service_http_address, + port=args.service_port, + reload=True, + ) + except Exception as error: + logging.error("Error message has occurred during REST service start. Error message: " + str(error)) nm.serve() except Exception as error: LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error)) return - LOGGER.info("Model serving will be stopped.") nm.stop() From 4dc63e751033b0ce4f0c4b2967bdd2dbb0058d31 Mon Sep 17 00:00:00 2001 From: paul-gibbons <87940629+paul-gibbons@users.noreply.github.com> Date: Mon, 8 Jul 2024 09:37:46 -0700 Subject: [PATCH 08/13] Mistral + Mixtral Support for NeVa (#9459) * mistral template support Signed-off-by: paul-gibbons * get_specs neva fix Signed-off-by: paul-gibbons * mistral update Signed-off-by: paul-gibbons * fixed mistral tokenization Signed-off-by: paul-gibbons * text_gen_strategy add mistral support Signed-off-by: paul-gibbons * mistral text_gen fix Signed-off-by: paul-gibbons * Cleaning up neva config Signed-off-by: paul-gibbons * fix llama_2 default text_gen_strategy Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * fix forward() to account for new embedding optimization in MCore Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons --------- Signed-off-by: paul-gibbons Signed-off-by: paul-gibbons Co-authored-by: paul-gibbons --- .../multimodal/data/neva/conversation.py | 28 ++++++++++++-- .../multimodal/data/neva/neva_dataset.py | 34 ++++++++++++++--- .../models/multimodal_llm/neva/neva_model.py | 38 ++++++++++++++++--- nemo/collections/multimodal/parts/utils.py | 4 +- .../common/text_generation_strategy.py | 21 ++++++++++ 5 files changed, 109 insertions(+), 16 deletions(-) diff --git a/nemo/collections/multimodal/data/neva/conversation.py b/nemo/collections/multimodal/data/neva/conversation.py index 43b1977aa993..10a6c9e7283d 100644 --- a/nemo/collections/multimodal/data/neva/conversation.py +++ b/nemo/collections/multimodal/data/neva/conversation.py @@ -43,6 +43,7 @@ class SeparatorStyle(Enum): PLAIN = auto() LLAMA_2 = auto() LLAMA_3 = auto() + MISTRAL = auto() NVGPT = auto() @@ -94,11 +95,15 @@ def get_prompt(self): ret += " " else: ret += role + ":" - elif self.sep_style == SeparatorStyle.LLAMA_2: - wrap_sys = lambda msg: f"<>\n{msg}\n<>\n\n" + elif self.sep_style == SeparatorStyle.LLAMA_2 or self.sep_style == SeparatorStyle.MISTRAL: + if self.sep_style == SeparatorStyle.LLAMA_2: + wrap_sys = lambda msg: f"<>\n{msg}\n<>\n\n" + else: + wrap_sys = lambda msg: f"{msg}" + ("\n" if msg else "") wrap_inst = lambda msg: f"[INST] {msg} [/INST]" ret = "" - + if self.sep_style == SeparatorStyle.MISTRAL: + ret += DEFAULT_BOS_TOKEN for i, (role, message) in enumerate(messages): if i == 0: assert message, "first message should not be none" @@ -112,7 +117,10 @@ def get_prompt(self): message = wrap_inst(message) ret += self.sep + " " + message else: - ret += " " + message + " " + self.sep2 + if self.sep_style == SeparatorStyle.LLAMA_2: + ret += " " + message + " " + self.sep2 + else: + ret += message + self.sep2 else: ret += "" ret 
= ret.lstrip(self.sep) @@ -449,6 +457,17 @@ def dict(self): version="v1_mmtag", ) +conv_mistral = Conversation( + system="", + roles=("USER", "ASSISTANT"), + version="mistral", + messages=(), + offset=0, + sep_style=SeparatorStyle.MISTRAL, + sep="", + sep2=DEFAULT_EOS_TOKEN, +) + default_conversation = conv_vicuna_v1 conv_templates = { "default": conv_vicuna_v0, @@ -466,6 +485,7 @@ def dict(self): "nvgpt": conv_nvgpt, "nv_steerlm": conv_nvgpt, "nv_dpo": conv_nv_dpo, + "mistral": conv_mistral, } if __name__ == "__main__": diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 86d45ded54cf..7eef677e13a8 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -426,6 +426,7 @@ def preprocess_llama_2( sources: dict, tokenizer, cfg, + is_mistral: bool = False, ) -> Dict: """ Preprocesses sources for the LLaMA 2 model configuration. @@ -442,7 +443,10 @@ def preprocess_llama_2( - Dict: A dictionary containing tokenized and labeled data suitable for the LLaMA 2 model. This includes tokens, labels, and any special processing as defined in the configuration. """ - conv = conversation_lib.conv_llava_llama_2.copy() + if is_mistral: + conv = conversation_lib.conv_mistral.copy() + else: + conv = conversation_lib.conv_llava_llama_2.copy() roles = {"human": conv.roles[0], "gpt": conv.roles[1]} # Apply prompt templates @@ -477,7 +481,10 @@ def preprocess_llama_2( labels = tokens.clone().detach() # Mask labels - sep = "[/INST] " + if is_mistral: + sep = "[/INST]" + else: + sep = "[/INST] " for conversation, target in zip(conversations, labels): rounds = conversation.split(conv.sep2) cur_len = 0 @@ -492,18 +499,23 @@ def preprocess_llama_2( parts[0] += sep round_len = len(tokenizer.text_to_ids(rou + conv.sep2)) - instruction_len = len(tokenizer.text_to_ids(parts[0])) - 2 + + if is_mistral: + instruction_len = len(tokenizer.text_to_ids(parts[0])) - 1 + else: + instruction_len = len(tokenizer.text_to_ids(parts[0])) - 2 + if i > 0: round_len -= 1 # Remove extra token added by sp tokenizer else: instruction_len += 1 target[cur_len : cur_len + instruction_len] = IGNORE_INDEX - cur_len += round_len target[cur_len:] = IGNORE_INDEX # Check if masking working correctly - # print([x for x in zip(tokens[0].numpy().tolist(), labels[0].numpy().tolist())]) + # masking_test =[x for x in zip(tokens[0].numpy().tolist(), labels[0].numpy().tolist())] + # print(masking_test) if add_extra_token: tokens = tokens[:, :-1].contiguous() @@ -990,7 +1002,10 @@ def expand2square(pil_img, background_color): result.paste(pil_img, ((height - width) // 2, 0)) return result - frames = expand2square(frames, tuple(int(x * 255) for x in self.processor.image_mean)) + frames = [ + expand2square(frame, tuple(int(x * 255) for x in self.processor.image_mean)) + for frame in frames + ] frames = self.processor.preprocess(frames, return_tensors='pt')['pixel_values'] else: frames = self.processor.preprocess(frames, return_tensors='pt')['pixel_values'] @@ -1057,6 +1072,13 @@ def expand2square(pil_img, background_color): self.tokenizer, self.multimodal_cfg, ) + elif self.conv_template == "mistral": + data_dict = preprocess_llama_2( + sources, + self.tokenizer, + self.multimodal_cfg, + is_mistral=True, + ) elif self.conv_template == "plain": data_dict = preprocess_plain( sources, diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py 
b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index cce40da45725..376237e89ecc 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -75,7 +75,7 @@ HAVE_APEX = False try: - from megatron.core import InferenceParams, dist_checkpointing, parallel_state + from megatron.core import InferenceParams, dist_checkpointing, parallel_state, tensor_parallel from megatron.core.models.gpt import GPTModel as MCoreGPTModel from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint @@ -154,10 +154,34 @@ def set_media(self, media): self.media = media def forward(self, input_ids, **kwargs): - media = self.media # avoid change the signature of embedding forward function + media = self.media # avoid changing the signature of embedding forward function + + # TODO: Refactor replace_media_embedding to account for MCore's embedding communication optimization + # https://github.com/NVIDIA/Megatron-LM/commit/ee423e7 changes the way we handle embeddings with sequence parallelism + # When using reduce_scatter_embeddings, word_embedding_tensor is now in the following shape: [sequence/tp, batch_size, hidden_size] + # replace_media_embedding currently expects [batch_size, sequence, hidden_size] + + # Check if reduce_scatter_embeddings is enabled in the embedding forward function + apply_reduce_scatter = getattr(self, 'reduce_scatter_embeddings', False) + + # Set reduce_scatter_embeddings to false to keep words_embedding's + # tensor dimesion the same for replace_media_embedding + if apply_reduce_scatter: + self.reduce_scatter_embeddings = False + words_embeddings = super().forward(input_ids, **kwargs) + words_embeddings = self.replace_media_embeddings(input_ids, words_embeddings, media) - return self.replace_media_embeddings(input_ids, words_embeddings, media) + # Scatter embeddings back to each TP rank if reduce_scatter_embeddings is enabled + if apply_reduce_scatter: + words_embeddings = self._apply_reduce_scatter(words_embeddings) + self.reduce_scatter_embeddings = True + + return words_embeddings + + def _apply_reduce_scatter(self, embeddings): + embeddings = embeddings.transpose(0, 1).contiguous() + return tensor_parallel.mappings.scatter_to_sequence_parallel_region(embeddings) def encode_vision_x(self, vision_x: torch.Tensor): """ @@ -193,7 +217,6 @@ def encode_vision_x(self, vision_x: torch.Tensor): def replace_media_embeddings(self, input_ids, inputs_embeds, media): if media is None: return inputs_embeds - batch_size, sequence_length, hidden_size = inputs_embeds.shape # calculate media features without gradients @@ -550,7 +573,12 @@ def dummy(): media_end_id=media_end_id, mcore_gpt=self.mcore_gpt, config=self.transformer_config, - transformer_layer_spec=get_specs(self.spec_name), + transformer_layer_spec=get_specs( + self.spec_name, + self.transformer_config.num_moe_experts, + self.transformer_config.moe_grouped_gemm, + self.transformer_engine, + ), vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), pre_process=pre_process, diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py index b6dee33d24f3..7eb72b38d0f0 100644 --- a/nemo/collections/multimodal/parts/utils.py +++ b/nemo/collections/multimodal/parts/utils.py @@ -135,8 +135,10 @@ def 
load_nemo_model_weights(nemo_path, sharded_state_dict=None): # distributed checkpointing if state_dict is None and sharded_state_dict is not None: + is_dist_ckpt = True checkpoint = dict(state_dict=sharded_state_dict) + tmp_model_weights_ckpt = os.path.join(tmpdir, save_restore_connector.model_weights_ckpt) tmp_model_weights_dir = os.path.splitext(tmp_model_weights_ckpt)[0] assert os.path.isdir(tmp_model_weights_dir), f'Expected {tmp_model_weights_dir} to be a directory.' @@ -501,7 +503,7 @@ def expand2square(pil_img, background_color): result.paste(pil_img, ((height - width) // 2, 0)) return result - frames = expand2square(frames, tuple(int(x * 255) for x in processor.image_mean)) + frames = [expand2square(frame, tuple(int(x * 255) for x in self.processor.image_mean)) for frame in frames] frames = processor.preprocess(frames, return_tensors='pt')['pixel_values'] else: frames = processor.preprocess(frames, return_tensors='pt')['pixel_values'] diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index f51d53ba5944..8f8fe313a5e3 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -508,6 +508,27 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents ) # HARDCODED FOR NOW data_dict = preprocess_llama_3(sources, tokenizer, multimodal_cfg) + elif multimodal_cfg["conv_template"] == "mistral": + record = { + 'conversations': [ + { + 'from': 'human', + 'value': prompt, + }, + { + 'from': 'gpt', + 'value': '', + }, + ], + } + for turn in record['conversations']: + if turn.get('value') is not None: + turn['value'] = re.sub('', f'{DEFAULT_IMAGE_TOKEN}\n', turn['value']) + list_data_dict.append(record) + sources = preprocess_multimodal( + copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents + ) # HARDCODED FOR NOW + data_dict = preprocess_llama_2(sources, tokenizer, multimodal_cfg, is_mistral=True) elif multimodal_cfg["conv_template"] == "v1": record = { 'conversations': [ From 38af139d8f2d3377201815d743c3c0daa05748b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 8 Jul 2024 18:49:09 +0200 Subject: [PATCH 09/13] ci: Timeout per step, not job (#9635) Signed-off-by: Oliver Koenig --- .github/workflows/_test_template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 5956a23bdd67..0dbb1d50ee52 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -36,7 +36,6 @@ on: jobs: main: runs-on: ${{ inputs.RUNNER }} - timeout-minutes: ${{ inputs.TIMEOUT }} outputs: conclusion: ${{ steps.main.conclusion }} log: ${{ steps.main.outputs.log }} @@ -54,6 +53,7 @@ jobs: uses: actions/checkout@v4 - id: main name: Run main script + timeout-minutes: ${{ inputs.TIMEOUT }} run: | set +e ( From aa397d7677b164abbd6138b8980b3d5019b399f7 Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Date: Mon, 8 Jul 2024 09:52:21 -0700 Subject: [PATCH 10/13] Adding support for mcore generate (#9566) * Adding support for mcore generate * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * adding support * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * adding support --------- Signed-off-by: 
shanmugamr1992 Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: shanmugamr Co-authored-by: shanmugamr1992 --- .../conf/megatron_gpt_inference.yaml | 1 + .../megatron_gpt_inference_batch_mcore.yaml | 29 +++ .../language_modeling/megatron_gpt_eval.py | 3 + .../megatron_gpt_mcore_batch_eval.py | 222 ++++++++++++++++++ 4 files changed, 255 insertions(+) create mode 100644 examples/nlp/language_modeling/conf/megatron_gpt_inference_batch_mcore.yaml create mode 100644 examples/nlp/language_modeling/megatron_gpt_mcore_batch_eval.py diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml index ce8311daf95c..056f9eb9c6ec 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml @@ -1,3 +1,4 @@ +# NOTE : This config and megatron_gpt_eval.py will be deprecated soon. Use megatron_gpt_inference_batch_mcore.yaml inference: greedy: False # Whether or not to use sampling ; use greedy decoding otherwise top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_inference_batch_mcore.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_inference_batch_mcore.yaml new file mode 100644 index 000000000000..1b34a8b5abc3 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_gpt_inference_batch_mcore.yaml @@ -0,0 +1,29 @@ +common_inference_params: + top_k: 1 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.0 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + tokens_to_generate: 30 # The minimum length of the sequence to be generated. + return_log_probs: False # whether return the log prob for the sampled tokens + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: False # logger provided by exp_manager + precision: 16 # 16, 32, or bf16 + use_distributed_sampler: False + +tensor_model_parallel_size: -1 +pipeline_model_parallel_size: -1 +inference_batch_times_seq_len_threshold: 1000 # If batch_size * sequence-length is smaller than this threshold we will not use pipelining, otherwise we will. +max_batch_size: 4 # Input prompts are batched using max_batch_size and sent to inference + +megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory +gpt_model_file: null # GPT nemo file path +checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the GPT training +checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading +hparams_file: null # model configuration file, only used for PTL checkpoint loading +prompts: # prompts for GPT inference + - "Q: How are you?" + - "Q: How big is the universe?" 
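For reference, a minimal single-GPU invocation of the new batch-inference entry point could look like the sketch below. The `.nemo` path is a placeholder and the parallel sizes are assumptions; every override key comes from the config above and the script added in this commit.

    # hypothetical run; /path/to/model.nemo is a placeholder checkpoint path
    python examples/nlp/language_modeling/megatron_gpt_mcore_batch_eval.py \
        gpt_model_file=/path/to/model.nemo \
        trainer.devices=1 \
        trainer.num_nodes=1 \
        tensor_model_parallel_size=1 \
        pipeline_model_parallel_size=1 \
        common_inference_params.tokens_to_generate=64 \
        'prompts=["Q: How are you?", "Q: How big is the universe?"]'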
diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index 362a2ae3e298..b9b0d2973094 100644 --- a/examples/nlp/language_modeling/megatron_gpt_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_eval.py @@ -31,6 +31,7 @@ from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy, NLPSaveRestoreConnector from nemo.core.config import hydra_runner +from nemo.utils import logging from nemo.utils.app_state import AppState from nemo.utils.model_utils import inject_model_parallel_rank @@ -168,6 +169,7 @@ def remove_padded_prompts(response, nb_paddings): def load_model_from_config(trainer, cfg): + if cfg.gpt_model_file is not None: if ( cfg.tensor_model_parallel_size < 0 @@ -306,6 +308,7 @@ def round_to_mult(n, mult=8): def main(cfg) -> None: callbacks = [] + logging.warning("This file will be depreacted soon. Use the megatron_gpt_mcore_batch_eval.py file instead.") # enable_progress_bar is True by default. If cfg.trainer.enable_progress_bar=False, CustomProgressBar is not appended to callbacks if 'enable_progress_bar' not in cfg.trainer or cfg.trainer.enable_progress_bar: callbacks.append(CustomProgressBar()) diff --git a/examples/nlp/language_modeling/megatron_gpt_mcore_batch_eval.py b/examples/nlp/language_modeling/megatron_gpt_mcore_batch_eval.py new file mode 100644 index 000000000000..988a5f8588ff --- /dev/null +++ b/examples/nlp/language_modeling/megatron_gpt_mcore_batch_eval.py @@ -0,0 +1,222 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import os +from argparse import Namespace + +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.engines.mcore_engine import MCoreEngine +from megatron.core.inference.inference_model_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper +from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( + SimpleTextGenerationController, +) +from omegaconf import OmegaConf, open_dict +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel +from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.core.config import hydra_runner +from nemo.utils.app_state import AppState +from nemo.utils.model_utils import inject_model_parallel_rank + +""" +This is the script to run GPT text generation in batch mode using Megatron Core Generate function. +""" + + +@hydra_runner(config_path="conf", config_name="megatron_gpt_inference_batch_mcore") +def main(cfg) -> None: + callbacks = [] + # enable_progress_bar is True by default. 
If cfg.trainer.enable_progress_bar=False, CustomProgressBar is not appended to callbacks + if 'enable_progress_bar' not in cfg.trainer or cfg.trainer.enable_progress_bar: + callbacks.append(CustomProgressBar()) + # trainer required for restoring model parallel models + trainer = Trainer( + strategy=NLPDDPStrategy(timeout=datetime.timedelta(seconds=18000)), + **cfg.trainer, + callbacks=callbacks, + ) + + if cfg.gpt_model_file is not None: + if ( + cfg.tensor_model_parallel_size < 0 + or cfg.pipeline_model_parallel_size < 0 + or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 + ): + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(cfg.gpt_model_file): + save_restore_connector.model_extracted_dir = cfg.gpt_model_file + model_config = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, + trainer=trainer, + return_config=True, + save_restore_connector=save_restore_connector, + ) + + # with dist checkpointing we don't need to set this + if not model_config.get('mcore_gpt', False): + with open_dict(cfg): + cfg.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) + cfg.pipeline_model_parallel_size = model_config.get('pipeline_model_parallel_size', 1) + cfg.pipeline_model_parallel_split_rank = model_config.get('pipeline_model_parallel_split_rank', 0) + + assert ( + cfg.trainer.devices * cfg.trainer.num_nodes + == cfg.tensor_model_parallel_size + * cfg.pipeline_model_parallel_size + * max(1, cfg.get('expert_model_parallel_size', 1)) + ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" + + if cfg.gpt_model_file: + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(cfg.gpt_model_file): + save_restore_connector.model_extracted_dir = cfg.gpt_model_file + + pretrained_cfg = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, + trainer=trainer, + return_config=True, + save_restore_connector=save_restore_connector, + ) + OmegaConf.set_struct(pretrained_cfg, True) + with open_dict(pretrained_cfg): + pretrained_cfg.sequence_parallel = False + pretrained_cfg.activations_checkpoint_granularity = None + pretrained_cfg.activations_checkpoint_method = None + pretrained_cfg.precision = trainer.precision + pretrained_cfg["use_flash_attention"] = cfg.get("use_flash_attention", False) + pretrained_cfg["apply_rope_fusion"] = False + if pretrained_cfg.get('mcore_gpt', False): + # with dist checkpointing we can use the model parallel config specified by the user + pretrained_cfg.tensor_model_parallel_size = cfg.tensor_model_parallel_size + pretrained_cfg.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size + pretrained_cfg.expert_model_parallel_size = cfg.get('expert_model_parallel_size', 1) + pretrained_cfg.micro_batch_size = 1 + if trainer.precision == "16": + pretrained_cfg.megatron_amp_O2 = False + elif trainer.precision in ['bf16', 'bf16-mixed'] and cfg.get('megatron_amp_O2', False): + pretrained_cfg.megatron_amp_O2 = True + model = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, + trainer=trainer, + override_config_path=pretrained_cfg, + save_restore_connector=save_restore_connector, + map_location=f'cuda:{trainer.local_rank}', # map_location is needed for converted models + ) + elif cfg.checkpoint_dir: + app_state = AppState() + if ( + cfg.tensor_model_parallel_size > 1 + or cfg.pipeline_model_parallel_size > 1 + or cfg.get('expert_model_parallel_size', 1) > 1 + ): + app_state.model_parallel_size = ( + cfg.tensor_model_parallel_size + * 
cfg.pipeline_model_parallel_size + * cfg.get('expert_model_parallel_size', 1) + ) + app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size + app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size + app_state.expert_model_parallel_size = cfg.get('expert_model_parallel_size', 1) + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.expert_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + world_size=app_state.model_parallel_size, + rank=trainer.global_rank, + tensor_model_parallel_size_=cfg.tensor_model_parallel_size, + pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, + pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, + expert_model_parallel_size_=cfg.get('expert_model_parallel_size', 1), + ) + checkpoint_path = os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name) + # checkpoint_path is a dir in case of distributed checkpointing + if not os.path.isdir(checkpoint_path): + # legacy checkpoint needs model parallel rank injection + checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)) + model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer) + else: + raise ValueError("need at least a nemo file or checkpoint dir") + + model.freeze() + + # Have to turn off activations_checkpoint_method for inference + try: + model.model.language_model.encoder.activations_checkpoint_method = None + except AttributeError: + pass + + args = Namespace + args.inference_batch_times_seq_len_threshold = cfg.inference_batch_times_seq_len_threshold + args.padded_vocab_size = model.padded_vocab_size + args.fp32_residual_connection = model.cfg.fp32_residual_connection + args.hidden_size = model.cfg.hidden_size + args.params_dtype = model.cfg.precision + args.max_batch_size = cfg.max_batch_size + + # We need this wrapper since mcore generate uses tokenizer.detokenize, tokenizer.tokenize to encode and decode prompts + class MCoreTokenizerWrappper: + def __init__(self, tokenizer): + self.tokenizer = tokenizer + self.eod = tokenizer.eod + self.vocab_size = tokenizer.vocab_size + + def detokenize(self, tokens): + return self.tokenizer.ids_to_text(tokens) + + def tokenize(self, prompt): + return self.tokenizer.text_to_ids(prompt) + + tokenizer = MCoreTokenizerWrappper(model.tokenizer) + + inference_wrapped_model = GPTInferenceWrapper(model.model, args) + text_generation_controller = SimpleTextGenerationController( + inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer + ) + mcore_engine = MCoreEngine( + text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size + ) + + common_inference_params = CommonInferenceParams( + temperature=cfg.common_inference_params.temperature, + top_k=cfg.common_inference_params.top_k, + top_p=cfg.common_inference_params.top_p, + return_log_probs=cfg.common_inference_params.return_log_probs, + num_tokens_to_generate=cfg.common_inference_params.tokens_to_generate, + ) + + results = mcore_engine.generate( + prompts=OmegaConf.to_container(cfg.prompts), common_inference_params=common_inference_params + ) + + for idx, result in enumerate(results): + print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ') + result = { + 'id': result.request_id, + 'input_prompt': 
result.prompt, + 'generated_text': result.generated_text, + 'generated_tokens': result.generated_tokens, + } + print(result) + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter From 66c960ebdec9d22f40a7d43e9b2d38dc4a34ad25 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Mon, 8 Jul 2024 13:31:24 -0400 Subject: [PATCH 11/13] Improve error messaging during trt-llm export (#9638) * fix minor import bug Signed-off-by: Onur Yilmaz * Raise error when number of query groups cannot be splitted by the tps Signed-off-by: Onur Yilmaz * moved the error message to the utils Signed-off-by: Onur Yilmaz --------- Signed-off-by: Onur Yilmaz --- nemo/export/trt_llm/converter/utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/nemo/export/trt_llm/converter/utils.py b/nemo/export/trt_llm/converter/utils.py index a4365a281b49..3768ff4b2844 100644 --- a/nemo/export/trt_llm/converter/utils.py +++ b/nemo/export/trt_llm/converter/utils.py @@ -388,6 +388,16 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t # Split the QKV to separate variables. qkv = np.split(val, [q_num, q_num + 1], axis=2) + + query_groups_shape = qkv[0].shape + if len(query_groups_shape) > 1: + if (query_groups_shape[1] % split_factor) != 0: + raise Exception( + "Number of query groups of the models is {0}. Please select tensor parallelism size " + "that can split the number of query groups to equal number of query matrices in the " + "each GPU.".format(query_groups_shape[1]) + ) + q_split = np.split(qkv[0], split_factor, axis=1) k_split = np.split(qkv[1], split_factor, axis=1) v_split = np.split(qkv[2], split_factor, axis=1) From f79074146563a38ae2a54a5358b8002c66d6499a Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 8 Jul 2024 19:37:20 -0400 Subject: [PATCH 12/13] support lora when kv_channel != hidden_size / num_heads (#9644) Co-authored-by: Ao Tang --- nemo/collections/nlp/parts/peft_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/parts/peft_config.py b/nemo/collections/nlp/parts/peft_config.py index 50c97e349885..726ca33611d7 100644 --- a/nemo/collections/nlp/parts/peft_config.py +++ b/nemo/collections/nlp/parts/peft_config.py @@ -170,7 +170,7 @@ def __init__(self, cfg): elif module == PEFT_MODULE_MAP["dense_module"]: adapter_cfg = self._create_lora_config( - cfg, lora_cfg, cfg.hidden_size, cfg.hidden_size, LoraDenseAttentionAdapterConfig + cfg, lora_cfg, projection_size, cfg.hidden_size, LoraDenseAttentionAdapterConfig ) name_key_to_cfg[AdapterName.LORA_DENSE_ATTENTION_ADAPTER] = adapter_cfg name_key_to_mcore_mixins[AdapterName.LORA_DENSE_ATTENTION_ADAPTER] = [ From f9c3a8b3a3165c365cc34a6e9d9820414fdb9935 Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Tue, 9 Jul 2024 06:16:04 +0200 Subject: [PATCH 13/13] [NeMo-UX] Fix when optimizers are setup for PEFT (#9619) * Fix when optimizers are setup for PEFT * Apply isort and black reformatting Signed-off-by: marcromeyn * Init DDP inside PEFT * Apply isort and black reformatting Signed-off-by: marcromeyn * Some fixes, loss seems to become nan with peft for some reason * Apply isort and black reformatting Signed-off-by: marcromeyn * Loss goes down on fp32 * Apply isort and black reformatting Signed-off-by: marcromeyn * Simplifying FNMixin * Apply isort and black reformatting Signed-off-by: marcromeyn * Fix bug with new checkpoint-io * Apply isort and black reformatting Signed-off-by: marcromeyn * Fix failing 
test: test_peft_on_train_epoch_start_with_adapter * Apply isort and black reformatting Signed-off-by: marcromeyn --------- Signed-off-by: marcromeyn Co-authored-by: marcromeyn Co-authored-by: Chen Cui --- nemo/collections/llm/api.py | 2 +- nemo/collections/llm/fn/mixin.py | 17 +- nemo/lightning/_strategy_lib.py | 3 + nemo/lightning/io/connector.py | 8 +- nemo/lightning/megatron_parallel.py | 159 +++++++++++------- .../pytorch/callbacks/model_transform.py | 5 +- nemo/lightning/pytorch/callbacks/peft.py | 18 +- nemo/lightning/pytorch/optim/lr_scheduler.py | 1 - .../pytorch/plugins/mixed_precision.py | 6 +- nemo/lightning/pytorch/strategies.py | 40 +++-- .../lightning/pytorch/callbacks/test_peft.py | 18 +- 11 files changed, 177 insertions(+), 100 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 5c9703497597..0bb8f5fa46af 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -279,7 +279,7 @@ def _setup( model_transform: Optional[Union[PEFT, ModelTransform, Callable]], ) -> Any: # Return type is Any because app_state's type is not specified _log = log or NeMoLogger() - if resume and resume.adapter_path and _log.ckpt: + if resume and isinstance(model_transform, PEFT) and _log.ckpt: logging.info("Disabling try_restore_best_ckpt restoration for adapters") _log.ckpt.try_restore_best_ckpt = False diff --git a/nemo/collections/llm/fn/mixin.py b/nemo/collections/llm/fn/mixin.py index b32f66366bfb..c566c6e9d392 100644 --- a/nemo/collections/llm/fn/mixin.py +++ b/nemo/collections/llm/fn/mixin.py @@ -2,6 +2,7 @@ from typing_extensions import Self from nemo.collections.llm.fn import base as fn +from nemo.utils import logging class FNMixin: @@ -114,8 +115,12 @@ def freeze(self) -> None: """ assert isinstance(self, nn.Module), "self is not a nn.Module" - for param in self.parameters(): - param.requires_grad = False + params = list(self.parameters()) + if not params: + logging.info(f"No parameters found in module {self.__class__.__name__}") + else: + for param in params: + param.requires_grad = False def unfreeze(self) -> None: """ @@ -124,5 +129,9 @@ def unfreeze(self) -> None: """ assert isinstance(self, nn.Module), "self is not a nn.Module" - for param in self.parameters(): - param.requires_grad = True + params = list(self.parameters()) + if not params: + logging.info(f"No parameters found in module {self.__class__.__name__}") + else: + for param in params: + param.requires_grad = True diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py index e6452de16512..3bd62ddce24a 100644 --- a/nemo/lightning/_strategy_lib.py +++ b/nemo/lightning/_strategy_lib.py @@ -515,4 +515,7 @@ def load_model_state_dict(megatron_parallel, checkpoint: Mapping[str, Any], stri elif count > n_nesting: to_remove = "module." * (count - n_nesting) _state_dict[key[len(to_remove) :]] = value + else: + _state_dict[key] = value + module.load_state_dict(_state_dict, strict=strict) diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 500d0203cfd4..8be630f163e0 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -160,12 +160,8 @@ def nemo_save(self, output_path: Path, trainer: pl.Trainer) -> None: output_path (Path): The path where the model checkpoint will be saved. trainer (pl.Trainer): The trainer with the strategy to save the model. 
""" - _setup_kwargs = {} - setup_signature = inspect.signature(trainer.strategy.setup) - if 'setup_optimizers' in setup_signature.parameters: - _setup_kwargs["setup_optimizers"] = False - - trainer.strategy.setup(trainer, **_setup_kwargs) + trainer.strategy._setup_optimizers = False + trainer.strategy.setup(trainer) trainer.save_checkpoint(output_path) def nemo_load( diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 2f2308717004..ee41455544bb 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -12,7 +12,6 @@ Iterable, Iterator, List, - Mapping, Optional, Protocol, Sequence, @@ -129,7 +128,6 @@ def __init__( cpu: bool = False, convert_module_fn: Optional[Callable[[ModelT], nn.Module]] = None, ) -> None: - from apex.transformer.tensor_parallel.layers import set_defaults_if_not_set_tensor_model_parallel_attributes from megatron.core import parallel_state _pipeline: List[nn.Module] @@ -152,67 +150,15 @@ def __init__( _model.configure_model() _pipeline.append(_model) - if convert_module_fn: - for i in range(len(_pipeline)): - _pipeline[i] = convert_module_fn(_pipeline[i]) - - if isinstance(ddp_config, DistributedDataParallelConfig): - for model_chunk_idx, model_chunk in enumerate(_pipeline): - module = model_chunk.module - - ddp = DDP( - module.config, - ddp_config, - module, - data_parallel_group=parallel_state.get_data_parallel_group(with_context_parallel=True), - expert_data_parallel_group=parallel_state.get_data_modulo_expert_parallel_group(), - # Turn off bucketing for model_chunk 2 onwards, since communication for these - # model chunks is overlapped with compute anyway. - disable_bucketing=(model_chunk_idx > 0), - ) - model_chunk.module = ddp - model_chunk.buffers = ddp.buffers # We need to do this explicitly since this is a attr pytorch uses - model_chunk.__class__.__getattr__ = getattr_proxy # type: ignore - - # param_sync_func is set in nemo.lightning.pytorch.optim.megatron - no_sync_func, grad_sync_func = extract_ddp_funcs(ddp_config, _pipeline) - for module in _pipeline: - module.config.no_sync_func = no_sync_func - module.config.grad_sync_func = grad_sync_func - - for i, model_module in enumerate(_pipeline): - if not cpu: - model_module.cuda(torch.cuda.current_device()) - - for param in model_module.parameters(): - set_defaults_if_not_set_tensor_model_parallel_attributes(param) - - if hasattr(model_module, "configure_model"): - if not hasattr(model_module, "set_input_tensor"): - if hasattr(model_module.module, "set_input_tensor"): - model_module.set_input_tensor = model_module.module.set_input_tensor - else: - # TODO: What to do here? - pass - - # Print number of parameters. 
- if parallel_state.model_parallel_is_initialized() and parallel_state.get_data_parallel_rank() == 0: - from nemo.utils import logging - - msg = ( - f" > number of parameters on (tensor, pipeline) model parallel rank " - f"({parallel_state.get_tensor_model_parallel_rank()}, {parallel_state.get_pipeline_model_parallel_rank()}): " - f"{_calc_number_of_params(_pipeline)}" - ) - logging.info(msg) - super().__init__(_pipeline) self.precision_plugin = precision_plugin + self._cpu = cpu self.callbacks = callbacks or CallbackConnector() self.data_step = data_step or default_data_step self.forward_step = forward_step or default_forward_step self.loss_reduction: MegatronLossReduction = loss_reduction self.ddp_config = ddp_config + self.convert_module_fn = convert_module_fn def forward( self, @@ -475,6 +421,82 @@ def infer_num_microbatches(self, data: Union[DataT, Iterator[DataT], List[Iterat raise ValueError("Cannot infer `num_microbatches` from data, please specify it manually") + def init_model_parallel(self): + from apex.transformer.tensor_parallel.layers import set_defaults_if_not_set_tensor_model_parallel_attributes + from megatron.core import parallel_state + + for model_module in self: + if not self._cpu: + model_module.cuda(torch.cuda.current_device()) + + for param in model_module.parameters(): + set_defaults_if_not_set_tensor_model_parallel_attributes(param) + + if hasattr(model_module, "configure_model"): + if not hasattr(model_module, "set_input_tensor"): + if hasattr(model_module.module, "set_input_tensor"): + model_module.set_input_tensor = model_module.module.set_input_tensor + else: + # TODO: What to do here? + pass + + # Print number of parameters. + if parallel_state.model_parallel_is_initialized() and parallel_state.get_data_parallel_rank() == 0: + from nemo.utils import logging + + num_params = _calc_number_of_params(list(self)) + num_trainable_params = _calc_number_of_trainable_params(list(self)) + + msg = ( + f" > number of parameters on (tensor, pipeline) model parallel rank " + f"({parallel_state.get_tensor_model_parallel_rank()}, {parallel_state.get_pipeline_model_parallel_rank()}): " + f"{num_params}" + ) + logging.info(msg) + + if num_params != num_trainable_params: + logging.info( + f" > number of trainable parameters: {num_trainable_params} ({num_trainable_params / num_params:.2%} of total)" + ) + + if self.convert_module_fn: + self.apply_convert_module_fn() + + self.init_ddp() + + def apply_convert_module_fn(self): + for i in range(len(self)): + self[i] = self.convert_module_fn(self[i]) + + def init_ddp(self): + if not isinstance(self.ddp_config, DistributedDataParallelConfig): + return + + from megatron.core import parallel_state + + for model_chunk_idx, model_chunk in enumerate(self): + module = model_chunk.module + + ddp = DDP( + module.config, + self.ddp_config, + module, + data_parallel_group=parallel_state.get_data_parallel_group(with_context_parallel=True), + expert_data_parallel_group=parallel_state.get_data_modulo_expert_parallel_group(), + # Turn off bucketing for model_chunk 2 onwards, since communication for these + # model chunks is overlapped with compute anyway. 
+ disable_bucketing=(model_chunk_idx > 0), + ) + model_chunk.module = ddp + model_chunk.buffers = ddp.buffers # We need to do this explicitly since this is a attr pytorch uses + model_chunk.__class__.__getattr__ = getattr_proxy # type: ignore + + # param_sync_func is set in nemo.lightning.pytorch.optim.megatron + no_sync_func, grad_sync_func = extract_ddp_funcs(self.ddp_config, self) + for module in self: + module.config.no_sync_func = no_sync_func + module.config.grad_sync_func = grad_sync_func + def _build_context(self, context: Dict[str, Any]) -> Dict[str, Any]: if "self" in context: del context["self"] @@ -565,18 +587,21 @@ def forward_backward_func(self) -> "MegatronStepProtocol": @override def __getattr__(self, item: Any) -> Any: - if len(self) == 0: - return super().__getattr__(item) - try: - # __getattr__ gets called as a last resort if the attribute does not exist - # call nn.Module's implementation first + # First, try to get the attribute from the superclass (nn.ModuleList) return super().__getattr__(item) except AttributeError: - # If the attribute is not available on the _FabricModule wrapper, redirect to the wrapped nn.Module - attr = getattr(self._modules[self._get_abs_string_index(0)], item) + # If not found in superclass, check if we have any modules + if len(self) == 0: + raise AttributeError( + f"'{self.__class__.__name__}' object has no attribute '{item}' and contains no modules" + ) - return attr + # Try to get it from the first module + try: + return getattr(self._modules[self._get_abs_string_index(0)], item) + except AttributeError: + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{item}'") class _ModuleStepFunction: @@ -915,6 +940,12 @@ def _calc_number_of_params(model: List[nn.Module]) -> int: return sum([sum([p.nelement() for p in model_module.parameters()]) for model_module in model]) +def _calc_number_of_trainable_params(model: List[nn.Module]) -> int: + assert isinstance(model, list) + + return sum([sum([p.numel() for p in model_module.parameters() if p.requires_grad]) for model_module in model]) + + def is_list_of_iterators(var) -> bool: if not isinstance(var, list): return False diff --git a/nemo/lightning/pytorch/callbacks/model_transform.py b/nemo/lightning/pytorch/callbacks/model_transform.py index 68b3db16f473..512324940133 100644 --- a/nemo/lightning/pytorch/callbacks/model_transform.py +++ b/nemo/lightning/pytorch/callbacks/model_transform.py @@ -65,7 +65,10 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo def _maybe_apply_transform(self, trainer): if self._needs_to_call: - self.model_transform(trainer.model) + self.apply_transform(trainer) + + def apply_transform(self, trainer): + self.model_transform(trainer.model) @property def _needs_to_call(self) -> bool: diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index 26325bf549d0..f8fa76110288 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -84,19 +84,27 @@ def __call__(self, model: nn.Module) -> nn.Module: def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) -> None: super().setup(trainer, pl_module, stage=stage) + trainer.strategy.trainer = trainer self.wrapped_io = WrappedAdapterIO(trainer.strategy.checkpoint_io) trainer.strategy._checkpoint_io = self.wrapped_io + trainer.strategy._init_model_parallel = False + trainer.strategy._setup_optimizers = False - def on_train_epoch_start(self, trainer: "pl.Trainer", 
pl_module: "pl.LightningModule") -> None: - needs_to_call = self._needs_to_call - self._maybe_apply_transform(trainer) + def apply_transform(self, trainer): + super().apply_transform(trainer) - # Check if we need to load the adapters - if needs_to_call and self.wrapped_io.adapter_ckpt_path is not None: + if self.wrapped_io.adapter_ckpt_path is not None: logging.info(f"Loading adapters from {self.wrapped_io.adapter_ckpt_path}") adapter_state = self.wrapped_io.load_checkpoint(self.wrapped_io.adapter_ckpt_path) trainer.strategy.load_model_state_dict(adapter_state, strict=False) + if hasattr(trainer.strategy, "init_model_parallel"): + logging.info("Initializing model parallel") + trainer.strategy.init_model_parallel() + + logging.info("Setting up optimizers") + trainer.strategy.setup_optimizers(trainer) + def on_load_checkpoint( self, trainer: pl.Trainer, pl_module: pl.LightningModule, checkpoint: Dict[str, Any] ) -> None: diff --git a/nemo/lightning/pytorch/optim/lr_scheduler.py b/nemo/lightning/pytorch/optim/lr_scheduler.py index 298a6e7a7f45..9374328190a6 100644 --- a/nemo/lightning/pytorch/optim/lr_scheduler.py +++ b/nemo/lightning/pytorch/optim/lr_scheduler.py @@ -445,7 +445,6 @@ def scheduler(self, model, optimizer): return { "optimizer": optimizer, - "scheduler": lr_scheduler, "lr_scheduler": { # REQUIRED: The scheduler instance "scheduler": lr_scheduler, diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py index 751141d8111b..5e43e09c0420 100644 --- a/nemo/lightning/pytorch/plugins/mixed_precision.py +++ b/nemo/lightning/pytorch/plugins/mixed_precision.py @@ -61,7 +61,6 @@ def convert_module(self, module: Module) -> Module: This is optional and depends on the precision limitations during optimization. 
""" - from megatron.core.distributed import DistributedDataParallel from megatron.core.transformer.module import Float16Module from megatron.core.utils import get_model_config @@ -69,7 +68,10 @@ def convert_module(self, module: Module) -> Module: config = get_model_config(module.module) config.fp16 = self.precision == "16-mixed" config.bf16 = self.precision == "bf16-mixed" - if not isinstance(module.module, Float16Module): + if isinstance(module.module, Float16Module): + new_float16_module = Float16Module(config, module.module.module) + module.module = new_float16_module + else: module.module = Float16Module(config, module.module) return module diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index 6a84319b4fa2..d0e502839f2f 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -110,6 +110,8 @@ def __init__( ckpt_parallel_save=True, ckpt_parallel_load=False, ckpt_parallel_save_optim=True, + setup_optimizers: bool = True, + init_model_parallel: bool = True, **kwargs, ) -> None: super().__init__( @@ -132,6 +134,8 @@ def __init__( self.lazy_init = lazy_init self.ckpt_include_optimizer = ckpt_include_optimizer self.pipeline_dtype = pipeline_dtype + self._setup_optimizers = setup_optimizers + self._init_model_parallel = init_model_parallel self.log_train_loss = bool(int(os.getenv("NEMO_LOG_TRAIN_LOSS", 1))) self.log_memory_usage = bool(int(os.getenv("NEMO_LOG_MEMORY_USAGE", 0))) @@ -144,7 +148,7 @@ def __init__( self._ddp = ddp if ddp == "megatron": - self.ddp_config = DistributedDataParallelConfig() + self.ddp_config = DistributedDataParallelConfig(check_for_nan_in_grad=True) elif isinstance(ddp, DistributedDataParallelConfig): self.ddp_config = ddp elif ddp == "pytorch": @@ -180,7 +184,7 @@ def connect(self, model: pl.LightningModule) -> None: ddp_config.use_distributed_optimizer = mcore_opt_config.use_distributed_optimizer @override - def setup(self, trainer: pl.Trainer, setup_optimizers: bool = True) -> None: + def setup(self, trainer: pl.Trainer) -> None: assert self.accelerator is not None self.accelerator.setup(trainer) self.trainer = trainer @@ -204,7 +208,7 @@ def setup(self, trainer: pl.Trainer, setup_optimizers: bool = True) -> None: self.data_sampler.connect(trainer) self._fix_progress_bar(trainer) - self.setup_megatron_parallel(trainer, setup_optimizers=setup_optimizers) + self.setup_megatron_parallel(trainer) self.setup_precision_plugin() if getattr(self.lightning_module, "model_transform", None): @@ -271,7 +275,7 @@ def process_dataloader(self, dataloader: DataLoader) -> DataLoader: return dataloader - def setup_megatron_parallel(self, trainer: pl.Trainer, setup_optimizers: bool = True) -> None: + def setup_megatron_parallel(self, trainer: pl.Trainer) -> None: assert self.model is not None, "Model is not set" convert_module_fn = None @@ -286,6 +290,10 @@ def setup_megatron_parallel(self, trainer: pl.Trainer, setup_optimizers: bool = ddp_config=self.ddp_config, convert_module_fn=convert_module_fn, ) + + if self._init_model_parallel: + self.init_model_parallel() + self.megatron_parallel.trainer = trainer # check signature-def of self.model.configure_optimizers to check if there's an optional arg: megatron_parallel @@ -295,18 +303,9 @@ def setup_megatron_parallel(self, trainer: pl.Trainer, setup_optimizers: bool = self.model.configure_optimizers, megatron_parallel=self.megatron_parallel ) - if setup_optimizers: + if self._setup_optimizers: self.setup_optimizers(trainer) - # TODO: Throw an execption if 
we have a mcore optimizer and no ddp_config - - if hasattr(self.precision_plugin, "convert_optimizer"): - _optimizers = [*self.optimizers] - _optimizers[0] = self.precision_plugin.convert_optimizer(self.optimizers[0]) - self.optimizers = _optimizers - - _optimizers_to_device(self.optimizers, self.root_device) - self.model = self.megatron_parallel self.model.callbacks.add(getattr(trainer, "callbacks")) @@ -317,6 +316,9 @@ def setup_megatron_parallel(self, trainer: pl.Trainer, setup_optimizers: bool = if datamodule: self.model.callbacks.add(datamodule) + def init_model_parallel(self): + self.megatron_parallel.init_model_parallel() + @override def configure_ddp(self) -> None: logging.debug(f"{self.__class__.__name__}: configuring MegatronParallel") @@ -349,6 +351,16 @@ def _setup_model(self, model: nn.Module) -> nn.Module: return model + @override + def setup_optimizers(self, trainer: "pl.Trainer") -> None: + super().setup_optimizers(trainer) + if hasattr(self.precision_plugin, "convert_optimizer"): + _optimizers = [*self.optimizers] + _optimizers[0] = self.precision_plugin.convert_optimizer(self.optimizers[0]) + self.optimizers = _optimizers + + _optimizers_to_device(self.optimizers, self.root_device) + def _setup_parallel_ranks(self) -> None: self.set_world_ranks() env = cast(ClusterEnvironment, self.cluster_environment) diff --git a/tests/lightning/pytorch/callbacks/test_peft.py b/tests/lightning/pytorch/callbacks/test_peft.py index 81dc7f85bc08..e64ee7bd0ba3 100644 --- a/tests/lightning/pytorch/callbacks/test_peft.py +++ b/tests/lightning/pytorch/callbacks/test_peft.py @@ -1,4 +1,4 @@ -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, call, patch import torch.nn as nn from nemo.collections.llm import fn @@ -54,8 +54,22 @@ def test_peft_on_train_epoch_start_with_adapter(self, mock_logging): peft.wrapped_io.load_checkpoint.return_value = {"dummy_state": "dummy_value"} peft.on_train_epoch_start(trainer, pl_module) - mock_logging.info.assert_called_once_with("Loading adapters from dummy_path") + # Check for all expected log messages + mock_logging.info.assert_has_calls( + [ + call("Loading adapters from dummy_path"), + call("Initializing model parallel"), + call("Setting up optimizers"), + ], + any_order=True, + ) + + # Verify the number of calls + assert mock_logging.info.call_count == 3 + trainer.strategy.load_model_state_dict.assert_called_once_with({"dummy_state": "dummy_value"}, strict=False) + trainer.strategy.init_model_parallel.assert_called_once() + trainer.strategy.setup_optimizers.assert_called_once_with(trainer) def test_peft_on_load_checkpoint(self): peft = self.DummyPEFT()