diff --git a/examples/nlp/language_modeling/conf/megatron_mamba_config.yaml b/examples/nlp/language_modeling/conf/megatron_mamba_config.yaml new file mode 100644 index 000000000000..f4f37d7c4ce0 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_mamba_config.yaml @@ -0,0 +1,191 @@ +name: megatron_mamba +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. + max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: False + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_mamba + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + filename: 'megatron_mamba--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + + +model: + restore_from_path: null + # model parallelism + mcore_gpt: True + micro_batch_size: 1 + global_batch_size: 8 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + expert_model_parallel_size: 1 # expert model parallelism + hybrid_override_pattern: null + vocab_size: 256000 + # model architecture + encoder_seq_length: 4096 + max_position_embeddings: 
${.encoder_seq_length} + position_embedding_type: 'none' # Position embedding type. Options: ['learned_absolute', 'rope', 'alibi', 'kerple', 'xpos', 'sandwich']. xpos and sandwich are experimental. + num_layers: 56 + gated_linear_unit: False + add_bias_linear: False + num_query_groups: 8 + mamba_ssm_ngroups: 8 + attention_dropout: 0.0 + hidden_dropout: 0.0 + hidden_size: 4096 + ffn_hidden_size: 14336 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 32 + transformer_block_type: pre_ln + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: RMSNorm + layernorm_epsilon: 1e-5 + num_moe_experts: 16 + moe_router_topk: 2 + moe_aux_loss_coeff: 0.001 + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + megatron_legacy: False + persist_layer_norm: True + + tokenizer: + library: 'huggingface' + type: 'EleutherAI/gpt-neox-20b' + model: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + use_fast: True + + # Distributed checkpoint setup + dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. 
+ dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU + dist_ckpt_parallel_save: False # if true, each worker will write its own part of the dist checkpoint + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. + bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask. + get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. + apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope + + + # miscellaneous + seed: 1234 + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_as_bucket_view: True # PyTorch DDP argument. 
Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_recurrent: False # If set to True, the checkpointing is only done for rglru and conv1d and not for attention and mlp layers + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. 
The size of the window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 checkpoint 3 layers less than + # stage 0 and stage 2 checkpoint 6 layers less than stage 0, and so on. This is possible because later pipeline stages + # use less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + sequence_parallel: False + + data: + # Path to data must be specified by the user. 
+ # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", + # Or see example below: + # data_prefix: + # - .5 + # - /raid/data/pile/my-gpt3_00_text_document + # - .5 + # - /raid/data/pile/my-gpt3_01_text_document + data_prefix: [1.0, /path/to/data] + index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix + data_impl: mmap + splits_string: 900,50,50 + seq_length: ${model.encoder_seq_length} + skip_warmup: True + num_workers: 0 + dataloader_type: single # cyclic, LDDL + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + masked_lm_prob: 0.15 # Probability of replacing a token with mask. + short_seq_prob: 0.1 # Probability of producing a short sequence. + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + + optim: + name: distributed_fused_adam + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 50000 + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/mamba_change_num_partition.py b/examples/nlp/language_modeling/mamba_change_num_partition.py new file mode 100644 index 000000000000..bc76b3215a74 --- /dev/null +++ b/examples/nlp/language_modeling/mamba_change_num_partition.py @@ -0,0 +1,696 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# Tensor-parallel split dimension, keyed by a fragment of the parameter name.
#   0 or 1 -> the parameter is sharded along that dimension
#   -1     -> the parameter is replicated on every tensor-parallel rank
# Lookup is by substring (see get_split_dim), so insertion order matters.
tp_split_dim = {
    'word_embeddings.weight': 0,
    'norm.weight': -1,
    'final_norm.weight': -1,
    'output_layer.weight': 0,
    # mamba1/2
    'A_log': 0,
    'D': 0,
    'dt_bias': 0,
    'in_proj.weight': 0,
    'conv1d.weight': 0,
    'conv1d.bias': 0,
    'x_proj.weight': 1,
    'dt_proj.weight': 0,
    'dt_proj.bias': 0,
    'out_proj.weight': 1,
    'mixer.norm.weight': 0,
    # mlp
    'linear_fc1.layer_norm_weight': -1,
    'linear_fc1.weight': 0,
    'linear_fc2.weight': 1,
    # attention
    'self_attention.linear_proj.weight': 1,
    'self_attention.linear_qkv.layer_norm_weight': -1,
    'self_attention.linear_qkv.weight': 0,
}


def get_split_dim(tensor_name):
    """Return the tensor-parallel split dimension for ``tensor_name``.

    The name is matched by substring against the keys of ``tp_split_dim``;
    the first key (in dict order) contained in the name wins.

    Args:
        tensor_name: Full parameter name from the model state dict.

    Returns:
        The split dimension, or -1 when the parameter is replicated.

    Raises:
        ValueError: If no key of ``tp_split_dim`` occurs in ``tensor_name``.
    """
    # 'norm.weight' is a substring of 'mixer.norm.weight' as well, so the two
    # have to be told apart before the generic substring scan below.
    if 'norm.weight' in tensor_name:
        if 'mixer.norm.weight' in tensor_name:
            return tp_split_dim['mixer.norm.weight']
        return tp_split_dim['norm.weight']

    for fragment, split_dim in tp_split_dim.items():
        if fragment in tensor_name:
            return split_dim
    raise ValueError(f"Unknown tensor name {tensor_name}")


def split_tensor_for_tp(params, key, dim, tensor):
    """Split one full parameter tensor into per-rank tensor-parallel shards.

    Args:
        params: Namespace providing ``target_tensor_model_parallel_size`` and,
            for the Mamba-specific layouts, ``mamba_version``, ``mamba_d_inner``,
            ``mamba_d_state``, ``mamba2_n_groups`` and ``mamba2_n_heads``.
        key: Parameter name; selects the Mamba-specific layouts.
        dim: Dimension to split along, or -1 to replicate the tensor.
        tensor: The unsplit parameter.

    Returns:
        A list with one tensor per target TP rank. For ``dim == -1`` every
        entry is the same tensor object. For keys containing ``_extra_state``
        (and ``dim != -1``) an empty list is returned.

    Raises:
        ValueError: If the tensor cannot be cut into exactly
            ``target_tensor_model_parallel_size`` shards.
        Exception: If a Mamba-2 ``mixer.conv1d`` key is neither a weight nor a bias.
    """
    tp_size = params.target_tensor_model_parallel_size

    if dim == -1:
        # Replicated parameter: every rank receives the full tensor.
        return [tensor for _ in range(tp_size)]

    def _chunk(part):
        return torch.chunk(part, tp_size, dim=dim)

    if 'mixer.in_proj.weight' in key and params.mamba_version == 1:
        # The fused projection is [x | z]; each half is sharded separately so
        # that every rank gets matching slices of both.
        x, z = torch.split(tensor, [params.mamba_d_inner, params.mamba_d_inner], dim=dim)
        tensor_sliced = [torch.cat((x_i, z_i), dim=dim) for x_i, z_i in zip(_chunk(x), _chunk(z))]

    elif 'mixer.in_proj.weight' in key and params.mamba_version == 2:
        group_width = params.mamba2_n_groups * params.mamba_d_state
        x, z, B, C, dt = torch.split(
            tensor,
            [params.mamba_d_inner, params.mamba_d_inner, group_width, group_width, params.mamba2_n_heads],
            dim=dim,
        )
        # Fold B and C into (blocks, d_state, ...) so that chunking hands out
        # whole d_state-sized blocks; they are flattened back per shard.
        B = torch.reshape(B, (-1, params.mamba_d_state, B.shape[-1]))
        C = torch.reshape(C, (-1, params.mamba_d_state, C.shape[-1]))

        tensor_sliced = [
            torch.cat((x_i, z_i, B_i.flatten(0, 1), C_i.flatten(0, 1), dt_i), dim=dim)
            for x_i, z_i, B_i, C_i, dt_i in zip(_chunk(x), _chunk(z), _chunk(B), _chunk(C), _chunk(dt))
        ]

    elif 'mixer.conv1d' in key and params.mamba_version == 2:
        group_width = params.mamba2_n_groups * params.mamba_d_state
        x, B, C = torch.split(tensor, [params.mamba_d_inner, group_width, group_width], dim=dim)
        if 'weight' in key:
            B = torch.reshape(B, (-1, params.mamba_d_state, B.shape[-2], B.shape[-1]))
            C = torch.reshape(C, (-1, params.mamba_d_state, C.shape[-2], C.shape[-1]))
        elif 'bias' in key:
            B = torch.reshape(B, (-1, params.mamba_d_state))
            C = torch.reshape(C, (-1, params.mamba_d_state))
        else:
            raise Exception("Unknown key")

        tensor_sliced = [
            torch.cat((x_i, B_i.flatten(0, 1), C_i.flatten(0, 1)), dim=dim)
            for x_i, B_i, C_i in zip(_chunk(x), _chunk(B), _chunk(C))
        ]

    elif '_extra_state' in key:
        # Extra-state entries are not sharded.
        return []

    else:
        tensor_sliced = list(_chunk(tensor))

    # torch.chunk may return fewer chunks than requested and zip() truncates to
    # the shortest input, so a non-divisible size would otherwise surface much
    # later as an unrelated IndexError when the shards are consumed.
    if len(tensor_sliced) != tp_size:
        raise ValueError(
            f"Cannot split '{key}' with shape {tuple(tensor.shape)} along dim {dim} into "
            f"{tp_size} tensor-parallel shards (got {len(tensor_sliced)})."
        )

    return tensor_sliced


#################
### Utilities ###
#################


def force_cpu_model(cfg):
    """Switch a model config to CPU initialization and record the old values.

    Sets ``use_cpu_initialization`` to True, sets the AMP-O2 flag (whichever
    of ``megatron_amp_O2`` / ``megatron_amp_02`` is present) to False and sets
    ``sequence_parallel`` to False, all in place.

    Args:
        cfg: Model config; modified in place inside ``open_dict``.

    Returns:
        Tuple ``(cfg, restore_dict)`` where ``restore_dict`` maps each changed
        key to its previous value, for use with ``restore_model_config``.
        ``sequence_parallel`` is only recorded when it previously had a
        non-None value.
    """
    with open_dict(cfg):
        # temporarily set to cpu
        original_cpu_init = cfg.get('use_cpu_initialization', False)
        if 'megatron_amp_O2' in cfg:
            amp_o2_key = 'megatron_amp_O2'
            original_amp_o2 = cfg.megatron_amp_O2
        elif 'megatron_amp_02' in cfg:
            # Alternate spelling with a zero instead of the letter O.
            amp_o2_key = 'megatron_amp_02'
            original_amp_o2 = cfg.megatron_amp_02
        else:
            amp_o2_key, original_amp_o2 = None, None

        # Set new values
        cfg.use_cpu_initialization = True
        if amp_o2_key is not None:
            cfg[amp_o2_key] = False

        # Disable sequence parallelism - not disabling this gives an error when
        # converting the model to TP=1
        original_sequence_parallel = cfg.get('sequence_parallel', None)
        cfg.sequence_parallel = False

    # Setup restore dict
    restore_dict = {'use_cpu_initialization': original_cpu_init}
    if amp_o2_key is not None:
        restore_dict[amp_o2_key] = original_amp_o2
    if original_sequence_parallel is not None:
        restore_dict['sequence_parallel'] = original_sequence_parallel

    return cfg, restore_dict
def restore_model_config(cfg, original_dict):
    """Write the values saved by ``force_cpu_model`` back into ``cfg``.

    Args:
        cfg: Model config to update in place.
        original_dict: Mapping of config key to the value to restore.

    Returns:
        The same ``cfg`` object.
    """
    with open_dict(cfg):
        for key, val in original_dict.items():
            logging.info(f"Restoring model config key ({key}) from {cfg[key]} to original value of {val}")
            cfg[key] = val
    return cfg


def write_tp_pp_split(model, splits, app_state, tp_size, pp_rank, write_path):
    """
    Function to write the given TP PP split to NeMo File.

    Save each of the TP ranks in reverse order
    This is done so that the last PP rank will save the last TP rank only after all other PP TP ranks are saved
    The final rank will then save a new NeMo file with all other ranks inside.

    Args:
        model: The model corresponding to the current TP PP split. Contains partial parameters.
        splits: Nested List of tensors containing the TP splits of the current model given current PP rank.
            Indexed as splits[idx][tp_rank].
        app_state: AppState object.
        tp_size: The global tensor-parallel size of the final model.
        pp_rank: The local pipeline parallel rank of the final model.
        write_path: The path to save the NeMo file. Nothing is saved when this is None.

    Raises:
        RuntimeError: If a split does not have the shape of the parameter it is assigned to.
    """
    for tp_rank in range(tp_size - 1, -1, -1):
        app_state.pipeline_model_parallel_rank = pp_rank
        app_state.tensor_model_parallel_rank = tp_rank

        # splits is ordered like model.named_parameters(), so the position of
        # a parameter is also the index of its list of shards.
        for idx, (name, param) in enumerate(model.named_parameters()):
            split_val = splits[idx][tp_rank].clone()

            if param.shape != split_val.shape:
                raise RuntimeError(
                    f"Can not handle parameter {name}, required shape: {param.shape}, split shape: {split_val.shape}."
                )

            param.data = split_val

        if write_path is not None:
            logging.info(f"Writing pp rank {pp_rank} tp rank {tp_rank} to file {write_path}")
            model.save_to(write_path)


##################
### Converters ###
##################


def split_tp_partition_only(args, model, original_model, tp_size, write_path=None, megatron_legacy=False):
    """Shard the parameters of ``original_model`` into ``tp_size`` TP ranks and save them.

    Args:
        args: Parsed CLI namespace; forwarded to ``split_tensor_for_tp``.
        model: Model built for the target TP size; receives the shards rank by rank.
        original_model: Source model whose ``model.state_dict()`` is split.
        tp_size: Target tensor-parallel size.
        write_path: Target .nemo path. When None nothing is saved or extracted.
        megatron_legacy: Accepted for interface compatibility; not used here.

    Raises:
        ValueError: If ``tp_size`` is smaller than 1.
    """
    if tp_size < 1:
        raise ValueError("TP size must to be >= 1.")

    app_state = AppState()
    app_state.data_parallel_rank = 0
    app_state.pipeline_model_parallel_size = 1
    app_state.tensor_model_parallel_size = tp_size
    app_state.model_parallel_size = app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size

    app_state.pipeline_model_parallel_rank = 0
    app_state.tensor_model_parallel_rank = tp_size - 1

    splits = []
    for key, original_tensor in original_model.model.state_dict().items():
        # Extra-state entries are not parameters and are not split.
        if '_extra_state' in key:
            continue
        split_dim = get_split_dim(key)
        splits.append(split_tensor_for_tp(args, key, split_dim, original_tensor))

    # Save each of the TP ranks in reverse order
    # This is done so that the last PP rank will save the last TP rank only after all other PP TP ranks are saved
    # The final rank will then save a new NeMo file with all other ranks inside.
    write_tp_pp_split(model, splits, app_state, tp_size, pp_rank=0, write_path=write_path)

    # Nothing was written when write_path is None, so there is nothing to unpack.
    if write_path is not None:
        with tarfile.open(write_path, 'r') as tar:
            # Extract all contents next to the written .nemo file
            tar.extractall(path=os.path.dirname(write_path))


def main():
    """Command-line entry point: convert a Mamba .nemo checkpoint to a new TP size.

    Parses the arguments, restores the source model twice on CPU (one copy is
    kept as the untouched source of weights), rebuilds the model for the
    target tensor-parallel size and hands both to ``split_tp_partition_only``.
    Nothing is written when both target TP and target PP sizes are 1.

    Raises:
        ValueError: If neither ``--model_file`` nor ``--model_extracted_dir`` is
            given, or if pipeline-parallel options conflict with
            ``--tp_conversion_only``.
    """
    parser = ArgumentParser()
    parser.add_argument("--model_file", type=str, default=None, required=False, help="Path to source .nemo file")
    parser.add_argument("--target_file", type=str, required=True, help="Path to write target .nemo file")
    parser.add_argument(
        "--tensor_model_parallel_size", type=int, default=-1, required=False, help="TP size of source model"
    )
    parser.add_argument("--target_tensor_model_parallel_size", type=int, required=True, help="TP size of target model")
    parser.add_argument(
        '--pipeline_model_parallel_size', type=int, default=1, required=False, help='PP size of source model'
    )
    parser.add_argument(
        '--target_pipeline_model_parallel_size', type=int, required=False, default=1, help='PP size of target model'
    )
    parser.add_argument(
        '--target_pipeline_model_parallel_split_rank', type=int, default=0, help='PP rank to split for Enc-Dec models'
    )
    parser.add_argument(
        '--virtual_pipeline_model_parallel_size', type=int, default=None, help='Virtual Pipeline parallelism size'
    )
    parser.add_argument(
        '--ckpt_name', type=str, default=None, help='Checkpoint name to load from for Virtual Parallel'
    )
    parser.add_argument(
        "--model_class",
        type=str,
        default="nemo.collections.nlp.models.language_modeling.megatron_mamba_model.MegatronMambaModel",
        help="NeMo model class. This script should support all NeMo megatron models that use Tensor Parallel",
    )
    parser.add_argument("--precision", default=16, help="PyTorch Lightning Trainer precision flag")
    parser.add_argument('--num_gpu_per_node', default=8, type=int, help='Number of GPUs per node')
    parser.add_argument(
        "--megatron_legacy",
        action="store_true",
        help="Converter for legacy megatron modles that have different q,k,v weight splits",
    )
    parser.add_argument(
        "--tokenizer_model_path",
        type=str,
        required=False,
        default=None,
        help="Path to the tokenizer model path if your model uses a tokenizer model as an artifact. This is needed if your model uses a sentencepiece tokenizer.",
    )
    parser.add_argument(
        "--tokenizer_vocab_file",
        type=str,
        required=False,
        default=None,
        help="Path to the tokenizer model path if your model uses a tokenizer model as an artifact. This is needed if your model uses a sentencepiece tokenizer.",
    )
    parser.add_argument('--hparams_file', type=str, default=None, help='Path to hparams file from PTL training')
    parser.add_argument(
        '--tp_conversion_only', default=True, action='store_true', help='Only convert TP model to TP model'
    )
    parser.add_argument('--model_extracted_dir', type=str, default=None, help='Path to pre-extracted model directory')

    parser.add_argument('--d-model', type=int, default=4096)
    parser.add_argument('--mamba-version', type=int, default=2)
    parser.add_argument('--mamba-d-state', type=int, default=128)
    parser.add_argument('--mamba2-n-groups', type=int, default=8)
    parser.add_argument('--mamba2-head-dim', type=int, default=64)

    args = parser.parse_args()

    # Derived Mamba sizes consumed by split_tensor_for_tp.
    args.mamba_d_inner = args.d_model * 2
    args.mamba2_n_heads = args.mamba_d_inner // args.mamba2_head_dim

    precision = args.precision
    num_gpu_per_node = int(args.num_gpu_per_node)
    if args.precision in ["32", "16"]:
        precision = int(float(args.precision))

    if precision in ["bf16", "bf16-mixed"]:
        if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
            pass
        else:
            logging.warning("BF16 is not supported on this device. Using FP16 instead.")
            # Drop the leading "bf": "bf16" -> "16", "bf16-mixed" -> "16-mixed".
            precision = precision[2:]

    if precision == 32:
        dtype = torch.float32
    elif precision in [16, "16", "16-mixed"]:
        dtype = torch.float16
    elif precision in ["bf16", "bf16-mixed"]:
        dtype = torch.bfloat16
    else:
        dtype = torch.float32  # fallback

    # Build target directory if it does not exist. A bare file name has an
    # empty directory component, which os.makedirs would reject.
    target_dir = os.path.split(args.target_file)[0]
    if target_dir and not os.path.exists(target_dir):
        os.makedirs(target_dir, exist_ok=True)

    tp_size = args.tensor_model_parallel_size
    tgt_tp_size = args.target_tensor_model_parallel_size
    pp_size = args.pipeline_model_parallel_size
    tgt_pp_size = args.target_pipeline_model_parallel_size
    pipeline_model_parallel_split_rank = args.target_pipeline_model_parallel_split_rank
    vp_size = args.virtual_pipeline_model_parallel_size
    if vp_size is None:
        vp_size = 1

    convert_vp = vp_size > 1
    if convert_vp:
        from megatron.core import parallel_state

        parallel_state.set_virtual_pipeline_model_parallel_world_size(vp_size)

        hparams_filepath = args.hparams_file
        if hparams_filepath is None:
            logging.warning(
                '\n\n\n!!!!!!!!!\n'
                'You are converting a model with virtual pipeline parallelism enabled, \n'
                'but have not passed `hparams_file` argument. \n'
                'This will cause each ckpt file to be temporarily laoded onto GPU memory!\n\n'
                'It is highly recommended to pass `hparams_file` argument to avoid this.\n'
            )

    if args.model_file is None and args.model_extracted_dir is None:
        raise ValueError("Cannot pass model_file and model_extracted_dir as None at the same time.")

    # Resolve the restore source once, up front, so that every restore_from
    # call below also works when only --model_extracted_dir was given.
    save_restore_connector = NLPSaveRestoreConnector()

    if args.model_extracted_dir is not None:
        logging.info(f"Using extracted model directory: {args.model_extracted_dir}")
        save_restore_connector.model_extracted_dir = args.model_extracted_dir

    if args.model_file is not None:
        model_filepath = args.model_file
    else:
        model_filepath = args.model_extracted_dir

    tmp_cfg = MegatronMambaModel.restore_from(
        restore_path=model_filepath,
        trainer=Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision),
        map_location=torch.device("cpu"),
        save_restore_connector=save_restore_connector,
        return_config=True,
    )
    plugins = []
    if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']:
        scaler = None
        if precision in [16, '16', '16-mixed']:
            scaler = GradScaler(
                init_scale=tmp_cfg.get('native_amp_init_scale', 2**32),
                growth_interval=tmp_cfg.get('native_amp_growth_interval', 1000),
                hysteresis=tmp_cfg.get('hysteresis', 2),
            )
            # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed
            plugin_precision = '16-mixed'
        else:
            plugin_precision = 'bf16-mixed'

        if tmp_cfg.get('megatron_amp_O2', False):
            plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler))
        else:
            plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler))
    # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both
    # precision plugins and precision to exist
    trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu")

    if tp_size < 0 or pp_size < 0:
        logging.info(f"Loading model config from {model_filepath} to get TP and PP size")
        model_config_internal = MegatronMambaModel.restore_from(
            restore_path=model_filepath,
            trainer=trainer,
            map_location=torch.device("cpu"),
            save_restore_connector=save_restore_connector,
            return_config=True,
        )

        tp_size = model_config_internal.get('tensor_model_parallel_size', 1)
        pp_size = model_config_internal.get('pipeline_model_parallel_size', 1)

    # Check if TP conversion only
    tp_conversion_only = args.tp_conversion_only
    if tp_conversion_only:
        logging.info("Converting TP model to TP model only")

        if pp_size > 1:
            raise ValueError("Provided `--tp_conversion_only` but `--pipeline_model_parallel_size` > 1")

        if tgt_pp_size > 1:
            raise ValueError("Provided `--tp_conversion_only` but `--target_pipeline_model_parallel_size` > 1")

        if pipeline_model_parallel_split_rank > 0:
            raise ValueError("Provided `--tp_conversion_only` but `--target_pipeline_model_parallel_split_rank` > 0")

        # Force PP size to 1
        pp_size = 1
        tgt_pp_size = 1
        pipeline_model_parallel_split_rank = 0

    if vp_size is None or vp_size < 0:
        vp_size = 1

    app_state = AppState()
    app_state.data_parallel_rank = 0
    app_state.pipeline_model_parallel_size = pp_size
    app_state.tensor_model_parallel_size = tp_size

    if vp_size > 1:
        app_state.virtual_pipeline_model_parallel_size = vp_size
    app_state.model_parallel_size = app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size

    world_size = pp_size * tp_size  # pseudo world size for simulating load of a specific rank on a single gpu

    app_state.tensor_model_parallel_rank = 0
    app_state.pipeline_model_parallel_rank = 0

    # Extract tokenizer artifact from the model to temp directory. This is only
    # possible when a .nemo archive was given; with just an extracted
    # directory there is no tar file to open.
    temp_dir = tempfile.mkdtemp()
    tokenizer_model_path = None
    if args.model_file is not None:
        logging.info("Extracting tokenizer artifact from NeMo file...")
        with tarfile.open(args.model_file, "r") as tar:
            for member in tar.getmembers():
                if '.model' in member.name:
                    extracted_file = tar.extractfile(member)
                    extracted_file_path = os.path.join(temp_dir, member.name)

                    if tokenizer_model_path is None:
                        logging.info(f"Found tokenizer. Extracting {member.name} to {extracted_file_path}")

                        tokenizer_model_path = extracted_file_path
                        with open(extracted_file_path, "wb") as f:
                            f.write(extracted_file.read())
                    else:
                        if args.tokenizer_model_path is None:
                            logging.warning(
                                f"\n\nFound multiple tokenizer artifacts in the model file.\n"
                                f"Using only {tokenizer_model_path}.\n"
                                f"If this is incorrect, manually pass the correct tokenizer using "
                                f"`--tokenizer_model_path`.\n\n"
                            )

    # The input model is loaded as TP = 1 and PP = 1
    app_state.model_parallel_size = 1

    tmp_cfg = MegatronMambaModel.restore_from(
        restore_path=model_filepath,
        trainer=trainer,
        map_location=torch.device("cpu"),
        save_restore_connector=save_restore_connector,
        return_config=True,
    )

    tmp_cfg, restore_dict = force_cpu_model(tmp_cfg)

    model = MegatronMambaModel.restore_from(
        restore_path=model_filepath,
        trainer=trainer,
        map_location=torch.device("cpu"),
        save_restore_connector=save_restore_connector,
        override_config_path=tmp_cfg,
    )

    # Second copy: `model` is rebuilt for the target TP size below, this one is
    # the source of the weights that get split.
    original_model = MegatronMambaModel.restore_from(
        restore_path=model_filepath,
        trainer=trainer,
        map_location=torch.device("cpu"),
        save_restore_connector=save_restore_connector,
        override_config_path=tmp_cfg,
    )
    original_model = original_model.to('cpu')
    original_model._save_restore_connector = NLPSaveRestoreConnector()
    original_model.freeze()
    original_model.to(dtype=dtype)

    model.to(dtype=dtype)

    restore_model_config(model.cfg, restore_dict)

    # If target model has TP > 1 or PP > 1
    if tgt_pp_size > 1 or tgt_tp_size > 1:

        # Preserve the TP 1 PP 1 model parameters and names
        global_params = []
        global_params.append([p for n, p in model.named_parameters()])  # params
        global_params.append([n for n, p in model.named_parameters()])  # names

        logging.debug("Global parameters:")
        for name, p in zip(global_params[1], global_params[0]):
            logging.debug(f"{name} - {p.shape}")

        logging.info(f"TP 1 PP 1 Number of Parameters : {len(global_params[0])}")

        world_size = (
            tgt_pp_size * tgt_tp_size
        )  # pseudo world size for simulating load of a specific rank on a single gpu
        new_global_batch_size = model.cfg.micro_batch_size * world_size
        old_global_batch_size = model.cfg.get('global_batch_size', model.cfg.micro_batch_size)

        global_offset = len(global_params[0]) - 1  # -1 cause this indexes the array, range [0, L-1]
        logging.info(f"Final layer offset for parameters: {global_offset}")

        for pp_rank in range(tgt_pp_size - 1, -1, -1):  # reverse order

            with open_dict(model.cfg):
                model.cfg.pipeline_model_parallel_size = tgt_pp_size
                model.cfg.tensor_model_parallel_size = tgt_tp_size

                if 'pipeline_model_parallel_split_rank' in model.cfg:
                    if pipeline_model_parallel_split_rank > 0:
                        model.cfg.pipeline_model_parallel_split_rank = pipeline_model_parallel_split_rank
                    elif pp_size > 1:
                        logging.warning(
                            f"Model config has `pipeline_model_parallel_split_rank` set to "
                            f"{model.cfg.pipeline_model_parallel_split_rank} and target PP "
                            f"size is {tgt_pp_size}. "
                            f"Provided `pipeline_model_parallel_split_rank` is "
                            f"{pipeline_model_parallel_split_rank}. "
                            f"Be careful that the model config is correct "
                            f"if encoder-decoder models are being converted."
                        )

                model.cfg.global_batch_size = old_global_batch_size  # Used for restoration

            # Override flag that forces Model to use AppState instead of Trainer
            # to determine the world size, global and local rank
            # Used for simulating load of a specific rank on a single gpu
            os.environ[NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE] = "true"

            # Compute the global rank
            global_rank = (
                pp_rank * tgt_tp_size + 0
            )  # tp_rank = 0 needed just for modules, all TP will be merged to this PP rank

            # Update AppState
            app_state.world_size = world_size
            app_state.global_rank = global_rank
            app_state.local_rank = global_rank % num_gpu_per_node
            app_state.pipeline_model_parallel_size = tgt_pp_size
            app_state.tensor_model_parallel_size = tgt_tp_size
            app_state.model_parallel_size = (
                app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size
            )

            trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu")
            if args.tokenizer_model_path is not None:
                with open_dict(model.cfg):
                    model.cfg.tokenizer.model = args.tokenizer_model_path

            else:
                if tokenizer_model_path is None:
                    logging.warning("Could not extract tokenizer model file from checkpoint.")

                else:
                    # Extract tokenizer info
                    with open_dict(model.cfg):
                        model.cfg.tokenizer.model = tokenizer_model_path

            model.cfg, restore_dict = force_cpu_model(model.cfg)

            from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR

            _GLOBAL_NUM_MICROBATCHES_CALCULATOR.current_global_batch_size = 1
            _GLOBAL_NUM_MICROBATCHES_CALCULATOR.current_micro_batch_size = 1
            model.cfg.global_batch_size = 1
            model.cfg.micro_batch_size = 1

            # Rebuild the model from the updated config so its parameters have
            # the per-rank shapes of the target TP size.
            model = MegatronMambaModel(model.cfg, trainer)
            model = model.to('cpu')
            model._save_restore_connector = NLPSaveRestoreConnector()
            model.freeze()
            model.to(dtype=dtype)

            restore_model_config(model.cfg, restore_dict)

            # Update global batch size
            if old_global_batch_size % new_global_batch_size != 0 or old_global_batch_size < new_global_batch_size:
                logging.info(
                    f"Global batch size {old_global_batch_size} is not divisible by new global batch size {new_global_batch_size}."
                    f" The model config will be updated with new global batch size {new_global_batch_size}."
                )
                with open_dict(model.cfg):
                    model.cfg.global_batch_size = new_global_batch_size

            logging.info(f"Global rank: {global_rank} Local rank: {app_state.local_rank} World size: {world_size}")
            logging.info(f"PP rank: {pp_rank} TP rank: {0}")
            logging.info(f"TP 1 PP 1 Number of Layers : {len(global_params[0])}")
            logging.info(f"Remaining layer offset for parameters: {global_offset}")
            logging.info("\n")

            # Special case for TP conversion only mode
            if tp_conversion_only:
                logging.info(f"Skipping PP split due to flag `--tp_conversion_only`")
                split_tp_partition_only(
                    args, model, original_model, tgt_tp_size, args.target_file, args.megatron_legacy
                )
                break


if __name__ == '__main__':
    main()
epoch fraction, e.g. 0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + limit_val_batches: 1024 + limit_test_batches: 500 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: True + wandb_logger_kwargs: + project: griffin + name: sft-test + resume_if_exists: False + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${model.tensor_model_parallel_size} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + + +model: + restore_from_path: null + # model parallelism + mcore_gpt: True + micro_batch_size: 1 + global_batch_size: 8 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + expert_model_parallel_size: 1 # expert model parallelism + + vocab_size: 65536 + # model architecture + encoder_seq_length: 4096 + hybrid_override_pattern: null + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: 'none' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. + num_layers: 64 + gated_linear_unit: False + add_bias_linear: False + num_query_groups: 8 + ngroups_mamba: 8 + attention_dropout: 0.0 + hidden_dropout: 0.0 + hidden_size: 4096 + ffn_hidden_size: 14336 # Transformer FFN hidden size. Usually 4 * hidden_size. 
+ num_attention_heads: 32 + transformer_block_type: pre_ln + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: RMSNorm + layernorm_epsilon: 1e-5 + num_moe_experts: 16 + moe_router_topk: 2 + moe_aux_loss_coeff: 0.001 + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + megatron_legacy: False + persist_layer_norm: True + + + # mixed-precision + attention_softmax_in_fp32: False + + # Distributed checkpoint setup + dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. + dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU + dist_ckpt_parallel_save: False # if true, each worker will write its own part of the dist checkpoint + + + tokenizer: + library: 'huggingface' + type: 'EleutherAI/gpt-neox-20b' + model: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + use_fast: True + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.. + gradient_accumulation_fusion: True # Fuse weight gradient accumulation to GEMMs. 
Only used with pipeline parallelism and O2. + bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask. + get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. + apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope + + # miscellaneous + seed: 1234 + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model.
+ # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less than stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations.
Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + sequence_parallel: False + + peft: + peft_scheme: "lora" # can be either adapter,ia3, lora, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + target_modules: ['all'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) + adapter_dim: 32 + alpha: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. 
null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + selective_tuning: + tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre + + + data: + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: null # Path to a list of JSONL files corresponding to the source data. 
+ global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: True + num_workers: 0 + memmap_workers: 2 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: [1.0] # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + truncation_field: "input" # # Can be multiple keys separated with ',' Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + validation_ds: + file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + label_key: ${model.data.train_ds.label_key} + add_eos: ${model.data.train_ds.add_eos} + add_sep: ${model.data.train_ds.add_sep} + add_bos: ${model.data.train_ds.add_bos} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: ${model.data.train_ds.truncation_field} # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. 
+ prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + label_key: ${model.data.train_ds.label_key} + add_eos: ${model.data.train_ds.add_eos} + add_sep: ${model.data.train_ds.add_sep} + add_bos: ${model.data.train_ds.add_bos} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: ${model.data.train_ds.truncation_field} # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. 
+ prompt_template: ${model.data.train_ds.prompt_template} + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + optim: + name: distributed_fused_adam + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 50000 + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_mamba_generate_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_mamba_generate_config.yaml new file mode 100644 index 000000000000..2d34aefffc7e --- /dev/null +++ b/examples/nlp/language_modeling/tuning/conf/megatron_mamba_generate_config.yaml @@ -0,0 +1,298 @@ +name: megatron_mamba +restore_from_path: ${model.restore_from_path} # used when starting from a .nemo file + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. 
+ max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: False + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_mamba + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + filename: 'megatron_mamba--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + +model: + restore_from_path: null + # model parallelism + mcore_gpt: True + micro_batch_size: 2 + global_batch_size: 2 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + expert_model_parallel_size: 1 # expert model parallelism + hybrid_override_pattern: null + vocab_size: 65536 + # model architecture + encoder_seq_length: 4096 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: 'none' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. + num_layers: 64 + gated_linear_unit: False + num_query_groups: 8 + ngroups_mamba: 8 + attention_dropout: 0.0 + hidden_dropout: 0.0 + hidden_size: 4096 + ffn_hidden_size: 14336 # Transformer FFN hidden size. Usually 4 * hidden_size. 
+ num_attention_heads: 32 + transformer_block_type: pre_ln + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: RMSNorm + layernorm_epsilon: 1e-5 + num_moe_experts: 16 + moe_router_topk: 2 + moe_aux_loss_coeff: 0.001 + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + megatron_legacy: False + persist_layer_norm: True + add_bias_linear: False + + answer_only_loss: True + + tokenizer: + library: 'huggingface' + type: 'EleutherAI/gpt-neox-20b' + model: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + use_fast: True + + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.. + gradient_accumulation_fusion: True # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. + bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. 
+ masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask. + get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. + apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope + + + # miscellaneous + seed: 1234 + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_recurrent: False # If set to True, the checkpointing is only done for rglru and conv1d and not for attention and mlp layers + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory.
+ # when using 'block' this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less than stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path.
+ sequence_parallel: False + + peft: + peft_scheme: null # can be either adapter,ia3, lora, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + target_modules: ['all'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) + adapter_dim: 32 + alpha: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. 
null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + selective_tuning: + tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre + + data: + test_ds: + file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: ??? # Names of the corresponding datasets used to log metrics. + global_batch_size: 1 + micro_batch_size: 1 + shuffle: False + num_workers: 0 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "input" # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. 
Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + +inference: + greedy: True # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + all_probs: False # whether to return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. + compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False + outfile_path: output.txt + compute_attention_mask: True + +# server-related configs +server: False # whether to launch the API server +port: 5555 # the port number for the inference server +web_server: False # whether to launch the web inference server +share: True # whether to create a public URL +username: test # user name for web client +password: test2 # password for web client +web_port: 9889 # the port number of the web server +chat: False # use the chat interface +chatbot_config: + value: False # whether to inject the value attributes + attributes: + - name: Quality + min: 0 + max: 4 + key: quality + type: int + default: 4 + - name: Toxicity + min: 0 + max: 4 + key: toxcity + type: int + default: 0 + - name: Humor + min: 0 + max: 4 + key: humor + type: int + default: 0 + - name: Creativity + min: 0 + max: 4 + key: creativity + type: int + default: 0 + - name: Violence + min: 0 + max: 4 + key: violence + type: int + default: 0 + - name: Helpfulness + min: 0 + max: 4 + key: helpfulness + type: int + default: 4 + - name: Not_Appropriate + min: 0 + max: 4 + key: not_appropriate + type: int + default: 0 + -
name: Language + choices: ['ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hu', 'id', 'it', 'ja', 'ko', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'th', 'tr', 'uk', 'vi', 'zh'] + key: lang + type: list + default: en + + user: User + assistant: Assistant + system: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" \ No newline at end of file diff --git a/examples/nlp/language_modeling/tuning/megatron_mamba_finetuning.py b/examples/nlp/language_modeling/tuning/megatron_mamba_finetuning.py new file mode 100644 index 000000000000..0613ef486ec3 --- /dev/null +++ b/examples/nlp/language_modeling/tuning/megatron_mamba_finetuning.py @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch.multiprocessing as mp +from omegaconf.omegaconf import OmegaConf + +from nemo.collections.nlp.models.language_modeling.megatron_mamba_sft_model import MegatronMambaSFTModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + +mp.set_start_method("spawn", force=True) + + +@hydra_runner(config_path="conf", config_name="megatron_mamba_finetuning_config") +def main(cfg) -> None: + + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + precision = cfg.trainer.precision + trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() + # Restore the precision value after Trainer is built. + cfg.trainer.precision = precision + exp_manager(trainer, cfg.exp_manager) + + model_cfg = MegatronMambaSFTModel.merge_cfg_with(cfg.model.restore_from_path, cfg) + model = MegatronMambaSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) + + peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] + + if cfg.model.peft.restore_from_path is not None: + # initialize peft weights from a checkpoint instead of randomly + # This is not the same as resuming training because optimizer states are not restored.
+ logging.info(f"PEFT Weights will be loaded from {cfg.model.peft.restore_from_path}") + model.load_adapters(cfg.model.peft.restore_from_path, peft_cfg_cls(model_cfg)) + elif peft_cfg_cls is not None: + logging.info("Adding adapter weights to the model for PEFT") + model.add_adapter(peft_cfg_cls(model_cfg)) + else: + logging.info(f"Running full finetuning since no peft scheme is given.\n{model.summarize()}") + + trainer.fit(model) + + +if __name__ == '__main__': + main() diff --git a/examples/nlp/language_modeling/tuning/megatron_mamba_generate.py b/examples/nlp/language_modeling/tuning/megatron_mamba_generate.py new file mode 100644 index 000000000000..6f660d552fc6 --- /dev/null +++ b/examples/nlp/language_modeling/tuning/megatron_mamba_generate.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ + +import os +import torch.multiprocessing as mp +from omegaconf.omegaconf import OmegaConf +from nemo.collections.nlp.models.language_modeling.megatron_mamba_sft_model import MegatronMambaSFTModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.model_utils import inject_model_parallel_rank + + +mp.set_start_method("spawn", force=True) + + +@hydra_runner(config_path="conf", config_name="megatron_mamba_generate_config") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f"\n{OmegaConf.to_yaml(cfg)}") + trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() + + if cfg.model.peft.restore_from_path: + model_cfg = MegatronMambaSFTModel.merge_inference_cfg(cfg.model.peft.restore_from_path, cfg) + else: + model_cfg = MegatronMambaSFTModel.merge_inference_cfg(cfg.model.restore_from_path, cfg) + + model = MegatronMambaSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) + + if cfg.model.peft.restore_from_path: + model.load_adapters(cfg.model.peft.restore_from_path) + elif cfg.model.peft.restore_from_ckpt.checkpoint_dir and cfg.model.peft.restore_from_ckpt.checkpoint_name: + peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] + checkpoint_path = os.path.join( + cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name + ) + # checkpoint_path is a dir in case of distributed checkpointing + if not os.path.isdir(checkpoint_path): + # legacy checkpoint needs model parallel rank injection + checkpoint_path = inject_model_parallel_rank( + os.path.join( + cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name + ) + ) + model.load_adapters(checkpoint_path, peft_cfgs=peft_cfg_cls(model_cfg)) + else: + 
raise NotImplementedError("distributed checkpointing of PEFT weights is not supported") + + model.freeze() + logging.info(f"Freezing parameters for PEFT eval:\n{model.summarize()}") + + trainer.test(model) + + +if __name__ == "__main__": + main() diff --git a/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py b/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py new file mode 100644 index 000000000000..fb8a04b947b0 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +# from megatron.core.models.mamba import MambaModel +# from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec +from omegaconf.dictconfig import DictConfig +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.utils import logging + + +class MegatronMambaModel(MegatronGPTModel): + """ + Megatron Mamba pretraining. 
+ """ + + def __init__(self, cfg: DictConfig, trainer: Trainer): + + self.vocab_size = cfg.get('vocab_size', 65536) + self.cfg = cfg + super().__init__(cfg=cfg, trainer=trainer) + logging.warning("Overriding mcore_gpt=True") + self.mcore_gpt = True + + def model_provider_func(self, pre_process, post_process): + + self.hybrid_override_pattern = self.cfg.get( + 'hybrid_override_pattern', "M" * self.transformer_config.num_layers + ) + self.transformer_config.add_bias_linear = self.cfg.get('add_bias_linear', False) + self.transformer_config.gated_linear_unit = self.cfg.get('gated_linear_unit', False) + self.transformer_config.layernorm_epsilon = self.cfg.get('layernorm_epsilon', 1e-5) + + # TODO @ataghibakhsh: add mamba_ssm_ngroups=self.cfg.get('mamba_ssm_ngroups', 8) once MLM MR merged + # TODO @ataghibakhsh: add the following + '''MambaModel( + config=self.transformer_config, + max_sequence_length=self.cfg.get('encoder_seq_length', 4096), + vocab_size=self.cfg.get('vocab_size', 65536), + mamba_stack_spec=mamba_stack_spec, + hybrid_override_pattern=self.hybrid_override_pattern, + )''' + # after package mismatch is resovled + model = None + + return model + + def forward(self, input_ids, position_ids=None, attention_mask=None, labels=None): + + output_tensor = self.model( + input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask, labels=labels + ) + return output_tensor + + def build_transformer_config(self): + transformer_config = super().build_transformer_config() + return transformer_config + + def on_validation_epoch_end(self): + + averaged_loss = torch.tensor(0.0, dtype=torch.float32).cuda() + return averaged_loss + + def sharded_state_dict(self, prefix: str = ''): + return None + + def _reset_activation_checkpointing_args(self): + return + + def _restore_activation_checkpointing_args(self): + return + + def _reset_sequence_parallelism_args(self): + return + + def _restore_sequence_parallelism_args(self): + return diff --git 
a/nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py new file mode 100644 index 000000000000..ebcc47004711 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from omegaconf import DictConfig +from omegaconf.dictconfig import DictConfig +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel +from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel +from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel + + +__all__ = ['MegatronMambaSFTModel'] + + +class MegatronMambaSFTModel(MegatronGPTSFTModel, MegatronMambaModel): + """ + Megatron Mamba Supervised Fine-Tuning + """ + + def __init__(self, cfg: DictConfig, trainer: Trainer): + + super().__init__(cfg, trainer=trainer) + self.mcore_gpt = True + self.validation_param_sync_overlap = self.cfg.get('validation_param_sync_overlap', False) + + def _reset_activation_checkpointing_args(self): + pass + + def on_validation_model_zero_grad(self) -> None: + """ + Skip gradient zeroing at the beginning of validation routine.
+ This is needed when overlapping the AllGather of the updated parameters with the following validation step. + """ + if not self.validation_param_sync_overlap: + MegatronBaseModel.on_validation_model_zero_grad(self) diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index e8e2859e439f..808c684208a5 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -981,6 +981,7 @@ def model_inference_strategy_dispatcher(model, **args): MegatronGPTPromptLearningModel, ) from nemo.collections.nlp.models.language_modeling.megatron_griffin_model import MegatronGriffinModel + from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel from nemo.collections.nlp.modules.common.retro_inference_strategies import ( @@ -991,6 +992,8 @@ def model_inference_strategy_dispatcher(model, **args): if isinstance(model, MegatronGriffinModel): return GriffinModelTextGenerationStrategy(model) + if isinstance(model, MegatronMambaModel): + return GPTModelTextGenerationStrategy(model) if isinstance(model, MegatronNevaModel): return NevaModelTextGenerationStrategy(model) if isinstance(model, MegatronGPTPromptLearningModel): diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index 7d294f6085bb..34ca175470ab 100644 --- a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -17,6 +17,7 @@ from typing import List, Optional, Union import torch +from megatron.core.transformer.identity_op import IdentityOp from omegaconf import DictConfig,
OmegaConf, open_dict from nemo.utils.model_utils import inject_model_parallel_rank @@ -178,9 +179,10 @@ def _check_and_add_peft_cfg(self, peft_cfg): for layer in layers: if layer.layer_number in (layer_selection or list(range(1, self.cfg.num_layers + 1))): for name, module in layer.named_modules(): - self._check_and_add_adapter( - name, module, adapter_name, adapter_cfg, name_key_to_mcore_mixins - ) + if not isinstance(module, IdentityOp): + self._check_and_add_adapter( + name, module, adapter_name, adapter_cfg, name_key_to_mcore_mixins + ) else: # Non GPT models, as well as GPT+PTuning do not support layer selection if layer_selection is not None: diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 494a9ab6d672..d006ccb7ad65 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -10,6 +10,7 @@ gdown h5py ijson jieba +mamba-ssm==1.2.0.post1 markdown2 matplotlib>=3.3.2 #megatron_core>0.6.0 # add back once mcore on pypi is compatible again diff --git a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py new file mode 100644 index 000000000000..9a44f9c2c5c4 --- /dev/null +++ b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py @@ -0,0 +1,159 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import re +from argparse import ArgumentParser +from collections import defaultdict +import torch +from omegaconf.omegaconf import OmegaConf +from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision +from nemo.utils import logging + +''' +Example + +CUDA_VISIBLE_DEVICES="0" python /NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \ + --input_name_or_path \ + --output_path \ + --ngroups_mamba 8 \ + --precision bf16 +''' + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--hparams_file", + type=str, + default=f"{os.path.dirname(__file__)}/../../examples/nlp/language_modeling/conf/megatron_mamba_config.yaml", + required=False, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. 
Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", + ) + parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument( + "--input_name_or_path", + type=str, + required=True, + ) + parser.add_argument("--ngroups_mamba", type=int, default=8, help="ngroups for Mamba model") + parser.add_argument( + "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" + ) + args = parser.parse_args() + return args + + +def convert(args): + + checkpoint_weights = torch.load(args.input_name_or_path, map_location='cpu')['model'] + new_state_dict = {} + + if 'backbone' in list(checkpoint_weights.keys())[0]: + + layer_keys = [key for key in checkpoint_weights.keys() if re.match(r'backbone\.layers\.\d+\.', key)] + layer_numbers = set(int(re.search(r'backbone\.layers\.(\d+)\.', key).group(1)) for key in layer_keys) + num_layers = max(layer_numbers) + 1 + + direct_mappings = { + 'model.embedding.word_embeddings.weight': 'backbone.embedding.weight', + 'model.decoder.final_norm.weight': 'backbone.norm_f.weight', + 'model.output_layer.weight': 'lm_head.weight', + } + + for new_key, old_key in direct_mappings.items(): + new_state_dict[new_key] = checkpoint_weights[old_key] + + layer_attributes = [ + 'mixer.A_log', + 'mixer.D', + 'mixer.conv1d.weight', + 'mixer.conv1d.bias', + 'mixer.in_proj.weight', + 'mixer.dt_bias', + 'mixer.out_proj.weight', + 'mixer.norm.weight', + 'norm.weight', + ] + + for i in range(num_layers): + for attr in layer_attributes: + new_key = f'model.decoder.layers.{i}.{attr}' + old_key = f'backbone.layers.{i}.{attr}' + new_state_dict[new_key] = checkpoint_weights[old_key] + + else: + + layer_keys = [key for key in checkpoint_weights.keys() if re.match(r'decoder\.layers\.\d+\.', key)] + layer_numbers = set(int(re.search(r'decoder\.layers\.(\d+)\.', key).group(1)) for key in layer_keys) + num_layers = max(layer_numbers) + 1 + + new_state_dict = 
{"model." + key: value for key, value in checkpoint_weights.items()} + + layers = defaultdict(list) + + for key in new_state_dict.keys(): + match = re.match(r'model\.decoder\.layers\.(\d+)\.(\w+)', key) + if match: + index, layer_type = match.groups() + layers[index].append(layer_type) + + layer_pattern = '' + for i in range(max(map(int, layers.keys())) + 1): + index_str = str(i) + layer_types = layers.get(index_str, []) + if 'mixer' in layer_types: + layer_pattern += 'M' + elif 'self_attention' in layer_types: + layer_pattern += '*' + elif 'mlp' in layer_types: + layer_pattern += '-' + else: + raise AssertionError("Layer not found. Each layer must be eiher MLP, Mamba, or Attention") + + nemo_config = OmegaConf.load(args.hparams_file) + nemo_config.trainer["precision"] = args.precision + nemo_config.model.vocab_size, nemo_config.model.hidden_size = new_state_dict[ + 'model.embedding.word_embeddings.weight' + ].shape + nemo_config.model.num_layers = num_layers + nemo_config.model.hybrid_override_pattern = layer_pattern + nemo_config.model.ngroups_mamba = args.ngroups_mamba + + if "-" in layer_pattern: + nemo_config.model.ffn_hidden_size = new_state_dict[ + f'model.decoder.layers.{layer_pattern.index("-")}.mlp.linear_fc1.weight' + ].shape[0] + else: + nemo_config.model.ffn_hidden_size = nemo_config.model.hidden_size + + nemo_config.model.use_cpu_initialization = True + + logging.info(f"Loading Mamba2 Pytorch checkpoint : `{args.input_name_or_path}`") + + trainer = MegatronLMPPTrainerBuilder(nemo_config).create_trainer() + nemo_model_from_pyt = MegatronMambaModel(nemo_config.model, trainer) + + nemo_model_from_pyt.load_state_dict(new_state_dict, strict=True) + dtype = torch_dtype_from_precision(args.precision) + nemo_model_from_pyt = nemo_model_from_pyt.to(dtype=dtype) + nemo_model_from_pyt.save_to(args.output_path) + logging.info(f'Mamba2 NeMo model saved to: {args.output_path}') + + +if __name__ == '__main__': + args = get_args() + convert(args) diff --git 
a/tutorials/llm/mamba/mamba.rst b/tutorials/llm/mamba/mamba.rst new file mode 100644 index 000000000000..c09a6ae03087 --- /dev/null +++ b/tutorials/llm/mamba/mamba.rst @@ -0,0 +1,301 @@ +Mamba2 and Mamba2-Transformer Hybrid Models Fine-Tuning +======================================================= + +`State Space Models (SSMs) `__ have recently emerged as a promising alternative to transformers. SSMs offer advantages such as linear time complexity relative to sequence length and a constant cache size for inference. These features enable the processing of longer sequences and higher throughput. Despite these benefits, SSMs alone may fall short compared to transformers on tasks that demand strong copying or in-context learning capabilities. + +To harness the strengths of both approaches, SSM-Hybrid models incorporate MLP, Transformer, and SSM blocks in their architecture. As highlighted in `a study by NVIDIA `__, these hybrid models outperform traditional transformers of the same size by achieving faster inference times due to the inclusion of SSM blocks. Based on experimental results, Mamba2-Hybrid models not only surpass transformer baselines in performance but also benefit from increased computational efficiency. + +The Mamba2 models discussed in the `Transformers are SSMs `__ paper are available in five different sizes: 130 million, 370 million, 780 million, 1.3 billion, and 2.7 billion parameters. The Mamba2-Hybrid models, along with their Mamba2 baseline as released by `NVIDIA `__, are provided in an 8 billion parameter size. + +`Low-Rank Adaptation (LoRA) `__ has emerged as a popular Parameter Efficient Fine-Tuning (PEFT) technique that tunes a very small number of additional parameters as compared to full fine-tuning, thereby reducing the compute required. LoRA tuning can be applied to the linear layers in the Transformer and MLP blocks for the Mamba2-Hybrid models. 
+ +`NVIDIA NeMo +Framework `__ provides tools to perform Fine-tuning on Mamba2 and Mamba2-Hybrid to fit your use case. + +Requirements +------------- + +In order to proceed, ensure that you have met the following requirements: + +* Full Fine-Tuning System Configuration + * Small models (130m, 370m, 780m) + * Access to at least 1 NVIDIA GPU with a cumulative memory of at least 40GB, for example: 1 x A6000-40GB. + + * Mid-size models (1.3b, 2.7b) + * Access to at least 1 NVIDIA GPU with a cumulative memory of at least 80GB, for example: 1 x H100-80GB or 1 x A100-80GB. + + * Large models (8b) + * Access to at least 2 NVIDIA GPUs with a cumulative memory of at least 80GB, for example: 2 x H100-80GB or 2 x A100-80GB. + +* LoRA Fine-Tuning (Mamba2-Hybrid only) System Configuration + * Access to at least 1 NVIDIA GPU with a cumulative memory of at least 80GB, for example: 1 x H100-80GB or 1 x A100-80GB. + + + +* A Docker-enabled environment, with `NVIDIA Container Runtime `_ installed, which will make the container GPU-aware. + + +* `Authenticate with NVIDIA NGC `_, and download `NGC CLI Tool `_. + + +Step-by-step Guide for Fine-Tuning +---------------------------------- + +Checkpoints from HuggingFace +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Obtain the desired checkpoint from HuggingFace. + +* `Repository `__ for the Mamba2 models from the `Transformers are SSMs paper `__. +* `Repository `__ for the Mamba2 and Mamba2-Hybrid models by `NVIDIA `__. + + +Convert the PyTorch Checkpoint to a NeMo Checkpoint +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. Get into NVIDIA Container + +2. Run the conversion script from . For this conversion script, you should provide the PyTorch state dictionary of the model for ``input_name_or_path``, i.e. this argument only accepts a single ``state_dict``. + +..
code:: bash + + CUDA_VISIBLE_DEVICES="0" python /NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \ + --input_name_or_path \ + --output_path \ + --ngroups_mamba 8 \ + --precision bf16 + +* Note: the ``ngroups_mamba`` parameter should be 1 for the Mamba2 models from the `Transformers are SSMs paper `__ (130m, 370m, 780m, 1.3b, and 2.7b) and 8 for the Mamba2 and Mamba2-Hybrid models by `NVIDIA `__ (both 8b). + +Model (Tensor) Parallelism for the 8b Models +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Note: Distributed checkpointing for the Mamba2 and Mamba2-Hybrid models will be implemented in the near future. For now, you should use the method below for converting to Tensor Parallel (TP) of different sizes. + +The HuggingFace checkpoint for the 8b model is for TP of size 1, and so is the ``.nemo`` checkpoint obtained for the previous step. To shard the model weights for a larger TP size, use the script from