Commit
* Started combined tensor parallel and pipeline parallel changes. Signed-off-by: Virginia Adams <vadams@nvidia.com>
* Gets through validation sanity checks. Signed-off-by: Virginia Adams <vadams@nvidia.com>
* Still working through bugs. Signed-off-by: Virginia Adams <vadams@nvidia.com>
* Able to run training, but virtual token parameters don't get updated. Signed-off-by: Virginia Adams <vadams@nvidia.com>
* Params weren't updating because they weren't set up with the optimizer. Signed-off-by: Virginia Adams <vadams@nvidia.com>
* Parallel with a single GPU is working. Signed-off-by: Virginia Adams <vadams@nvidia.com>
* Tensor parallel = 2 is working. Signed-off-by: Virginia Adams <vadams@nvidia.com>
* Tensor parallel working and code cleaned up. Signed-off-by: Virginia Adams <vadams@nvidia.com>
* Added prompt tuning testing back in. Signed-off-by: Virginia Adams <vadams@nvidia.com>
* Complete method works again for prompt-tuned models. Signed-off-by: Virginia Adams <vadams@nvidia.com>
* Removed random imports. Signed-off-by: Virginia Adams <vadams@nvidia.com>
Showing 10 changed files with 494 additions and 202 deletions.
examples/nlp/language_modeling/conf/megatron_prompt_tuning_gpt.yaml (129 additions, 0 deletions)
name: PromptTuning
restore_from_path: ??? # used when starting from a .nemo file

trainer:
  gpus: 1
  num_nodes: 1
  accelerator: ddp
  precision: 32
  logger: False # logger provided by exp_manager
  checkpoint_callback: False
  replace_sampler_ddp: False
  max_epochs: null
  max_steps: 1000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10
  val_check_interval: 50
  limit_val_batches: 50
  limit_test_batches: 500
  accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
  gradient_clip_val: null

exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: PromptTuning
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: None
    name: None
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 3
    mode: min
    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
    save_nemo_on_train_end: True # not recommended when training large models on clusters with short time limits
    filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}

model:
  # specify micro_batch_size, global_batch_size, and model parallelism
  # gradient accumulation will be done automatically based on data_parallel_size
  micro_batch_size: 4 # limited by GPU memory
  global_batch_size: 16 # will use more micro batches to reach global batch size
  tensor_model_parallel_size: 1 # intra-layer model parallelism
  pipeline_model_parallel_size: 1 # inter-layer model parallelism

  # model architecture
  encoder_seq_length: 2048
  max_position_embeddings: ${.encoder_seq_length}
  num_layers: 12
  hidden_size: 768
  ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
  num_attention_heads: 12
  init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.
  hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
  kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
  apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
  layernorm_epsilon: 1e-5
  make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
  pre_process: True # add embedding
  post_process: True # add pooler
  persist_layer_norm: True # Use of persistent fused layer norm kernel.
  gradient_as_bucket_view: False # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

  tokenizer:
    library: 'megatron'
    type: 'GPT2BPETokenizer'
    model: null
    vocab_file: null
    merge_file: null

  # Prompt Tuning
  use_soft_prompts: True
  num_prompt_tokens: 150
  existing_prompt_tags: []
  new_prompt_tags: ???
  new_prompt_init_text: ['some initialization text goes here']
  new_prompt_init_methods: ['text']
  calc_loss_on_answer_only: False

  # precision
  native_amp_init_scale: 4294967296 # 2 ** 32
  native_amp_growth_interval: 1000
  hysteresis: 2 # Gradient scale hysteresis
  fp32_residual_connection: False # Move residual connections to fp32
  fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

  # Megatron O2-style half-precision
  megatron_amp_O2: False # Enable O2-level automatic mixed precision using master parameters

  # miscellaneous
  seed: 1234
  use_cpu_initialization: False # Init weights on the CPU (slow for large models)
  onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
  apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this

  activations_checkpoint_method: null # 'uniform', 'block'
  activations_checkpoint_num_layers: 1

  data:
    data_prefix: None
    train_ds: ???
    valid_ds: ???
    data_impl: mmap
    splits_string: 900,50,50
    seq_length: ${model.encoder_seq_length}
    skip_warmup: True
    num_workers: 0
    dataloader_type: single # cyclic
    reset_position_ids: False # Reset position ids after end-of-document token
    reset_attention_mask: False # Reset attention mask after end-of-document token
    eod_mask_loss: False # Mask loss for the end of document tokens

  optim:
    name: fused_adam
    lr: 2e-4
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 50
      constant_steps: 10
      min_lr: 2e-5
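The checkpoint callback's model_parallel_size is derived by the ${multiply:...} OmegaConf interpolation from model.tensor_model_parallel_size and model.pipeline_model_parallel_size. The following is a minimal sketch, assuming OmegaConf 2.1+, of loading this config outside of a NeMo training run and resolving that product; the explicit resolver registration and the hard-coded file path are illustrative only and just keep the sketch self-contained, they are not NeMo's own loading path.

# Minimal sketch (not NeMo's loading path): read the YAML above with OmegaConf
# and resolve model_parallel_size the way the ${multiply:...} interpolation defines it.
# Assumes omegaconf >= 2.1 is installed; the file path below is illustrative.
from omegaconf import OmegaConf

# The config references a "multiply" resolver; register one so the sketch is self-contained.
OmegaConf.register_new_resolver("multiply", lambda x, y: x * y)

cfg = OmegaConf.load("examples/nlp/language_modeling/conf/megatron_prompt_tuning_gpt.yaml")

tp = cfg.model.tensor_model_parallel_size    # intra-layer model parallelism
pp = cfg.model.pipeline_model_parallel_size  # inter-layer model parallelism

# Resolves to tp * pp via the registered resolver (1 * 1 with the defaults above).
mp = cfg.exp_manager.checkpoint_callback_params.model_parallel_size
print(f"tensor={tp} pipeline={pp} model_parallel_size={mp}")
print(f"soft prompt tokens per task: {cfg.model.num_prompt_tokens}")

With the defaults above (trainer.gpus: 1, TP = 1, PP = 1) the data parallel size is also 1, since data_parallel_size = world_size / (TP * PP); it only grows once trainer.gpus or trainer.num_nodes is raised. Note that the ??? entries (restore_from_path, model.new_prompt_tags, model.data.train_ds, model.data.valid_ds) are mandatory-missing OmegaConf values and must be overridden before training.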