Finetune T5 on the prefix-lm objective (#4328)
* Add script and yaml config (Signed-off-by: MaximumEntropy <sandeep.subramanian.1@umontreal.ca>)
* Fix yaml config (Signed-off-by: MaximumEntropy <sandeep.subramanian.1@umontreal.ca>)
* Style (Signed-off-by: MaximumEntropy <sandeep.subramanian.1@umontreal.ca>)
* Update yaml to remove hardcoded model path (Signed-off-by: MaximumEntropy <sandeep.subramanian.1@umontreal.ca>)

Co-authored-by: Oleksii Kuchaiev <okuchaiev@users.noreply.github.com>
1 parent 968ee12, commit 416d033. Showing 2 changed files with 242 additions and 0 deletions.
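For context, the prefix-LM ("LM adaptation") objective named in the commit title splits each training sequence at a pivot point: the encoder reads the prefix and the decoder is trained to generate the continuation, instead of reconstructing masked spans as in standard T5 pretraining. The snippet below is a minimal illustrative sketch of that split under assumed tokenization details; it is not the dataset code selected by this commit (that is chosen via dataset_type: 't5_prefix_lm' in the config below), and the helper name, token ids, and truncation behavior are hypothetical.

# Minimal sketch, NOT NeMo's implementation: build one prefix-LM example from a
# list of token ids by splitting it at a random pivot.
import random

def build_prefix_lm_example(token_ids, bos_id, eos_id, max_enc_len, max_dec_len):
    # Pivot somewhere inside the sequence: everything before it becomes the
    # encoder input ("prefix"), everything after it the target continuation.
    pivot = random.randint(1, len(token_ids) - 1)
    enc_input = token_ids[:pivot][:max_enc_len]
    continuation = token_ids[pivot:][: max_dec_len - 1]
    # Decoder input is the continuation right-shifted with <bos>; labels end with <eos>.
    dec_input = [bos_id] + continuation
    labels = continuation + [eos_id]
    return enc_input, dec_input, labels

The reserved position for <bos>/<eos> and the right shift are also why the script below sets the decoder sequence length to the pretrained encoder sequence length minus one.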
examples/nlp/language_modeling/conf/megatron_t5_lm_adaptation_finetune.yaml (101 additions, 0 deletions)
@@ -0,0 +1,101 @@
name: megatron_t5_lm_adaptation_finetune
restore_from_path: null # used when starting from a .nemo file

trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: 1000 # PTL default. In practice, max_steps will be reached first.
  max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10
  val_check_interval: 100
  limit_val_batches: 50
  limit_test_batches: 500
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0

exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: megatron_t5_lm_adaptation_finetune
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
    filename: 'megatron_t5--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}

model:
  # pretrained model path
  pretrained_model_path: ???

  # model parallelism
  micro_batch_size: 4
  global_batch_size: 8 # will use more micro batches to reach global batch size
  tensor_model_parallel_size: 2
  pipeline_model_parallel_size: 1
  resume_from_checkpoint: null # manually set the checkpoint file to load from
  pipeline_model_parallel_split_rank: 1

  # O2 mixed precision
  megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting.

  # JIT fusion params.
  bias_gelu_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent gelu activation.
  masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
  bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.

  gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

  # Dropout
  hidden_dropout: null
  attention_dropout: null

  data:
    # Path to data must be specified by the user.
    # Can be overridden from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-t5_00_text_document,.5,/raid/data/pile/my-t5_01_text_document]"
    # or see the example below:
    # data_prefix:
    #   - .5
    #   - /raid/data/pile/my-t5_00_text_document
    #   - .5
    #   - /raid/data/pile/my-t5_01_text_document
    data_prefix: ???
    index_mapping_dir: null
    data_impl: mmap
    splits_string: 949,45,5
    seq_length: ${model.seq_length}
    seq_length_dec: 128
    skip_warmup: True
    num_workers: 0
    dataloader_type: single # cyclic
    masked_lm_prob: 0.15
    dataset_type: 't5_prefix_lm'
    short_seq_prob: 0.0
    max_ngram_size: 10
    mean_ngram_size: null
    geometric_dist: True
    permutation: False
    whole_word_masking: True
    favor_longer_ngrams: False

  optim:
    name: fused_adam
    lr: 5e-6
    betas:
      - 0.9
      - 0.999
    eps: 1e-8
    weight_decay: 0.01
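Both ??? fields in the config above (model.pretrained_model_path and model.data.data_prefix) have to be supplied at launch time. Because the script in the next file is decorated with hydra_runner, they can be passed as Hydra-style command-line overrides; a hypothetical invocation (paths are placeholders, and parallelism settings should match the pretrained checkpoint) might look like:

python megatron_t5_lm_adaptation_finetune.py \
    model.pretrained_model_path=/path/to/megatron_t5.nemo \
    model.data.data_prefix=[1.0,/path/to/my-t5_00_text_document]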
examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py (141 additions, 0 deletions)
@@ -0,0 +1,141 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelSummary
from pytorch_lightning.callbacks.timer import Timer
from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model
from nemo.collections.nlp.parts.nlp_overrides import (
    GradScaler,
    MegatronHalfPrecisionPlugin,
    NLPDDPPlugin,
    NLPSaveRestoreConnector,
    PipelineMixedPrecisionPlugin,
)
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import StatelessTimer, exp_manager


@hydra_runner(config_path="conf", config_name="megatron_t5_lm_adaptation_finetune")
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,  # we don't use DDP for async grad allreduce
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))
        else:
            plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)])
    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager
    if cfg.model.resume_from_checkpoint is not None:
        resume_from_checkpoint = cfg.model.resume_from_checkpoint
    else:
        resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')

    trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)
    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,)

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    if hasattr(cfg.model, 'pretrained_model_path') and cfg.model.pretrained_model_path is not None:
        pretrained_cfg = MegatronT5Model.restore_from(
            cfg.model.pretrained_model_path, trainer=trainer, return_config=True
        )
        OmegaConf.set_struct(pretrained_cfg, True)
        with open_dict(pretrained_cfg):

            # Override data from T5 to Prefix-LM
            encoder_seq_length = pretrained_cfg.data.seq_length
            decoder_seq_length = (
                pretrained_cfg.data.seq_length
            )  # Set decoder seq length to be encoder seq length for prefix-lm
            pretrained_cfg.data = cfg.model.data
            pretrained_cfg.data.seq_length = encoder_seq_length
            pretrained_cfg.data.seq_length_dec = (
                decoder_seq_length - 1
            )  # -1 is to account for the addition of <bos> and <eos> and right shifting to create targets.

            # Override fusion params.
            pretrained_cfg.masked_softmax_fusion = cfg.model.masked_softmax_fusion
            pretrained_cfg.bias_dropout_add_fusion = cfg.model.bias_dropout_add_fusion
            pretrained_cfg.bias_gelu_fusion = cfg.model.bias_gelu_fusion

            # Override dropout
            if cfg.model.hidden_dropout is not None:
                pretrained_cfg.hidden_dropout = cfg.model.hidden_dropout

            if cfg.model.attention_dropout is not None:
                pretrained_cfg.attention_dropout = cfg.model.attention_dropout

            # Override precision
            pretrained_cfg.precision = trainer.precision  # Set above from trainer.precision

            # Override micro/global batch
            pretrained_cfg.micro_batch_size = cfg.model.micro_batch_size
            pretrained_cfg.global_batch_size = cfg.model.global_batch_size

            # O2 AMP
            pretrained_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False)

            # Optimizer overrides.
            pretrained_cfg.optim = cfg.model.optim

        model = MegatronT5Model.restore_from(
            cfg.model.pretrained_model_path,
            trainer=trainer,
            override_config_path=pretrained_cfg,
            save_restore_connector=NLPSaveRestoreConnector(),
        )
    else:
        raise ValueError(f'No pretrained model path specified or it does not exist: {cfg.model.pretrained_model_path}')

    trainer.fit(model)


if __name__ == '__main__':
    main()