Parallel prompt tuning (#3670)
* Started combined tensor parallel and pipeline parallel changes

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Gets through validation sanity checks

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Still working through bugs

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Able to run training but virtual token parameters don't get updated

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Params weren't updating because they weren't set up with the optimizer

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Parallel with single GPU is working!

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Tensor parallel = 2 is working

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Tensor parallel working and code cleaned up

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Added prompt tuning testing back in

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Complete method works again for prompt-tuned models

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Removed random imports

Signed-off-by: Virginia Adams <vadams@nvidia.com>
vadam5 authored and fayejf committed Mar 2, 2022
1 parent 8925335 commit 3e342f8
Showing 10 changed files with 494 additions and 202 deletions.
47 changes: 21 additions & 26 deletions Jenkinsfile
@@ -2085,43 +2085,38 @@ pipeline {
16"
}
}

stage('L2: Megatron GPT Prompt Tuning and Inference') {
when {
anyOf {
branch 'r1.6.1'
changeRequest target: 'r1.6.1'
branch 'main'
changeRequest target: 'main'
}
}
failFast true
steps {
sh "python tests/collections/nlp/test_prompt_tuning.py"
sh "python examples/nlp/language_modeling/megatron_gpt_prompt_tuning.py \
--config-name=megatron_gpt_config \
trainer.gpus=1 \
trainer.max_steps=10 \
trainer.val_check_interval=1 \
exp_manager.name='megatron_gpt125M_prompt_tuning' \
exp_manager.checkpoint_callback_params.save_top_k=2 \
exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True \
restore_from_path='/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo' \
+model.use_soft_prompts=True \
+model.num_prompt_tokens=10 \
+model.new_prompt_tags=['Winogrande, BoolQ'] \
+model.new_prompt_init_text=['logic choose person name, None'] \
+model.new_prompt_init_methods=['text, random'] \
model.data.data_prefix=None \
+model.data.train_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_train.json' \
+model.data.valid_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_val.json' \
+model.data.test_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_val.json' \
+model.data.batch_size=8 \
model.optim.lr=2e-2 \
model.optim.sched.min_lr=2e-3 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=8 \
model.encoder_seq_length=2048"
--config-name=megatron_prompt_tuning_gpt \
restore_from_path='/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo' \
trainer.val_check_interval=2 \
trainer.max_steps=5 \
model.new_prompt_tags=['Winogrande, BoolQ'] \
model.new_prompt_init_text=['logic choose person name, None'] \
model.new_prompt_init_methods=['text, random'] \
model.data.train_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_train.json' \
model.data.valid_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_val.json' \
+model.data.test_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_val.json' \
model.micro_batch_size=2 \
model.global_batch_size=4 \
model.optim.lr=2e-2 \
model.optim.sched.min_lr=2e-3 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=8 \
model.encoder_seq_length=2048"
sh "python examples/nlp/language_modeling/megatron_gpt_eval.py \
--use_soft_prompts \
--model_file=nemo_experiments/megatron_gpt125M_prompt_tuning/checkpoints/megatron_gpt125M_prompt_tuning.nemo \
--model_file=nemo_experiments/PromptTuning/checkpoints/PromptTuning.nemo \
--tokens_to_generate=3 \
--prompt_tag='Winogrande' \
--prompt='option1: wood option2: bag sentence: The _ is soft. answer:'"
Empty file modified examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
100644 → 100755
Empty file.
129 changes: 129 additions & 0 deletions examples/nlp/language_modeling/conf/megatron_prompt_tuning_gpt.yaml
@@ -0,0 +1,129 @@
name: PromptTuning
restore_from_path: ??? # used when starting from a .nemo file

trainer:
gpus: 1
num_nodes: 1
accelerator: ddp
precision: 32
logger: False # logger provided by exp_manager
checkpoint_callback: False
replace_sampler_ddp: False
max_epochs: null
max_steps: 1000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
log_every_n_steps: 10
val_check_interval: 50
limit_val_batches: 50
limit_test_batches: 500
accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
gradient_clip_val: null

exp_manager:
explicit_log_dir: null
exp_dir: null
name: PromptTuning
create_wandb_logger: False
wandb_logger_kwargs:
project: None
name: None
resume_if_exists: True
resume_ignore_no_checkpoint: True
create_checkpoint_callback: True
checkpoint_callback_params:
monitor: val_loss
save_top_k: 3
mode: min
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
save_nemo_on_train_end: True # not recommended when training large models on clusters with short time limits
filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}


model:
# specify micro_batch_size, global_batch_size, and model parallelism
# gradient accumulation will be done automatically based on data_parallel_size
micro_batch_size: 4 # limited by GPU memory
global_batch_size: 16 # will use more micro batches to reach global batch size
tensor_model_parallel_size: 1 # intra-layer model parallelism
pipeline_model_parallel_size: 1 # inter-layer model parallelism

# model architecture
encoder_seq_length: 2048
max_position_embeddings: ${.encoder_seq_length}
num_layers: 12
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.
hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
layernorm_epsilon: 1e-5
make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
pre_process: True # add embedding
post_process: True # add pooler
persist_layer_norm: True # Use of persistent fused layer norm kernel.
gradient_as_bucket_view: False # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

tokenizer:
library: 'megatron'
type: 'GPT2BPETokenizer'
model: null
vocab_file: null
merge_file: null

# Prompt Tuning
use_soft_prompts: True
num_prompt_tokens: 150
existing_prompt_tags: []
new_prompt_tags: ???
new_prompt_init_text: ['some initialization text goes here']
new_prompt_init_methods: ['text']
calc_loss_on_answer_only: False


# precision
native_amp_init_scale: 4294967296 # 2 ** 32
native_amp_growth_interval: 1000
hysteresis: 2 # Gradient scale hysteresis
fp32_residual_connection: False # Move residual connections to fp32
fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

# Megatron O2-style half-precision
megatron_amp_O2: False # Enable O2-level automatic mixed precision using master parameters

# miscellaneous
seed: 1234
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this

activations_checkpoint_method: null # 'uniform', 'block'
activations_checkpoint_num_layers: 1

data:
data_prefix: None
train_ds: ???
valid_ds: ???
data_impl: mmap
splits_string: 900,50,50
seq_length: ${model.encoder_seq_length}
skip_warmup: True
num_workers: 0
dataloader_type: single # cyclic
reset_position_ids: False # Reset position ids after end-of-document token
reset_attention_mask: False # Reset attention mask after end-of-document token
eod_mask_loss: False # Mask loss for the end of document tokens

optim:
name: fused_adam
lr: 2e-4
weight_decay: 0.01
betas:
- 0.9
- 0.98
sched:
name: CosineAnnealing
warmup_steps: 50
constant_steps: 10
min_lr: 2e-5
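
The batch-size comments in the model section above imply a fixed relationship between micro batch size, global batch size, data-parallel size, and gradient accumulation. The snippet below is only a minimal sketch of that arithmetic, not NeMo's actual implementation; the world size is an assumed example value.

# Illustrative sketch of the batch-size arithmetic described in the config comments.
# Not NeMo code; world_size is an assumed example value.
micro_batch_size = 4        # model.micro_batch_size
global_batch_size = 16      # model.global_batch_size
tensor_parallel = 1         # model.tensor_model_parallel_size
pipeline_parallel = 1       # model.pipeline_model_parallel_size
world_size = 2              # hypothetical number of GPUs

model_parallel_size = tensor_parallel * pipeline_parallel  # matches exp_manager.model_parallel_size
data_parallel_size = world_size // model_parallel_size     # 2 in this example

# Micro batches accumulated per optimizer step so that one step consumes
# exactly global_batch_size samples.
accumulate_grad_batches = global_batch_size // (micro_batch_size * data_parallel_size)  # 2

# consumed_samples after a given global step, per the trainer.max_steps comment.
global_step = 1000
consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches  # 16000
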
84 changes: 59 additions & 25 deletions examples/nlp/language_modeling/megatron_gpt_prompt_tuning.py
100644 → 100755
@@ -1,4 +1,4 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,22 +14,31 @@

from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.timer import Timer
from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment

from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin
from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
from nemo.collections.nlp.parts.nlp_overrides import (
GradScaler,
MegatronHalfPrecisionPlugin,
NLPDDPPlugin,
PipelineMixedPrecisionPlugin,
)
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager
from nemo.utils.app_state import AppState
from nemo.utils.exp_manager import StatelessTimer, exp_manager


"""
Can currently only prompt tune on one task at a time, but can
run inference with multiple soft-prompts/tasks within a batch.
Datasets should be formatted in a json file like:
{"prompt_tag": <tag1>, "text": <text1>}
{"prompt_tag": <tag1>, "text": <text2>}
{"prompt_tag": <tag2>, "text": <text3>}
{"prompt_tag": <tag1>, "text": <text1>, "answer": <answer1>}
{"prompt_tag": <tag1>, "text": <text2>, "answer": <answer2>}
{"prompt_tag": <tag1>, "text": <text3>, "answer": <answer3>}
Example Usage for first prompt tuning task:
@@ -139,38 +148,63 @@
"""


@hydra_runner(config_path="conf", config_name="megatron_gpt_config")
@hydra_runner(config_path="conf", config_name="megatron_prompt_tuning_gpt")
def main(cfg) -> None:
logging.info("\n\n************** Experiment configuration ***********")
logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

plugins = [NLPDDPPlugin(num_nodes=cfg.trainer.num_nodes)]
megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
plugins = [
NLPDDPPlugin(
num_nodes=cfg.trainer.num_nodes,
no_ddp_communication_hook=True,
gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
)
]
if cfg.trainer.precision in [16, 'bf16']:
scaler = None
if cfg.trainer.precision == 16:
scaler = GradScaler(
init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
hysteresis=cfg.model.get('hysteresis', 2),
)
if megatron_amp_o2:
plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))
else:
plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))

trainer = Trainer(plugins=plugins, **cfg.trainer)
if cfg.get('cluster_type', None) == 'BCP':
plugins.append(TorchElasticEnvironment())

trainer = Trainer(plugins=plugins, **cfg.trainer)
exp_manager(trainer, cfg.exp_manager)

app_state = AppState()
if cfg.model.tensor_model_parallel_size > 1 or cfg.model.pipeline_model_parallel_size > 1:
app_state.model_parallel_size = cfg.model.tensor_model_parallel_size * cfg.model.pipeline_model_parallel_size
(
app_state.tensor_model_parallel_rank,
app_state.pipeline_model_parallel_rank,
app_state.model_parallel_size,
_,
) = fake_initialize_model_parallel(
world_size=app_state.model_parallel_size,
rank=trainer.global_rank,
tensor_model_parallel_size_=cfg.model.tensor_model_parallel_size,
pipeline_model_parallel_size_=cfg.model.pipeline_model_parallel_size,
)

# Override timer callback to a stateless one
for idx, callback in enumerate(trainer.callbacks):
if isinstance(callback, Timer):
trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,)

# hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
with open_dict(cfg):
cfg.model.precision = cfg.trainer.precision

model = MegatronGPTModel.restore_from(cfg.restore_from_path, cfg.model, trainer=trainer)

# Init all new prompts
for idx, tag in enumerate(cfg.model.new_prompt_tags):
init_method = cfg.model.new_prompt_init_methods[idx]

if init_method == "text":
init_text = cfg.model.new_prompt_init_text[idx]
model.init_prompt_from_text(tag, init_text)

elif init_method == 'random':
model.init_prompt_from_random(tag)

else:
logging.info(f'\n Soft prompt init method {init_method} is not recognized, please use text or random')

logging.info(f'\nCurrent soft prompts include {model.get_prompt_table()}')
trainer.fit(model)
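
For reference, the docstring above expects json-lines records with prompt_tag, text, and answer fields. The sketch below shows one hypothetical way to produce such a file; the file name and records are made up, and a single tag is used because only one task can be prompt tuned at a time.

# Illustrative only: write a prompt-tuning dataset in the json-lines format the
# docstring describes. File name and records are hypothetical examples.
import json

records = [
    {"prompt_tag": "BoolQ", "text": "passage: ... question: ... answer:", "answer": " True"},
    {"prompt_tag": "BoolQ", "text": "passage: ... question: ... answer:", "answer": " False"},
]

with open("boolq_prompt_tuning_train.jsonl", "w") as fout:
    for record in records:
        fout.write(json.dumps(record) + "\n")
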

