IA3 support for GPT and T5 (#4909)
* init commit for IA3 adapter training in GPT

Signed-off-by: arendu <adithya.r@gmail.com>

* IA3 adapter training in GPT, models and adapter classes

Signed-off-by: arendu <adithya.r@gmail.com>

* reshape to operate even on non-contiguous tensors

Signed-off-by: arendu <adithya.r@gmail.com>

* configs

Signed-off-by: arendu <adithya.r@gmail.com>

* fixed none init

Signed-off-by: arendu <adithya.r@gmail.com>

* adding adapter and ia3 support for T5 based models

Signed-off-by: arendu <adithya.r@gmail.com>

* style fix

Signed-off-by: arendu <adithya.r@gmail.com>

* config update and T5 model adapter and IA3

Signed-off-by: arendu <adithya.r@gmail.com>

* removed unused imports

Signed-off-by: arendu <adithya.r@gmail.com>

* predict step for inference

Signed-off-by: arendu <adithya.r@gmail.com>

* style fix

Signed-off-by: arendu <adithya.r@gmail.com>

* style fix

Signed-off-by: arendu <adithya.r@gmail.com>

* adapter inference for t5

Signed-off-by: arendu <adithya.r@gmail.com>

* style fix

Signed-off-by: arendu <adithya.r@gmail.com>

* fixed micro and global batch size bug in eval

Signed-off-by: arendu <adithya.r@gmail.com>

* minor edit

Signed-off-by: arendu <adithya.r@gmail.com>

* aggressive truncation in test examples if no truncation field is given

Signed-off-by: arendu <adithya.r@gmail.com>

* corrected for language_model_path name changes in main

Signed-off-by: arendu <adithya.r@gmail.com>

* removed unused import

Signed-off-by: arendu <adithya.r@gmail.com>

* name change for language_model_path

Signed-off-by: arendu <adithya.r@gmail.com>

* include inter_attention in IA3

Signed-off-by: arendu <adithya.r@gmail.com>

* minor fix in config

Signed-off-by: arendu <adithya.r@gmail.com>

* minor fixes

Signed-off-by: arendu <adithya.r@gmail.com>

* removed unused flag

Signed-off-by: arendu <adithya.r@gmail.com>

* addressing PR comments

Signed-off-by: arendu <adithya.r@gmail.com>

* address PR comments

Signed-off-by: arendu <adithya.r@gmail.com>

* minor fix

Signed-off-by: arendu <adithya.r@gmail.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* style fix

Signed-off-by: arendu <adithya.r@gmail.com>

* CI test

Signed-off-by: arendu <adithya.r@gmail.com>

* minor fix in jenkinsfile

Signed-off-by: arendu <adithya.r@gmail.com>

Signed-off-by: arendu <adithya.r@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
arendu and pre-commit-ci[bot] authored Oct 3, 2022
1 parent bd1209c commit cb2793c
Showing 22 changed files with 1,923 additions and 94 deletions.
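For context on what the diff below wires up: IA3 (Infused Adapter by Inhibiting and Amplifying Inner Activations) learns small per-channel scaling vectors that multiply the frozen model's key, value, and feed-forward activations, so only those tiny vectors are trained while the base GPT or T5 weights stay frozen. The following is a minimal PyTorch sketch of that idea; the IA3Scaler class is illustrative only and is not one of the NeMo adapter classes added in this commit.

# Minimal sketch of the IA3 idea (illustration only, not NeMo's adapter classes).
import torch
import torch.nn as nn

class IA3Scaler(nn.Module):
    """Hypothetical helper: learn a per-channel scale for a frozen activation."""
    def __init__(self, hidden_size: int):
        super().__init__()
        # Initialized to ones so the frozen model's behavior is unchanged at step 0.
        self.scale = nn.Parameter(torch.ones(hidden_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Broadcasts the learned vector over the batch and sequence dimensions.
        return x * self.scale

# Typical placement: one scaler each for keys, values, and the FFN hidden
# activation of a frozen transformer layer; only these small vectors get gradients.
keys = torch.randn(2, 16, 64)      # [batch, seq, hidden]
scaled = IA3Scaler(64)(keys)
print(scaled.shape)                # torch.Size([2, 16, 64])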
40 changes: 40 additions & 0 deletions Jenkinsfile
@@ -645,6 +645,46 @@ pipeline {

      }
    }
    stage('L2: Megatron T5 IA3 TP=2') {
      when {
        anyOf {
          branch 'main'
          changeRequest target: 'main'
        }
      }
      failFast true
      parallel{
        stage('T5 IA3 tuning & inference TP=2 PP=1') {
          steps {
            sh "python examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py \
                --config-name=megatron_t5_ia3_tuning_config \
                name='/home/TestData/nlp/ia3_tuning/test_tp2_pp1' \
                trainer.devices=2 \
                trainer.max_steps=6 \
                trainer.val_check_interval=2 \
                trainer.max_epochs=null \
                model.tensor_model_parallel_size=2 \
                model.language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \
                model.existing_tasks=[] \
                model.new_tasks=['rte'] \
                model.data.train_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \
                model.data.validation_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \
                model.global_batch_size=4"
            sh "python examples/nlp/language_modeling/tuning/megatron_t5_ia3_eval.py \
                --config-name=megatron_t5_ia3_inference \
                adapter_model_file='/home/TestData/nlp/ia3_tuning/test_tp2_pp1.nemo' \
                language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \
                trainer.devices=2 \
                tensor_model_parallel_size=2 \
                data.global_batch_size=2 \
                data.micro_batch_size=2 \
                data.test_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl']"
            sh "rm -rf /home/TestData/nlp/ia3_tuning/test_tp2_pp1.nemo"
            sh "rm -rf /home/TestData/nlp/ia3_tuning/test_tp2_pp1"
          }
        }
      }
    }
    stage('L2: Megatron GPT Adapter TP=2') {
      when {
        anyOf {
@@ -8,8 +8,8 @@ trainer:
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
   replace_sampler_ddp: False
-  max_epochs: 10
-  max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
+  max_epochs: -1
+  max_steps: 100 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
   log_every_n_steps: 10
   val_check_interval: 0.2
   accumulate_grad_batches: 1
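The max_steps comment above spells out how many samples the run consumes; a quick worked example follows, with placeholder batch and parallelism values that are illustrative rather than taken from any particular config in this commit.

# Worked example of the consumed_samples formula from the max_steps comment.
# All values below are illustrative placeholders.
global_step = 100              # e.g. trainer.max_steps
micro_batch_size = 4
data_parallel_size = 1         # world size / (tensor parallel size * pipeline parallel size)
accumulate_grad_batches = 1

consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
print(consumed_samples)        # 400 samples seen after 100 optimizer steps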
@@ -0,0 +1,31 @@
inference:
  greedy: True # Whether to use greedy decoding instead of sampling
  top_k: 0 # The number of highest-probability vocabulary tokens to keep for top-k filtering.
  top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
  temperature: 1.0 # sampling temperature
  add_BOS: True # add the bos token at the beginning of the prompt
  tokens_to_generate: 30 # The maximum number of tokens to generate.
  all_probs: False # whether to return the log probs for all tokens in the vocab
  repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
  min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
  compute_logprob: False # a flag used to compute the logprob of all the input text, a very special case of running inference, default False


trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  logger: False # logger provided by exp_manager
  precision: 16 # 16, 32, or bf16

tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 0 # used for encoder-decoder models
gpt_model_file: ??? # GPT .nemo file path, used when starting from a .nemo file
adapter_model_file: ??? # .nemo file saved during training (using megatron_gpt_adapter_tuning.py)
output_file: null # save predictions to this file
checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during GPT training
checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading
hparams_file: null # model configuration file, only used for PTL checkpoint loading
data_paths: ??? # prompts for GPT inference
batch_size: 8
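The inference block above exposes the usual sampling knobs. As a rough illustration of what top_k and top_p control (this is not NeMo's sampling code), a next-token logit filter could look like the sketch below.

# Illustrative top-k / top-p (nucleus) filtering; not NeMo's implementation.
import torch

def filter_logits(logits: torch.Tensor, top_k: int = 0, top_p: float = 0.9) -> torch.Tensor:
    """Mask out logits that fall outside the top-k set or the top-p nucleus."""
    logits = logits.clone()
    if top_k > 0:
        kth_value = torch.topk(logits, top_k).values[..., -1, None]
        logits[logits < kth_value] = float("-inf")
    if top_p < 1.0:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        probs = torch.softmax(sorted_logits, dim=-1)
        # Remove tokens whose preceding cumulative probability already exceeds top_p.
        remove = (probs.cumsum(dim=-1) - probs) > top_p
        logits.scatter_(-1, sorted_idx, sorted_logits.masked_fill(remove, float("-inf")))
    return logits

# Greedy decoding (greedy: True above) would simply argmax the raw logits;
# the filtered distribution below is what top_k/top_p sampling would draw from.
logits = torch.randn(1, 32000)                                   # [batch, vocab] dummy logits
probs = torch.softmax(filter_logits(logits, top_k=0, top_p=0.9) / 1.0, dim=-1)  # temperature 1.0
next_token = torch.multinomial(probs, num_samples=1)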
@@ -0,0 +1,127 @@
name: ia3_tuning_${model.new_tasks[0]}_max_epochs${trainer.max_epochs}

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: -1
  max_steps: 100 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10
  val_check_interval: 0.2
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training; restores the whole state including the epoch, step, LR schedulers, apex, etc.
  benchmark: False


exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: ${name}
  create_wandb_logger: null
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 1
    mode: min
    save_nemo_on_train_end: True # Should be false; the correct model file is saved at model.nemo_path set below
    filename: 'megatron_gpt_ia3_tuning--{val_loss:.3f}-{step}'
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True

model:
  seed: 1234
  nemo_path: ${exp_manager.exp_dir}/${name}.nemo # .nemo filename/absolute path to where the tuned model parameters will be saved
  virtual_prompt_style: 'no-prompts' # adapter tuning requires no virtual prompts
  encoder_seq_length: 2048
  gradient_as_bucket_view: false
  tensor_model_parallel_size: 1 # intra-layer model parallelism
  pipeline_model_parallel_size: 1 # inter-layer model parallelism
  global_batch_size: 8
  micro_batch_size: 4

  restore_path: null # Path to an existing adapter .nemo model you wish to add new tasks to or run inference with
  language_model_path: ??? # Path to the GPT language model .nemo file, always required
  existing_tasks: [] # List of tasks the model has already been p-tuned/prompt-tuned for, needed when a restore path is given
  new_tasks: ["rte"] # List of new task names to be tuned

  task_templates: # Add more/replace tasks as needed, these are just examples
  - taskname: "boolq" # The task name
    prompt_template: "Passage: {passage} \nQuestion: {question} \nAnswer: {answer}" # Prompt template for the task; specify virtual prompt positions with <|VIRTUAL_PROMPT_#|>
    total_virtual_tokens: 0 # Sum of tokens in virtual_token_splits must add up to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time.
    virtual_token_splits: [] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location, must add up to total_virtual_tokens
    truncate_field: "passage" # The {field} in the prompt template whose text will be truncated if the input is too long; if null, inputs that are too long will just be skipped.
    answer_only_loss: True
    answer_field: "answer"

  - taskname: "intent_and_slot"
    prompt_template: "intent options: {intent_options} slot options: {slot_options} {utterance} \nintent: {intent} \nslot: {slot}"
    total_virtual_tokens: 0
    answer_only_loss: True
    virtual_token_splits: []
    truncate_field: null

  - taskname: "rte"
    prompt_template: "sentence1: {premise} sentence2: {hypothesis} Answer: {answer}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null
    answer_only_loss: True
    answer_field: "answer"

  - taskname: "squad"
    prompt_template: "context: {context} question: {question} answer: {answer}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null
    answer_only_loss: True
    answer_field: "answer"

  - taskname: "arc-challenge"
    prompt_template: "question: {question} choices: {choices} answer: {answer}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null
    answer_only_loss: True
    answer_field: "answer"

  - taskname: "xsum"
    prompt_template: "{source} Summary: {target}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null
    answer_only_loss: True
    answer_field: "target"

  data:
    train_ds: ??? # expects a list of paths to training data files
    validation_ds: ??? # expects a list of paths to validation data files
    add_eos: True
    shuffle: True
    num_workers: 8
    pin_memory: True


  optim:
    name: fused_adam
    lr: 1e-4
    weight_decay: 0.01
    betas:
    - 0.9
    - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 50
      constant_steps: 0 # Constant steps should also be 0 when min_lr=0
      min_lr: 0.0 # min_lr must be 0.0 for prompt learning
      monitor: val_loss
      reduce_on_plateau: false
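The task templates above are plain format strings over the fields of the training/validation jsonl records, with answer_only_loss restricting the loss to the answer span. A small illustration using the "rte" template follows; the record below is invented for the example, and only the field names come from the template in this config.

# How a task template turns a jsonl record into model text (illustrative record).
rte_template = "sentence1: {premise} sentence2: {hypothesis} Answer: {answer}"
record = {
    "taskname": "rte",                              # extra keys are simply ignored by str.format
    "premise": "The cat sat on the mat.",
    "hypothesis": "There is a cat on the mat.",
    "answer": "entailment",
}
print(rte_template.format(**record))
# sentence1: The cat sat on the mat. sentence2: There is a cat on the mat. Answer: entailment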
@@ -0,0 +1,37 @@
inference:
  greedy: True # Whether to use greedy decoding instead of sampling
  top_k: 0 # The number of highest-probability vocabulary tokens to keep for top-k filtering.
  top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
  temperature: 1.0 # sampling temperature
  add_BOS: True # add the bos token at the beginning of the prompt
  tokens_to_generate: 30 # The maximum number of tokens to generate.
  all_probs: False # whether to return the log probs for all tokens in the vocab
  repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
  min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
  compute_logprob: False # a flag used to compute the logprob of all the input text, a very special case of running inference, default False


trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  logger: False # logger provided by exp_manager
  precision: 16 # 16, 32, or bf16

data:
  test_ds: ???
  num_workers: 1
  global_batch_size: 4
  micro_batch_size: 4

tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 0 # used for encoder-decoder models
language_model_path: ??? # T5 .nemo file path, used when starting from a .nemo file
adapter_model_file: ??? # .nemo file saved during training (using megatron_t5_ia3_tuning.py)
output_file: null # save predictions to this file
checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during training
checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading
hparams_file: null # model configuration file, only used for PTL checkpoint loading
batch_size: 8
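The CI stage earlier in this diff runs megatron_t5_ia3_eval.py with --config-name=megatron_t5_ia3_inference plus a handful of dot-list overrides against a config shaped like the one above. Below is a rough sketch of how those overrides compose, using OmegaConf directly instead of the Hydra entry point; the file name is assumed from the --config-name flag and the real conf/ path in the repo may differ.

# Rough sketch (assumed file location) of composing the CI overrides onto the
# config above with OmegaConf.
from omegaconf import OmegaConf

cfg = OmegaConf.load("megatron_t5_ia3_inference.yaml")   # assumed path to the YAML above
overrides = OmegaConf.from_dotlist([
    "trainer.devices=2",
    "tensor_model_parallel_size=2",
    "data.global_batch_size=2",
    "data.micro_batch_size=2",
    # data.test_ds omitted here; the CI command points it at a jsonl test file
])
cfg = OmegaConf.merge(cfg, overrides)

# A consistency check in the spirit of the "fixed micro and global batch size
# bug in eval" item in the commit message: global batch must divide evenly.
assert cfg.data.global_batch_size % cfg.data.micro_batch_size == 0
print(OmegaConf.to_yaml(cfg.data))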

@@ -0,0 +1,131 @@
name: adapter_tuning_${model.new_tasks[0]}_max_epochs${trainer.max_epochs}_adapter_dim${model.adapter_tuning.adapter_dim}

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 16
  logger: False
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: -1
  max_steps: 100
  log_every_n_steps: 10
  val_check_interval: 20
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  resume_from_checkpoint: null
  benchmark: False

exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 1
    mode: min
    save_nemo_on_train_end: True # Should be false; the correct model file is saved at model.nemo_path set below
    filename: "megatron_t5_adapter_tune--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}"
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True

model:
  seed: 1234
  nemo_path: ${exp_manager.exp_dir}/${name}.nemo # .nemo filename/absolute path to where the tuned model parameters will be saved
  virtual_prompt_style: 'no-prompts' # adapter tuning requires no virtual prompts
  encoder_seq_length: 2048
  gradient_as_bucket_view: false
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  global_batch_size: 8
  micro_batch_size: 4

  restore_path: null # Path to an existing p-tuned/prompt-tuned .nemo model you wish to add new tasks to or run inference with
  language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required
  existing_tasks: []
  new_tasks: ["squad"]

  task_templates:
  - taskname: "boolq" # The task name
    prompt_template: "Passage: {passage} \nQuestion: {question} \nAnswer: {answer}" # Prompt template for the task; specify virtual prompt positions with <|VIRTUAL_PROMPT_#|>
    total_virtual_tokens: 0 # Sum of tokens in virtual_token_splits must add up to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time.
    virtual_token_splits: [] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location, must add up to total_virtual_tokens
    truncate_field: "passage" # The {field} in the prompt template whose text will be truncated if the input is too long; if null, inputs that are too long will just be skipped.
    answer_field: "answer"

  - taskname: "intent_and_slot"
    prompt_template: "intent options: {intent_options} slot options: {slot_options} {utterance} \nintent: {intent} \nslot: {slot}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null

  - taskname: "rte"
    prompt_template: "sentence1: {premise} sentence2: {hypothesis} Answer: {answer}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null
    answer_field: "answer"

  - taskname: "squad"
    prompt_template: "context: {context} question: {question} answer: {answer}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null
    answer_field: "answer"

  - taskname: "arc-challenge"
    prompt_template: "question: {question} choices: {choices} answer: {answer}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null
    answer_field: "answer"

  - taskname: "xsum"
    prompt_template: "{source} Summary: {target}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null
    answer_field: "target"

  adapter_tuning:
    type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter'
    adapter_dim: 50
    adapter_dropout: 0.1
    norm_position: 'pre' # This can be set to 'pre' or 'post'; 'pre' is normally what is used.
    column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
    row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
    norm_type: 'mixedfusedlayernorm' # IGNORED if linear_adapter is used, options are ['layernorm', 'mixedfusedlayernorm']

  data:
    train_ds: ???
    validation_ds: ???
    add_eos: True
    shuffle: True
    num_workers: 8
    pin_memory: True


  optim:
    name: fused_adam
    lr: 1e-3
    weight_decay: 0.01
    betas:
    - 0.9
    - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 50
      constant_steps: 0
      min_lr: 0.0
      monitor: val_loss
      reduce_on_plateau: false
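The adapter_tuning block above configures a bottleneck adapter: a down-projection to adapter_dim, a nonlinearity, an up-projection back to the hidden size, dropout, and a pre- or post-layernorm, added residually to the frozen layer's output. A minimal sketch of that shape follows; it is not the 'parallel_adapter' implementation this config refers to, just the general pattern.

# Minimal bottleneck-adapter sketch matching the adapter_tuning options above
# (adapter_dim, adapter_dropout, norm_position: 'pre', row_init_method: 'zero').
import torch
import torch.nn as nn

class BottleneckAdapter(nn.Module):
    def __init__(self, hidden_size: int, adapter_dim: int = 50, dropout: float = 0.1):
        super().__init__()
        self.norm = nn.LayerNorm(hidden_size)            # norm_position: 'pre'
        self.down = nn.Linear(hidden_size, adapter_dim)  # column_init_method would control this init
        self.up = nn.Linear(adapter_dim, hidden_size)    # row_init_method: 'zero' keeps the adapter a no-op at start
        nn.init.zeros_(self.up.weight)
        nn.init.zeros_(self.up.bias)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Residual bottleneck: normalize, down-project, nonlinearity, up-project, add back.
        return x + self.dropout(self.up(torch.relu(self.down(self.norm(x)))))

out = BottleneckAdapter(hidden_size=512)(torch.randn(2, 16, 512))
print(out.shape)   # torch.Size([2, 16, 512])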

