IA3 support for GPT and T5 (#4909)
* init commit for IA3 adapter training in GPT

Signed-off-by: arendu <adithya.r@gmail.com>

* IA3 adapter training in GPT, models and adapter classes

Signed-off-by: arendu <adithya.r@gmail.com>

* reshape to operate even on non-contiguous tensors

Signed-off-by: arendu <adithya.r@gmail.com>

* configs

Signed-off-by: arendu <adithya.r@gmail.com>

* fixed none init

Signed-off-by: arendu <adithya.r@gmail.com>

* adding adapter and ia3 support for T5 based models

Signed-off-by: arendu <adithya.r@gmail.com>

* style fix

Signed-off-by: arendu <adithya.r@gmail.com>

* config update and T5 model adapter and IA3

Signed-off-by: arendu <adithya.r@gmail.com>

* removed unused imports

Signed-off-by: arendu <adithya.r@gmail.com>

* predict step for inference

Signed-off-by: arendu <adithya.r@gmail.com>

* style fix

Signed-off-by: arendu <adithya.r@gmail.com>

* style fix

Signed-off-by: arendu <adithya.r@gmail.com>

* adapter inference for t5

Signed-off-by: arendu <adithya.r@gmail.com>

* style fix

Signed-off-by: arendu <adithya.r@gmail.com>

* fixed micro and global batch size bug in eval

Signed-off-by: arendu <adithya.r@gmail.com>

* minor edit

Signed-off-by: arendu <adithya.r@gmail.com>

* aggressive truncation in test examples if no truncation field is given

Signed-off-by: arendu <adithya.r@gmail.com>

* corrected for language_model_path name changes in main

Signed-off-by: arendu <adithya.r@gmail.com>

* removed unused import

Signed-off-by: arendu <adithya.r@gmail.com>

* name change for language_model_path

Signed-off-by: arendu <adithya.r@gmail.com>

* include inter_attention in IA3

Signed-off-by: arendu <adithya.r@gmail.com>

* minor fix in config

Signed-off-by: arendu <adithya.r@gmail.com>

* minor fixes

Signed-off-by: arendu <adithya.r@gmail.com>

* removed unused flag

Signed-off-by: arendu <adithya.r@gmail.com>

* addressing PR comments

Signed-off-by: arendu <adithya.r@gmail.com>

* address PR comments

Signed-off-by: arendu <adithya.r@gmail.com>

* minor fix

Signed-off-by: arendu <adithya.r@gmail.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* style fix

Signed-off-by: arendu <adithya.r@gmail.com>

* CI test

Signed-off-by: arendu <adithya.r@gmail.com>

* minor fix in jenkinsfile

Signed-off-by: arendu <adithya.r@gmail.com>

Signed-off-by: arendu <adithya.r@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
arendu and pre-commit-ci[bot] authored Oct 3, 2022
1 parent bd1209c commit cb2793c
Showing 22 changed files with 1,923 additions and 94 deletions.
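For context on what the diff below wires up: IA3 (Infused Adapter by Inhibiting and Amplifying Inner Activations) learns small per-channel scaling vectors that multiply the frozen model's key, value, and feed-forward activations, so only those tiny vectors are trained while the base GPT or T5 weights stay frozen. The following is a minimal PyTorch sketch of that idea; the IA3Scaler class is illustrative only and is not one of the NeMo adapter classes added in this commit.

# Minimal sketch of the IA3 idea (illustration only, not NeMo's adapter classes).
import torch
import torch.nn as nn

class IA3Scaler(nn.Module):
    """Hypothetical helper: learn a per-channel scale for a frozen activation."""
    def __init__(self, hidden_size: int):
        super().__init__()
        # Initialized to ones so the frozen model's behavior is unchanged at step 0.
        self.scale = nn.Parameter(torch.ones(hidden_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Broadcasts the learned vector over the batch and sequence dimensions.
        return x * self.scale

# Typical placement: one scaler each for keys, values, and the FFN hidden
# activation of a frozen transformer layer; only these small vectors get gradients.
keys = torch.randn(2, 16, 64)      # [batch, seq, hidden]
scaled = IA3Scaler(64)(keys)
print(scaled.shape)                # torch.Size([2, 16, 64])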
40 changes: 40 additions & 0 deletions Jenkinsfile
@@ -645,6 +645,46 @@ pipeline {

      }
    }
    stage('L2: Megatron T5 IA3 TP=2') {
      when {
        anyOf {
          branch 'main'
          changeRequest target: 'main'
        }
      }
      failFast true
      parallel{
        stage('T5 IA3 tuning & inference TP=2 PP=1') {
          steps {
            sh "python examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py \
                --config-name=megatron_t5_ia3_tuning_config \
                name='/home/TestData/nlp/ia3_tuning/test_tp2_pp1' \
                trainer.devices=2 \
                trainer.max_steps=6 \
                trainer.val_check_interval=2 \
                trainer.max_epochs=null \
                model.tensor_model_parallel_size=2 \
                model.language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \
                model.existing_tasks=[] \
                model.new_tasks=['rte'] \
                model.data.train_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \
                model.data.validation_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \
                model.global_batch_size=4"
            sh "python examples/nlp/language_modeling/tuning/megatron_t5_ia3_eval.py \
                --config-name=megatron_t5_ia3_inference \
                adapter_model_file='/home/TestData/nlp/ia3_tuning/test_tp2_pp1.nemo' \
                language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \
                trainer.devices=2 \
                tensor_model_parallel_size=2 \
                data.global_batch_size=2 \
                data.micro_batch_size=2 \
                data.test_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl']"
            sh "rm -rf /home/TestData/nlp/ia3_tuning/test_tp2_pp1.nemo"
            sh "rm -rf /home/TestData/nlp/ia3_tuning/test_tp2_pp1"
          }
        }
      }
    }
    stage('L2: Megatron GPT Adapter TP=2') {
      when {
        anyOf {
@@ -8,8 +8,8 @@ trainer:
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
   replace_sampler_ddp: False
-  max_epochs: 10
-  max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
+  max_epochs: -1
+  max_steps: 100 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
   log_every_n_steps: 10
   val_check_interval: 0.2
   accumulate_grad_batches: 1
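The max_steps comment above spells out how many samples the run consumes; a quick worked example follows, with placeholder batch and parallelism values that are illustrative rather than taken from any particular config in this commit.

# Worked example of the consumed_samples formula from the max_steps comment.
# All values below are illustrative placeholders.
global_step = 100              # e.g. trainer.max_steps
micro_batch_size = 4
data_parallel_size = 1         # world size / (tensor parallel size * pipeline parallel size)
accumulate_grad_batches = 1

consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
print(consumed_samples)        # 400 samples seen after 100 optimizer steps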
@@ -0,0 +1,31 @@
inference:
  greedy: True # Whether to use greedy decoding instead of sampling
  top_k: 0 # The number of highest-probability vocabulary tokens to keep for top-k filtering.
  top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
  temperature: 1.0 # sampling temperature
  add_BOS: True # add the bos token at the beginning of the prompt
  tokens_to_generate: 30 # The maximum number of tokens to generate.
  all_probs: False # whether to return the log probs for all tokens in the vocab
  repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
  min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
  compute_logprob: False # a flag used to compute the logprob of all the input text, a very special case of running inference, default False


trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  logger: False # logger provided by exp_manager
  precision: 16 # 16, 32, or bf16

tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 0 # used for encoder-decoder models
gpt_model_file: ??? # GPT .nemo file path, used when starting from a .nemo file
adapter_model_file: ??? # .nemo file saved during training (using megatron_gpt_adapter_tuning.py)
output_file: null # save predictions to this file
checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during GPT training
checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading
hparams_file: null # model configuration file, only used for PTL checkpoint loading
data_paths: ??? # prompts for GPT inference
batch_size: 8
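The inference block above exposes the usual sampling knobs. As a rough illustration of what top_k and top_p control (this is not NeMo's sampling code), a next-token logit filter could look like the sketch below.

# Illustrative top-k / top-p (nucleus) filtering; not NeMo's implementation.
import torch

def filter_logits(logits: torch.Tensor, top_k: int = 0, top_p: float = 0.9) -> torch.Tensor:
    """Mask out logits that fall outside the top-k set or the top-p nucleus."""
    logits = logits.clone()
    if top_k > 0:
        kth_value = torch.topk(logits, top_k).values[..., -1, None]
        logits[logits < kth_value] = float("-inf")
    if top_p < 1.0:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        probs = torch.softmax(sorted_logits, dim=-1)
        # Remove tokens whose preceding cumulative probability already exceeds top_p.
        remove = (probs.cumsum(dim=-1) - probs) > top_p
        logits.scatter_(-1, sorted_idx, sorted_logits.masked_fill(remove, float("-inf")))
    return logits

# Greedy decoding (greedy: True above) would simply argmax the raw logits;
# the filtered distribution below is what top_k/top_p sampling would draw from.
logits = torch.randn(1, 32000)                                   # [batch, vocab] dummy logits
probs = torch.softmax(filter_logits(logits, top_k=0, top_p=0.9) / 1.0, dim=-1)  # temperature 1.0
next_token = torch.multinomial(probs, num_samples=1)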
@@ -0,0 +1,127 @@
name: ia3_tuning_${model.new_tasks[0]}_max_epochs${trainer.max_epochs}

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: -1
  max_steps: 100 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10
  val_check_interval: 0.2
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training; restores the whole state including the epoch, step, LR schedulers, apex, etc.
  benchmark: False


exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: ${name}
  create_wandb_logger: null
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 1
    mode: min
    save_nemo_on_train_end: True # Should be false; the correct model file is saved at model.nemo_path set below
    filename: 'megatron_gpt_ia3_tuning--{val_loss:.3f}-{step}'
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True

model:
  seed: 1234
  nemo_path: ${exp_manager.exp_dir}/${name}.nemo # .nemo filename/absolute path to where the tuned model parameters will be saved
  virtual_prompt_style: 'no-prompts' # adapter tuning requires no virtual prompts
  encoder_seq_length: 2048
  gradient_as_bucket_view: false
  tensor_model_parallel_size: 1 # intra-layer model parallelism
  pipeline_model_parallel_size: 1 # inter-layer model parallelism
  global_batch_size: 8
  micro_batch_size: 4

  restore_path: null # Path to an existing adapter .nemo model you wish to add new tasks to or run inference with
  language_model_path: ??? # Path to the GPT language model .nemo file, always required
  existing_tasks: [] # List of tasks the model has already been p-tuned/prompt-tuned for, needed when a restore path is given
  new_tasks: ["rte"] # List of new task names to be tuned

  task_templates: # Add more/replace tasks as needed, these are just examples
  - taskname: "boolq" # The task name
    prompt_template: "Passage: {passage} \nQuestion: {question} \nAnswer: {answer}" # Prompt template for the task; specify virtual prompt positions with <|VIRTUAL_PROMPT_#|>
    total_virtual_tokens: 0 # Sum of tokens in virtual_token_splits must add up to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time.
    virtual_token_splits: [] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location, must add up to total_virtual_tokens
    truncate_field: "passage" # The {field} in the prompt template whose text will be truncated if the input is too long; if null, inputs that are too long will just be skipped.
    answer_only_loss: True
    answer_field: "answer"

  - taskname: "intent_and_slot"
    prompt_template: "intent options: {intent_options} slot options: {slot_options} {utterance} \nintent: {intent} \nslot: {slot}"
    total_virtual_tokens: 0
    answer_only_loss: True
    virtual_token_splits: []
    truncate_field: null

  - taskname: "rte"
    prompt_template: "sentence1: {premise} sentence2: {hypothesis} Answer: {answer}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null
    answer_only_loss: True
    answer_field: "answer"

  - taskname: "squad"
    prompt_template: "context: {context} question: {question} answer: {answer}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null
    answer_only_loss: True
    answer_field: "answer"

  - taskname: "arc-challenge"
    prompt_template: "question: {question} choices: {choices} answer: {answer}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null
    answer_only_loss: True
    answer_field: "answer"

  - taskname: "xsum"
    prompt_template: "{source} Summary: {target}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null
    answer_only_loss: True
    answer_field: "target"

  data:
    train_ds: ??? # expects a list of paths to training data files
    validation_ds: ??? # expects a list of paths to validation data files
    add_eos: True
    shuffle: True
    num_workers: 8
    pin_memory: True


  optim:
    name: fused_adam
    lr: 1e-4
    weight_decay: 0.01
    betas:
    - 0.9
    - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 50
      constant_steps: 0 # Constant steps should also be 0 when min_lr=0
      min_lr: 0.0 # min_lr must be 0.0 for prompt learning
      monitor: val_loss
      reduce_on_plateau: false
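The task templates above are plain format strings over the fields of the training/validation jsonl records, with answer_only_loss restricting the loss to the answer span. A small illustration using the "rte" template follows; the record below is invented for the example, and only the field names come from the template in this config.

# How a task template turns a jsonl record into model text (illustrative record).
rte_template = "sentence1: {premise} sentence2: {hypothesis} Answer: {answer}"
record = {
    "taskname": "rte",                              # extra keys are simply ignored by str.format
    "premise": "The cat sat on the mat.",
    "hypothesis": "There is a cat on the mat.",
    "answer": "entailment",
}
print(rte_template.format(**record))
# sentence1: The cat sat on the mat. sentence2: There is a cat on the mat. Answer: entailment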
@@ -0,0 +1,37 @@
inference:
  greedy: True # Whether to use greedy decoding instead of sampling
  top_k: 0 # The number of highest-probability vocabulary tokens to keep for top-k filtering.
  top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
  temperature: 1.0 # sampling temperature
  add_BOS: True # add the bos token at the beginning of the prompt
  tokens_to_generate: 30 # The maximum number of tokens to generate.
  all_probs: False # whether to return the log probs for all tokens in the vocab
  repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
  min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
  compute_logprob: False # a flag used to compute the logprob of all the input text, a very special case of running inference, default False


trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  logger: False # logger provided by exp_manager
  precision: 16 # 16, 32, or bf16

data:
  test_ds: ???
  num_workers: 1
  global_batch_size: 4
  micro_batch_size: 4

tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 0 # used for encoder-decoder models
language_model_path: ??? # T5 .nemo file path, used when starting from a .nemo file
adapter_model_file: ??? # .nemo file saved during training (using megatron_t5_ia3_tuning.py)
output_file: null # save predictions to this file
checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during training
checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading
hparams_file: null # model configuration file, only used for PTL checkpoint loading
batch_size: 8
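The CI stage earlier in this diff runs megatron_t5_ia3_eval.py with --config-name=megatron_t5_ia3_inference plus a handful of dot-list overrides against a config shaped like the one above. Below is a rough sketch of how those overrides compose, using OmegaConf directly instead of the Hydra entry point; the file name is assumed from the --config-name flag and the real conf/ path in the repo may differ.

# Rough sketch (assumed file location) of composing the CI overrides onto the
# config above with OmegaConf.
from omegaconf import OmegaConf

cfg = OmegaConf.load("megatron_t5_ia3_inference.yaml")   # assumed path to the YAML above
overrides = OmegaConf.from_dotlist([
    "trainer.devices=2",
    "tensor_model_parallel_size=2",
    "data.global_batch_size=2",
    "data.micro_batch_size=2",
    # data.test_ds omitted here; the CI command points it at a jsonl test file
])
cfg = OmegaConf.merge(cfg, overrides)

# A consistency check in the spirit of the "fixed micro and global batch size
# bug in eval" item in the commit message: global batch must divide evenly.
assert cfg.data.global_batch_size % cfg.data.micro_batch_size == 0
print(OmegaConf.to_yaml(cfg.data))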

@@ -0,0 +1,131 @@
name: adapter_tuning_${model.new_tasks[0]}_max_epochs${trainer.max_epochs}_adapter_dim${model.adapter_tuning.adapter_dim}

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 16
  logger: False
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: -1
  max_steps: 100
  log_every_n_steps: 10
  val_check_interval: 20
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  resume_from_checkpoint: null
  benchmark: False

exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 1
    mode: min
    save_nemo_on_train_end: True # Should be false; the correct model file is saved at model.nemo_path set below
    filename: "megatron_t5_adapter_tune--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}"
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True

model:
  seed: 1234
  nemo_path: ${exp_manager.exp_dir}/${name}.nemo # .nemo filename/absolute path to where the tuned model parameters will be saved
  virtual_prompt_style: 'no-prompts' # adapter tuning requires no virtual prompts
  encoder_seq_length: 2048
  gradient_as_bucket_view: false
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  global_batch_size: 8
  micro_batch_size: 4

  restore_path: null # Path to an existing p-tuned/prompt-tuned .nemo model you wish to add new tasks to or run inference with
  language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required
  existing_tasks: []
  new_tasks: ["squad"]

  task_templates:
  - taskname: "boolq" # The task name
    prompt_template: "Passage: {passage} \nQuestion: {question} \nAnswer: {answer}" # Prompt template for the task; specify virtual prompt positions with <|VIRTUAL_PROMPT_#|>
    total_virtual_tokens: 0 # Sum of tokens in virtual_token_splits must add up to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time.
    virtual_token_splits: [] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location, must add up to total_virtual_tokens
    truncate_field: "passage" # The {field} in the prompt template whose text will be truncated if the input is too long; if null, inputs that are too long will just be skipped.
    answer_field: "answer"

  - taskname: "intent_and_slot"
    prompt_template: "intent options: {intent_options} slot options: {slot_options} {utterance} \nintent: {intent} \nslot: {slot}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null

  - taskname: "rte"
    prompt_template: "sentence1: {premise} sentence2: {hypothesis} Answer: {answer}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null
    answer_field: "answer"

  - taskname: "squad"
    prompt_template: "context: {context} question: {question} answer: {answer}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null
    answer_field: "answer"

  - taskname: "arc-challenge"
    prompt_template: "question: {question} choices: {choices} answer: {answer}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null
    answer_field: "answer"

  - taskname: "xsum"
    prompt_template: "{source} Summary: {target}"
    total_virtual_tokens: 0
    virtual_token_splits: []
    truncate_field: null
    answer_field: "target"

  adapter_tuning:
    type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter'
    adapter_dim: 50
    adapter_dropout: 0.1
    norm_position: 'pre' # This can be set to 'pre' or 'post'; 'pre' is normally what is used.
    column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
    row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
    norm_type: 'mixedfusedlayernorm' # IGNORED if linear_adapter is used, options are ['layernorm', 'mixedfusedlayernorm']

  data:
    train_ds: ???
    validation_ds: ???
    add_eos: True
    shuffle: True
    num_workers: 8
    pin_memory: True


  optim:
    name: fused_adam
    lr: 1e-3
    weight_decay: 0.01
    betas:
    - 0.9
    - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 50
      constant_steps: 0
      min_lr: 0.0
      monitor: val_loss
      reduce_on_plateau: false
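The adapter_tuning block above configures a bottleneck adapter: a down-projection to adapter_dim, a nonlinearity, an up-projection back to the hidden size, dropout, and a pre- or post-layernorm, added residually to the frozen layer's output. A minimal sketch of that shape follows; it is not the 'parallel_adapter' implementation this config refers to, just the general pattern.

# Minimal bottleneck-adapter sketch matching the adapter_tuning options above
# (adapter_dim, adapter_dropout, norm_position: 'pre', row_init_method: 'zero').
import torch
import torch.nn as nn

class BottleneckAdapter(nn.Module):
    def __init__(self, hidden_size: int, adapter_dim: int = 50, dropout: float = 0.1):
        super().__init__()
        self.norm = nn.LayerNorm(hidden_size)            # norm_position: 'pre'
        self.down = nn.Linear(hidden_size, adapter_dim)  # column_init_method would control this init
        self.up = nn.Linear(adapter_dim, hidden_size)    # row_init_method: 'zero' keeps the adapter a no-op at start
        nn.init.zeros_(self.up.weight)
        nn.init.zeros_(self.up.bias)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Residual bottleneck: normalize, down-project, nonlinearity, up-project, add back.
        return x + self.dropout(self.up(torch.relu(self.down(self.norm(x)))))

out = BottleneckAdapter(hidden_size=512)(torch.randn(2, 16, 512))
print(out.shape)   # torch.Size([2, 16, 512])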

