Parallel prompt tuning (#3670)
* Started combined tensor parallel and pipeline parallel changes

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Gets through validation sanity checks

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Still working through bugs

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Able to run training but virtual token parameters don't get updated

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Params weren't updating because they weren't set up with the optimizer

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Parallel with single GPU is working!

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Tensor parallel = 2 is working

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Tensor parallel working and code cleaned up

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Added prompt tuning testing back in

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Complete method works again for prompt-tuned models

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Removed random imports

Signed-off-by: Virginia Adams <vadams@nvidia.com>
vadam5 authored and fayejf committed Mar 2, 2022
1 parent 8925335 commit 3e342f8
Showing 10 changed files with 494 additions and 202 deletions.
47 changes: 21 additions & 26 deletions Jenkinsfile
@@ -2085,43 +2085,38 @@ pipeline {
16"
}
}

stage('L2: Megatron GPT Prompt Tuning and Inference') {
when {
anyOf {
branch 'r1.6.1'
changeRequest target: 'r1.6.1'
branch 'main'
changeRequest target: 'main'
}
}
failFast true
steps {
sh "python tests/collections/nlp/test_prompt_tuning.py"
sh "python examples/nlp/language_modeling/megatron_gpt_prompt_tuning.py \
--config-name=megatron_gpt_config \
trainer.gpus=1 \
trainer.max_steps=10 \
trainer.val_check_interval=1 \
exp_manager.name='megatron_gpt125M_prompt_tuning' \
exp_manager.checkpoint_callback_params.save_top_k=2 \
exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True \
restore_from_path='/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo' \
+model.use_soft_prompts=True \
+model.num_prompt_tokens=10 \
+model.new_prompt_tags=['Winogrande, BoolQ'] \
+model.new_prompt_init_text=['logic choose person name, None'] \
+model.new_prompt_init_methods=['text, random'] \
model.data.data_prefix=None \
+model.data.train_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_train.json' \
+model.data.valid_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_val.json' \
+model.data.test_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_val.json' \
+model.data.batch_size=8 \
model.optim.lr=2e-2 \
model.optim.sched.min_lr=2e-3 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=8 \
model.encoder_seq_length=2048"
--config-name=megatron_prompt_tuning_gpt \
restore_from_path='/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo' \
trainer.val_check_interval=2 \
trainer.max_steps=5 \
model.new_prompt_tags=['Winogrande, BoolQ'] \
model.new_prompt_init_text=['logic choose person name, None'] \
model.new_prompt_init_methods=['text, random'] \
model.data.train_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_train.json' \
model.data.valid_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_val.json' \
+model.data.test_ds='/home/TestData/nlp/prompt_tuning/wino_bool_prompt_tuning_val.json' \
model.micro_batch_size=2 \
model.global_batch_size=4 \
model.optim.lr=2e-2 \
model.optim.sched.min_lr=2e-3 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=8 \
model.encoder_seq_length=2048"
sh "python examples/nlp/language_modeling/megatron_gpt_eval.py \
--use_soft_prompts \
--model_file=nemo_experiments/megatron_gpt125M_prompt_tuning/checkpoints/megatron_gpt125M_prompt_tuning.nemo \
--model_file=nemo_experiments/PromptTuning/checkpoints/PromptTuning.nemo \
--tokens_to_generate=3 \
--prompt_tag='Winogrande' \
--prompt='option1: wood option2: bag sentence: The _ is soft. answer:'"
Empty file modified examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
100644 → 100755
Empty file.
129 changes: 129 additions & 0 deletions examples/nlp/language_modeling/conf/megatron_prompt_tuning_gpt.yaml
@@ -0,0 +1,129 @@
name: PromptTuning
restore_from_path: ??? # used when starting from a .nemo file

trainer:
gpus: 1
num_nodes: 1
accelerator: ddp
precision: 32
logger: False # logger provided by exp_manager
checkpoint_callback: False
replace_sampler_ddp: False
max_epochs: null
max_steps: 1000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
log_every_n_steps: 10
val_check_interval: 50
limit_val_batches: 50
limit_test_batches: 500
accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
gradient_clip_val: null

exp_manager:
explicit_log_dir: null
exp_dir: null
name: PromptTuning
create_wandb_logger: False
wandb_logger_kwargs:
project: None
name: None
resume_if_exists: True
resume_ignore_no_checkpoint: True
create_checkpoint_callback: True
checkpoint_callback_params:
monitor: val_loss
save_top_k: 3
mode: min
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
save_nemo_on_train_end: True # not recommended when training large models on clusters with short time limits
filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}


model:
# specify micro_batch_size, global_batch_size, and model parallelism
# gradient accumulation will be done automatically based on data_parallel_size
micro_batch_size: 4 # limited by GPU memory
global_batch_size: 16 # will use more micro batches to reach global batch size
tensor_model_parallel_size: 1 # intra-layer model parallelism
pipeline_model_parallel_size: 1 # inter-layer model parallelism

# model architecture
encoder_seq_length: 2048
max_position_embeddings: ${.encoder_seq_length}
num_layers: 12
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.
hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
layernorm_epsilon: 1e-5
make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
pre_process: True # add embedding
post_process: True # add pooler
persist_layer_norm: True # Use of persistent fused layer norm kernel.
gradient_as_bucket_view: False # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

tokenizer:
library: 'megatron'
type: 'GPT2BPETokenizer'
model: null
vocab_file: null
merge_file: null

# Prompt Tuning
use_soft_prompts: True
num_prompt_tokens: 150
existing_prompt_tags: []
new_prompt_tags: ???
new_prompt_init_text: ['some initialization text goes here']
new_prompt_init_methods: ['text']
calc_loss_on_answer_only: False


# precision
native_amp_init_scale: 4294967296 # 2 ** 32
native_amp_growth_interval: 1000
hysteresis: 2 # Gradient scale hysteresis
fp32_residual_connection: False # Move residual connections to fp32
fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

# Megatron O2-style half-precision
megatron_amp_O2: False # Enable O2-level automatic mixed precision using master parameters

# miscellaneous
seed: 1234
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this

activations_checkpoint_method: null # 'uniform', 'block'
activations_checkpoint_num_layers: 1

data:
data_prefix: None
train_ds: ???
valid_ds: ???
data_impl: mmap
splits_string: 900,50,50
seq_length: ${model.encoder_seq_length}
skip_warmup: True
num_workers: 0
dataloader_type: single # cyclic
reset_position_ids: False # Reset position ids after end-of-document token
reset_attention_mask: False # Reset attention mask after end-of-document token
eod_mask_loss: False # Mask loss for the end of document tokens

optim:
name: fused_adam
lr: 2e-4
weight_decay: 0.01
betas:
- 0.9
- 0.98
sched:
name: CosineAnnealing
warmup_steps: 50
constant_steps: 10
min_lr: 2e-5
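
The batch-size comments in the model section above imply a fixed relationship between micro batch size, global batch size, data-parallel size, and gradient accumulation. The snippet below is only a minimal sketch of that arithmetic, not NeMo's actual implementation; the world size is an assumed example value.

# Illustrative sketch of the batch-size arithmetic described in the config comments.
# Not NeMo code; world_size is an assumed example value.
micro_batch_size = 4        # model.micro_batch_size
global_batch_size = 16      # model.global_batch_size
tensor_parallel = 1         # model.tensor_model_parallel_size
pipeline_parallel = 1       # model.pipeline_model_parallel_size
world_size = 2              # hypothetical number of GPUs

model_parallel_size = tensor_parallel * pipeline_parallel  # matches exp_manager.model_parallel_size
data_parallel_size = world_size // model_parallel_size     # 2 in this example

# Micro batches accumulated per optimizer step so that one step consumes
# exactly global_batch_size samples.
accumulate_grad_batches = global_batch_size // (micro_batch_size * data_parallel_size)  # 2

# consumed_samples after a given global step, per the trainer.max_steps comment.
global_step = 1000
consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches  # 16000
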
84 changes: 59 additions & 25 deletions examples/nlp/language_modeling/megatron_gpt_prompt_tuning.py
100644 → 100755
@@ -1,4 +1,4 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,22 +14,31 @@

from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.timer import Timer
from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment

from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin
from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
from nemo.collections.nlp.parts.nlp_overrides import (
GradScaler,
MegatronHalfPrecisionPlugin,
NLPDDPPlugin,
PipelineMixedPrecisionPlugin,
)
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager
from nemo.utils.app_state import AppState
from nemo.utils.exp_manager import StatelessTimer, exp_manager


"""
Can currently only prompt tune on one task at a time, but can
run inference with multiple soft-prompts/tasks within a batch.
Datasets should be formatted in a json file like:
{"prompt_tag": <tag1>, "text": <text1>}
{"prompt_tag": <tag1>, "text": <text2>}
{"prompt_tag": <tag2>, "text": <text3>}
{"prompt_tag": <tag1>, "text": <text1>, "answer": <answer1>}
{"prompt_tag": <tag1>, "text": <text2>, "answer": <answer2>}
{"prompt_tag": <tag1>, "text": <text3>, "answer": <answer3>}
Example Usage for first prompt tuning task:
@@ -139,38 +148,63 @@
"""


@hydra_runner(config_path="conf", config_name="megatron_gpt_config")
@hydra_runner(config_path="conf", config_name="megatron_prompt_tuning_gpt")
def main(cfg) -> None:
logging.info("\n\n************** Experiment configuration ***********")
logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

plugins = [NLPDDPPlugin(num_nodes=cfg.trainer.num_nodes)]
megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
plugins = [
NLPDDPPlugin(
num_nodes=cfg.trainer.num_nodes,
no_ddp_communication_hook=True,
gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
)
]
if cfg.trainer.precision in [16, 'bf16']:
scaler = None
if cfg.trainer.precision == 16:
scaler = GradScaler(
init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
hysteresis=cfg.model.get('hysteresis', 2),
)
if megatron_amp_o2:
plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))
else:
plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))

trainer = Trainer(plugins=plugins, **cfg.trainer)
if cfg.get('cluster_type', None) == 'BCP':
plugins.append(TorchElasticEnvironment())

trainer = Trainer(plugins=plugins, **cfg.trainer)
exp_manager(trainer, cfg.exp_manager)

app_state = AppState()
if cfg.model.tensor_model_parallel_size > 1 or cfg.model.pipeline_model_parallel_size > 1:
app_state.model_parallel_size = cfg.model.tensor_model_parallel_size * cfg.model.pipeline_model_parallel_size
(
app_state.tensor_model_parallel_rank,
app_state.pipeline_model_parallel_rank,
app_state.model_parallel_size,
_,
) = fake_initialize_model_parallel(
world_size=app_state.model_parallel_size,
rank=trainer.global_rank,
tensor_model_parallel_size_=cfg.model.tensor_model_parallel_size,
pipeline_model_parallel_size_=cfg.model.pipeline_model_parallel_size,
)

# Override timer callback to a stateless one
for idx, callback in enumerate(trainer.callbacks):
if isinstance(callback, Timer):
trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,)

# hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
with open_dict(cfg):
cfg.model.precision = cfg.trainer.precision

model = MegatronGPTModel.restore_from(cfg.restore_from_path, cfg.model, trainer=trainer)

# Init all new prompts
for idx, tag in enumerate(cfg.model.new_prompt_tags):
init_method = cfg.model.new_prompt_init_methods[idx]

if init_method == "text":
init_text = cfg.model.new_prompt_init_text[idx]
model.init_prompt_from_text(tag, init_text)

elif init_method == 'random':
model.init_prompt_from_random(tag)

else:
logging.info(f'\n Soft prompt init method {init_method} is not recognized, please use text or random')

logging.info(f'\nCurrent soft prompts include {model.get_prompt_table()}')
trainer.fit(model)
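
For reference, the docstring above expects json-lines records with prompt_tag, text, and answer fields. The sketch below shows one hypothetical way to produce such a file; the file name and records are made up, and a single tag is used because only one task can be prompt tuned at a time.

# Illustrative only: write a prompt-tuning dataset in the json-lines format the
# docstring describes. File name and records are hypothetical examples.
import json

records = [
    {"prompt_tag": "BoolQ", "text": "passage: ... question: ... answer:", "answer": " True"},
    {"prompt_tag": "BoolQ", "text": "passage: ... question: ... answer:", "answer": " False"},
]

with open("boolq_prompt_tuning_train.jsonl", "w") as fout:
    for record in records:
        fout.write(json.dumps(record) + "\n")
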

