
merge main
Signed-off-by: Evelina <ebakhturina@nvidia.com>
ekmb committed Jun 1, 2023
2 parents a031e9b + 2b6777f commit 9a19637
Showing 10 changed files with 57 additions and 11 deletions.
25 changes: 24 additions & 1 deletion Jenkinsfile
@@ -3407,7 +3407,30 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.num_nodes=1"
}
}

stage('L2: Megatron GPT SFT Eval (inference seq len > training seq len)') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
steps{
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py \
model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \
model.peft.restore_from_path=null \
model.data.test_ds.file_names=['/home/TestData/nlp/megatron_gpt_sft/sample.jsonl'] \
model.data.test_ds.names=['test'] \
model.data.test_ds.global_batch_size=1 \
model.data.test_ds.micro_batch_size=1 \
model.data.test_ds.tokens_to_generate=30 \
model.data.test_ds.max_seq_length=6000 \
inference.greedy=True \
inference.repetition_penalty=1.0 \
inference.outfile_path='examples/nlp/language_modeling/out.jsonl' && \
rm -rf examples/nlp/language_modeling/out.jsonl"
}
}
stage('L2: Megatron GPT Prompt Tuning TP1 PP1') {
when {
anyOf {
2 changes: 1 addition & 1 deletion examples/nlp/language_modeling/megatron_gpt_eval.py
@@ -263,7 +263,7 @@ def main(cfg) -> None:
print(response)
print("***************************")

# Second method of running text generation, call trainer.predict
# Second method of running text generation, call trainer.predict [recommended]
ds = RequestDataSet(OmegaConf.to_container(cfg.prompts))
request_dl = DataLoader(dataset=ds, batch_size=2)
config = OmegaConf.to_container(cfg.inference)
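The recommended trainer.predict path hands the DataLoader a plain list of prompt strings. A self-contained sketch of that setup (toy prompts; this mirrors the small RequestDataSet defined in megatron_gpt_eval.py, and the actual trainer/model calls are omitted):

from torch.utils.data import Dataset, DataLoader

class RequestDataSet(Dataset):
    # mirrors the tiny dataset in megatron_gpt_eval.py: it just yields prompt strings
    def __init__(self, sentences):
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx]

prompts = ["Tell me about Megatron GPT.", "What is pipeline parallelism?"]
request_dl = DataLoader(dataset=RequestDataSet(prompts), batch_size=2)
for batch in request_dl:
    print(batch)  # a list of strings -- the same batch shape predict_step receives later in this diff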
@@ -127,4 +127,4 @@ inference:
repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False
outfile_path: /home/adithyare/exp/foo.txt
outfile_path: output.txt
@@ -29,7 +29,7 @@ exp_manager:
monitor: validation_${model.data.validation_ds.metric.name}
save_top_k: 2
mode: max
save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below,
save_nemo_on_train_end: False
filename: 'megatron_gpt_sft--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}'
model_parallel_size: ${model.tensor_model_parallel_size}
save_best_model: True
7 changes: 7 additions & 0 deletions examples/nlp/language_modeling/tuning/megatron_gpt_sft.py
@@ -61,6 +61,8 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False):
gpt_cfg.hidden_dropout = cfg.model.get('hidden_dropout', 0.0)
gpt_cfg.attention_dropout = cfg.model.get('attention_dropout', 0.0)
gpt_cfg.ffn_dropout = cfg.model.ffn_dropout
sft_cls = MegatronGPTSFTModel
gpt_cfg.target = f"{sft_cls.__module__}.{sft_cls.__name__}"

# This is needed when modifying a hparam file directly to load `.ckpt` files.
# This is not needed to modify the cfg in `.nemo` files.
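Setting gpt_cfg.target to the fully qualified class path lets NeMo re-import MegatronGPTSFTModel when the saved config is restored. A minimal, self-contained illustration of the f"{cls.__module__}.{cls.__name__}" pattern (the class here is hypothetical, not from the diff):

import importlib

class MyModel:
    pass

# compose the dotted path, as done for MegatronGPTSFTModel above
target = f"{MyModel.__module__}.{MyModel.__name__}"   # e.g. "__main__.MyModel"

# later, the path can be split back into module + class and re-imported
module_name, class_name = target.rsplit(".", 1)
cls = getattr(importlib.import_module(module_name), class_name)
assert cls is MyModel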
@@ -167,6 +169,10 @@ def main(cfg) -> None:

trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)

# hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
with open_dict(cfg):
cfg.model.precision = cfg.trainer.precision

if cfg.model.restore_from_path:
save_restore_connector = NLPSaveRestoreConnector()
if os.path.isdir(cfg.model.restore_from_path):
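Because the ${trainer.precision} interpolation key is lost when PTL saves hparams, the value is copied explicitly under open_dict. A minimal OmegaConf sketch of that pattern (toy config values, not the real NeMo config):

from omegaconf import OmegaConf, open_dict

cfg = OmegaConf.create({"trainer": {"precision": "bf16"}, "model": {}})
OmegaConf.set_struct(cfg, True)  # struct mode rejects new keys, as NeMo configs do

with open_dict(cfg):
    # copy the concrete value instead of relying on a ${trainer.precision} interpolation
    cfg.model.precision = cfg.trainer.precision

print(cfg.model.precision)  # bf16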
@@ -177,6 +183,7 @@ def main(cfg) -> None:
return_config=True,
save_restore_connector=save_restore_connector,
)
gpt_cfg = _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False)
model = load_from_nemo(MegatronGPTSFTModel, cfg, trainer, gpt_cfg, modify_confg_fn=_modify_config)
else:
validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint)
@@ -169,7 +169,7 @@ def _process_example(self, example):
tokenized_text = pre_pad + self.tokenizer.text_to_ids(text)
context_ids = pre_pad + self.tokenizer.text_to_ids(context)
answer_ids = tokenized_text[len(context_ids) :]
total_ids = len(context_ids) + max(len(answer_ids), self.tokens_to_generate)
total_ids = len(context_ids) + len(answer_ids)
if self.add_bos:
total_ids += 1
if self.add_sep:
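With this change the example length no longer reserves tokens_to_generate slots, so truncation only applies when the real context + answer (plus any special tokens) exceed max_seq_length. A toy walk-through with made-up numbers and the usual BOS/SEP/EOS flags:

context_ids = list(range(10))   # 10 context token ids (made-up values)
answer_ids = list(range(4))     # 4 answer token ids
add_bos, add_sep, add_eos = True, False, True

total_ids = len(context_ids) + len(answer_ids)
total_ids += int(add_bos) + int(add_sep) + int(add_eos)
print(total_ids)  # 16 -- compared against max_seq_length to decide whether to truncate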
@@ -550,7 +550,13 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int]
return compute_prob_response
else:
del inference_config['compute_logprob']
inference_config['inputs'] = (batch['contexts'].cuda(), batch['context_lengths'].cuda())

# for megatron_gpt_eval.py
if isinstance(batch, list):
inference_config['inputs'] = batch
else:
# peft_eval.py
inference_config['inputs'] = (batch['contexts'].cuda(), batch['context_lengths'].cuda())
return generate(self, **inference_config)

def write_predictions_to_file(self, outputs, output_file_path_prefix):
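The new branch lets predict_step accept both the list-of-strings batches produced by megatron_gpt_eval.py and the tokenized dict batches used by the PEFT eval path. A hedged illustration of the two shapes (tensor contents are toy values):

import torch

eval_batch = ["What is the capital of France?"]        # list of prompt strings
peft_batch = {                                          # dict of pre-tokenized tensors
    "contexts": torch.randint(0, 100, (1, 8)),
    "context_lengths": torch.tensor([8]),
}

for batch in (eval_batch, peft_batch):
    if isinstance(batch, list):
        inputs = batch
    else:
        inputs = (batch["contexts"], batch["context_lengths"])  # .cuda() omitted in this CPU-only sketch
    print(type(inputs).__name__)  # list, then tuple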
2 changes: 1 addition & 1 deletion nemo/collections/nlp/modules/common/megatron/module.py
@@ -290,7 +290,7 @@ def forward(self, *inputs, **kwargs):
if getattr(self.module, 'pre_process', True):
inputs = fp32_to_float16(inputs, self.float16_converter)
outputs = self.module(*inputs, **kwargs)
if parallel_state.is_pipeline_last_stage():
if parallel_state.is_pipeline_last_stage() and self.training:
outputs = float16_to_fp32(outputs)
return outputs

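The forward pass now casts outputs back to fp32 on the last pipeline stage only while training; during generation the cast is deferred to the text-generation utilities further down in this diff. A toy mirror of that gating (standalone function, dummy tensors):

import torch

def maybe_cast_outputs(outputs, is_pipeline_last_stage, training):
    # cast half-precision outputs to fp32 only for the training loss path
    if is_pipeline_last_stage and training:
        return outputs.float()
    return outputs

x = torch.randn(2, 4, dtype=torch.float16)
print(maybe_cast_outputs(x, True, True).dtype)   # torch.float32
print(maybe_cast_outputs(x, True, False).dtype)  # torch.float16 (inference keeps half precision here)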
@@ -181,8 +181,11 @@ def __init__(self, model):

def clip_max_len(self, maxlen: int) -> int:
""" clip the max len based on the LM model max sequence length"""
if maxlen > self.model.cfg.encoder_seq_length + 1:
maxlen = self.model.cfg.encoder_seq_length + 1

# for positional embedding types that allow length extrapolation, don't clip the max length
if self.model.cfg.get("position_embedding_type", "learned_absolute") == "learned_absolute":
if maxlen > self.model.cfg.encoder_seq_length + 1:
maxlen = self.model.cfg.encoder_seq_length + 1
return maxlen

def init_batch(self, context_tokens: torch.Tensor, context_length: int):
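Only learned absolute position embeddings are tied to the trained sequence length; schemes such as RoPE can extrapolate, which is what the new Jenkins stage with max_seq_length=6000 exercises. A standalone sketch of the clipping rule (config values are made up):

def clip_max_len(maxlen, position_embedding_type="learned_absolute", encoder_seq_length=4096):
    # clip only when the embedding type cannot extrapolate past the training length
    if position_embedding_type == "learned_absolute":
        maxlen = min(maxlen, encoder_seq_length + 1)
    return maxlen

print(clip_max_len(6000, "learned_absolute"))  # 4097 -- clipped to encoder_seq_length + 1
print(clip_max_len(6000, "rope"))              # 6000 -- left unclipped for extrapolating embeddings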
11 changes: 9 additions & 2 deletions nemo/collections/nlp/modules/common/text_generation_utils.py
@@ -606,6 +606,11 @@ def switch(val1, val2, boolean):
return (1 - boolean) * val1 + boolean * val2


def _convert_to_float(model):
# enable conversion to float when inference is done via model.generate() and PP > 1 (may result in larger memory consumption)
return model.cfg.get('pipeline_model_parallel_size', 1) > 1 and model._inference_config is None


def sample_sequence_batch(
model,
inference_strategy,
@@ -667,10 +672,12 @@ def sample_sequence_batch(
output = inference_strategy.forward_step(batch, tensor_shape)

if parallel_state.is_pipeline_last_stage():
output = output[0]['logits'].float()
output = output[0]['logits']
if _convert_to_float(model):
output = output.float()

output = tensor_parallel.gather_from_tensor_model_parallel_region(output)
assert output is not None
output = output.float()
logits = output[:, -1].view(batch_size, -1).contiguous()

# make sure it will generate at least min_length
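The helper keeps logits in half precision during generation unless the model.generate() path is used with pipeline parallelism greater than 1, where an early fp32 cast is applied; otherwise the cast happens after the tensor-parallel gather. A toy mirror of the condition (plain arguments instead of the model's cfg and _inference_config):

def convert_to_float(pipeline_model_parallel_size, inference_config):
    # early fp32 cast only for model.generate() (no inference config set) with PP > 1
    return pipeline_model_parallel_size > 1 and inference_config is None

print(convert_to_float(1, None))              # False -- single stage, cast after the gather
print(convert_to_float(2, None))              # True  -- model.generate() with PP > 1
print(convert_to_float(2, {"greedy": True}))  # False -- explicit inference config present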
