move to nvidia megatron repo (NVIDIA#6465) (NVIDIA#6475)
Signed-off-by: Abhinav Khattar <aklife97@gmail.com>
Co-authored-by: Abhinav Khattar <aklife97@gmail.com>
Signed-off-by: hsiehjackson <c2hsieh@ucsd.edu>
2 people authored and hsiehjackson committed Jun 2, 2023
1 parent 82c438b commit a8564d3
Showing 10 changed files with 16 additions and 2 deletions.
3 changes: 2 additions & 1 deletion Dockerfile
@@ -44,8 +44,9 @@ RUN apt-get update && \

WORKDIR /workspace/
# Install Megatron-core
- RUN git clone https://github.com/aklife97/Megatron-LM.git && \
+ RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
+ git checkout 3db2063b1ff992a971ba18f7101eecc9c4e90f03 && \
pip install -e .

WORKDIR /tmp/
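This change swaps the personal fork for the upstream NVIDIA repository and pins Megatron-core to a fixed commit, so image builds stay reproducible. Below is a minimal sanity-check sketch, not part of the commit: it assumes the clone lives at /workspace/Megatron-LM as in the Dockerfile above, and simply confirms that the checked-out hash matches the pin.

import subprocess

# Commit pinned by the Dockerfile above.
EXPECTED_COMMIT = "3db2063b1ff992a971ba18f7101eecc9c4e90f03"

def megatron_commit(repo_dir: str = "/workspace/Megatron-LM") -> str:
    """Return the HEAD commit hash of the Megatron-LM clone."""
    result = subprocess.run(
        ["git", "-C", repo_dir, "rev-parse", "HEAD"],
        capture_output=True, text=True, check=True,
    )
    return result.stdout.strip()

if __name__ == "__main__":
    head = megatron_commit()
    assert head == EXPECTED_COMMIT, f"unexpected Megatron-LM commit: {head}"
    print("Megatron-core is pinned correctly:", head)

The same pin is applied to the CI checkout in the Jenkinsfile below.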
3 changes: 2 additions & 1 deletion Jenkinsfile
@@ -60,8 +60,9 @@ pipeline {
// TODO: remove when pip package is available
stage('Megatron Core installation') {
steps {
- sh 'git clone https://github.com/aklife97/Megatron-LM.git && \
+ sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
+ git checkout 3db2063b1ff992a971ba18f7101eecc9c4e90f03 && \
pip install -e .'
}
}
@@ -311,6 +311,7 @@ def training_step(self, dataloader_iter, batch_idx):
dtype=self.autocast_dtype,
grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
sequence_parallel=self.cfg.get('sequence_parallel', False),
+ enable_autocast=True,
)

if losses_reduced_per_micro_batch:
@@ -411,6 +412,7 @@ def validation_step(self, dataloader_iter, batch_idx):
tensor_shape=tensor_shape,
dtype=self.autocast_dtype,
sequence_parallel=self.cfg.get('sequence_parallel', False),
+ enable_autocast=True,
)

if losses_reduced_per_micro_batch:
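The enable_autocast=True argument added in these hunks asks the forward-backward schedule to run the forward pass inside a torch.autocast region using the model's autocast_dtype, rather than relying on the caller to have set up mixed precision. Roughly, the effect is the pattern sketched below; this is a simplified illustration under that assumption, not the actual Megatron-LM schedule code, and run_forward/forward_step are placeholder names.

from contextlib import nullcontext

import torch

def run_forward(forward_step, batch, dtype, enable_autocast: bool):
    # With autocast enabled and a reduced-precision dtype requested, wrap the
    # forward step in a torch.autocast region; otherwise run it unchanged.
    if enable_autocast and dtype in (torch.float16, torch.bfloat16):
        ctx = torch.autocast(device_type="cuda", dtype=dtype)
    else:
        ctx = nullcontext()
    with ctx:
        return forward_step(batch)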
@@ -300,6 +300,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
dtype=self.autocast_dtype,
grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
sequence_parallel=self.cfg.get('sequence_parallel', False),
+ enable_autocast=True,
)

# only the last stages of the pipeline return losses
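The grad_scaler=... if self.cfg.precision == 16 else None expression that accompanies these calls follows the usual AMP rule: FP16 gradients need loss scaling to avoid underflow, while BF16 does not. A hedged sketch of that idiom in plain PyTorch, with model, optimizer, and loss_fn as placeholder names rather than anything from this diff:

import torch

def training_step_amp(model, optimizer, loss_fn, batch, use_fp16: bool):
    # FP16 needs a GradScaler to avoid gradient underflow; BF16 does not.
    scaler = torch.cuda.amp.GradScaler() if use_fp16 else None
    dtype = torch.float16 if use_fp16 else torch.bfloat16

    with torch.autocast(device_type="cuda", dtype=dtype):
        loss = loss_fn(model(batch))

    if scaler is not None:
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    else:
        loss.backward()
        optimizer.step()
    optimizer.zero_grad()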
@@ -375,6 +375,7 @@ def training_step(self, dataloader_iter, batch_idx):
dtype=self.autocast_dtype,
grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
sequence_parallel=self.cfg.get('sequence_parallel', False),
+ enable_autocast=True,
)

# only the last stages of the pipeline return losses
@@ -656,6 +657,7 @@ def validation_step(self, dataloader_iter, batch_idx):
tensor_shape=tensor_shape,
dtype=self.autocast_dtype,
sequence_parallel=self.cfg.get('sequence_parallel', False),
+ enable_autocast=True,
)

# only the last stage of the pipeline returns losses
@@ -309,6 +309,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
dtype=self.autocast_dtype,
grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
sequence_parallel=self.cfg.get('sequence_parallel', False),
+ enable_autocast=True,
)

# only the last stages of the pipeline return losses
@@ -328,6 +328,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
decoder_seq_length=self.max_decoder_seq_length,
dtype=self.autocast_dtype,
grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
+ enable_autocast=True,
)

# only the last stages of the pipeline return losses
@@ -991,6 +992,7 @@ def dummy():
num_microbatches=1,
decoder_seq_length=encoder_seq_length,
dtype=self.autocast_dtype,
+ enable_autocast=True,
)

if output_tensor:
@@ -1154,6 +1156,7 @@ def dummy():
num_microbatches=1,
decoder_seq_length=encoder_seq_length,
dtype=self.autocast_dtype,
+ enable_autocast=True,
)
# get output tensor
if parallel_state.is_pipeline_last_stage():
@@ -197,6 +197,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
dtype=self.autocast_dtype,
grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
sequence_parallel=self.cfg.get('sequence_parallel', False),
+ enable_autocast=True,
)

# only the last stages of the pipeline return losses
@@ -316,6 +316,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
dtype=self.autocast_dtype,
grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
sequence_parallel=self.cfg.get('sequence_parallel', False),
+ enable_autocast=True,
)

# only the last stages of the pipeline return losses
@@ -62,6 +62,7 @@ def forward_step(self, batch, tensor_shape):
forward_only=True,
tensor_shape=tensor_shape,
dtype=self.model.autocast_dtype,
+ enable_autocast=True,
)

return output_tensor
