move to nvidia megatron repo (NVIDIA#6465) (NVIDIA#6475)
Signed-off-by: Abhinav Khattar <aklife97@gmail.com>
Co-authored-by: Abhinav Khattar <aklife97@gmail.com>
Signed-off-by: hsiehjackson <c2hsieh@ucsd.edu>
2 people authored and hsiehjackson committed Jun 2, 2023
1 parent 82c438b commit a8564d3
Showing 10 changed files with 16 additions and 2 deletions.
3 changes: 2 additions & 1 deletion Dockerfile
@@ -44,8 +44,9 @@ RUN apt-get update && \

WORKDIR /workspace/
# Install Megatron-core
- RUN git clone https://github.com/aklife97/Megatron-LM.git && \
+ RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
+ git checkout 3db2063b1ff992a971ba18f7101eecc9c4e90f03 && \
pip install -e .

WORKDIR /tmp/
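This change swaps the personal fork for the upstream NVIDIA repository and pins Megatron-core to a fixed commit, so image builds stay reproducible. Below is a minimal sanity-check sketch, not part of the commit: it assumes the clone lives at /workspace/Megatron-LM as in the Dockerfile above, and simply confirms that the checked-out hash matches the pin.

import subprocess

# Commit pinned by the Dockerfile above.
EXPECTED_COMMIT = "3db2063b1ff992a971ba18f7101eecc9c4e90f03"

def megatron_commit(repo_dir: str = "/workspace/Megatron-LM") -> str:
    """Return the HEAD commit hash of the Megatron-LM clone."""
    result = subprocess.run(
        ["git", "-C", repo_dir, "rev-parse", "HEAD"],
        capture_output=True, text=True, check=True,
    )
    return result.stdout.strip()

if __name__ == "__main__":
    head = megatron_commit()
    assert head == EXPECTED_COMMIT, f"unexpected Megatron-LM commit: {head}"
    print("Megatron-core is pinned correctly:", head)

The same pin is applied to the CI checkout in the Jenkinsfile below.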
3 changes: 2 additions & 1 deletion Jenkinsfile
@@ -60,8 +60,9 @@ pipeline {
// TODO: remove when pip package is available
stage('Megatron Core installation') {
steps {
- sh 'git clone https://github.com/aklife97/Megatron-LM.git && \
+ sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
+ git checkout 3db2063b1ff992a971ba18f7101eecc9c4e90f03 && \
pip install -e .'
}
}
@@ -311,6 +311,7 @@ def training_step(self, dataloader_iter, batch_idx):
dtype=self.autocast_dtype,
grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
sequence_parallel=self.cfg.get('sequence_parallel', False),
+ enable_autocast=True,
)

if losses_reduced_per_micro_batch:
@@ -411,6 +412,7 @@ def validation_step(self, dataloader_iter, batch_idx):
tensor_shape=tensor_shape,
dtype=self.autocast_dtype,
sequence_parallel=self.cfg.get('sequence_parallel', False),
+ enable_autocast=True,
)

if losses_reduced_per_micro_batch:
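The enable_autocast=True argument added in these hunks asks the forward-backward schedule to run the forward pass inside a torch.autocast region using the model's autocast_dtype, rather than relying on the caller to have set up mixed precision. Roughly, the effect is the pattern sketched below; this is a simplified illustration under that assumption, not the actual Megatron-LM schedule code, and run_forward/forward_step are placeholder names.

from contextlib import nullcontext

import torch

def run_forward(forward_step, batch, dtype, enable_autocast: bool):
    # With autocast enabled and a reduced-precision dtype requested, wrap the
    # forward step in a torch.autocast region; otherwise run it unchanged.
    if enable_autocast and dtype in (torch.float16, torch.bfloat16):
        ctx = torch.autocast(device_type="cuda", dtype=dtype)
    else:
        ctx = nullcontext()
    with ctx:
        return forward_step(batch)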
@@ -300,6 +300,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
dtype=self.autocast_dtype,
grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
sequence_parallel=self.cfg.get('sequence_parallel', False),
+ enable_autocast=True,
)

# only the last stages of the pipeline return losses
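The grad_scaler=... if self.cfg.precision == 16 else None expression that accompanies these calls follows the usual AMP rule: FP16 gradients need loss scaling to avoid underflow, while BF16 does not. A hedged sketch of that idiom in plain PyTorch, with model, optimizer, and loss_fn as placeholder names rather than anything from this diff:

import torch

def training_step_amp(model, optimizer, loss_fn, batch, use_fp16: bool):
    # FP16 needs a GradScaler to avoid gradient underflow; BF16 does not.
    scaler = torch.cuda.amp.GradScaler() if use_fp16 else None
    dtype = torch.float16 if use_fp16 else torch.bfloat16

    with torch.autocast(device_type="cuda", dtype=dtype):
        loss = loss_fn(model(batch))

    if scaler is not None:
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    else:
        loss.backward()
        optimizer.step()
    optimizer.zero_grad()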
@@ -375,6 +375,7 @@ def training_step(self, dataloader_iter, batch_idx):
dtype=self.autocast_dtype,
grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
sequence_parallel=self.cfg.get('sequence_parallel', False),
+ enable_autocast=True,
)

# only the last stages of the pipeline return losses
@@ -656,6 +657,7 @@ def validation_step(self, dataloader_iter, batch_idx):
tensor_shape=tensor_shape,
dtype=self.autocast_dtype,
sequence_parallel=self.cfg.get('sequence_parallel', False),
+ enable_autocast=True,
)

# only the last stage of the pipeline returns losses
@@ -309,6 +309,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
dtype=self.autocast_dtype,
grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
sequence_parallel=self.cfg.get('sequence_parallel', False),
+ enable_autocast=True,
)

# only the last stages of the pipeline return losses
@@ -328,6 +328,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
decoder_seq_length=self.max_decoder_seq_length,
dtype=self.autocast_dtype,
grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
+ enable_autocast=True,
)

# only the last stages of the pipeline return losses
@@ -991,6 +992,7 @@ def dummy():
num_microbatches=1,
decoder_seq_length=encoder_seq_length,
dtype=self.autocast_dtype,
+ enable_autocast=True,
)

if output_tensor:
@@ -1154,6 +1156,7 @@ def dummy():
num_microbatches=1,
decoder_seq_length=encoder_seq_length,
dtype=self.autocast_dtype,
+ enable_autocast=True,
)
# get output tensor
if parallel_state.is_pipeline_last_stage():
@@ -197,6 +197,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
dtype=self.autocast_dtype,
grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
sequence_parallel=self.cfg.get('sequence_parallel', False),
+ enable_autocast=True,
)

# only the last stages of the pipeline return losses
@@ -316,6 +316,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
dtype=self.autocast_dtype,
grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
sequence_parallel=self.cfg.get('sequence_parallel', False),
+ enable_autocast=True,
)

# only the last stages of the pipeline return losses
@@ -62,6 +62,7 @@ def forward_step(self, batch, tensor_shape):
forward_only=True,
tensor_shape=tensor_shape,
dtype=self.model.autocast_dtype,
+ enable_autocast=True,
)

return output_tensor
