Skip to content

Commit

Permalink
Merge branch 'r2.0.0rc1' of github.com:NVIDIA/NeMo into ashors/remove-te-apex-deps
Browse files Browse the repository at this point in the history
  • Loading branch information
ashors1 committed Jul 9, 2024
2 parents 12c1cf6 + f9c3a8b commit 7ef94a1
Show file tree
Hide file tree
Showing 50 changed files with 3,170 additions and 342 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/_test_template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ on:
jobs:
main:
runs-on: ${{ inputs.RUNNER }}
timeout-minutes: ${{ inputs.TIMEOUT }}
outputs:
conclusion: ${{ steps.main.conclusion }}
log: ${{ steps.main.outputs.log }}
Expand All @@ -54,6 +53,7 @@ jobs:
uses: actions/checkout@v4
- id: main
name: Run main script
timeout-minutes: ${{ inputs.TIMEOUT }}
run: |
set +e
(
Expand Down
75 changes: 75 additions & 0 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3488,6 +3488,80 @@ jobs:
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
# CI job: Megatron-Core T5 pretraining followed by a resume run, with
# tensor-model-parallel size 2 on 2 devices.
#
# SCRIPT invokes megatron_t5_pretraining.py twice with identical overrides
# (model.mcore_t5=True, model.transformer_engine=True, max_steps=10); the
# second invocation additionally sets exp_manager.resume_if_exists=True and
# reuses the same exp_dir. Both invocations export NVTE_FUSED_ATTN=0 and
# NVTE_FLASH_ATTN=0. AFTER_SCRIPT removes the results and index-mapping
# directories the runs write to.
#
# NOTE(review): indentation was reconstructed from a flattened listing; in
# the workflow file this job is presumably nested under `jobs:` - confirm.
L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \
      trainer.devices=2 \
      trainer.log_every_n_steps=1 \
      trainer.max_epochs=null \
      trainer.max_steps=10 \
      trainer.val_check_interval=10 \
      trainer.accumulate_grad_batches=1 \
      trainer.precision=bf16 \
      model.megatron_amp_O2=True \
      exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
      model.mcore_t5=True \
      model.transformer_engine=True \
      model.tensor_model_parallel_size=2 \
      model.micro_batch_size=4 \
      model.global_batch_size=4 \
      model.seq_length=128 \
      model.encoder.num_layers=4 \
      model.encoder.hidden_size=64 \
      model.encoder.num_attention_heads=8 \
      model.decoder.num_layers=4 \
      model.decoder.hidden_size=64 \
      model.decoder.num_attention_heads=8 \
      model.encoder.transformer_block_type='pre_ln' \
      model.decoder.transformer_block_type='pre_ln' \
      model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
      model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
      model.data.data_impl=text_mmap \
      +model.data.data_impl_kwargs.newline_int=10 \
      +model.data.data_impl_kwargs.header_lines=0 \
      +model.data.data_impl_kwargs.workers=null \
      +model.data.data_impl_kwargs.sort_dataset_paths=False
      NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \
      trainer.devices=2 \
      trainer.log_every_n_steps=1 \
      trainer.max_epochs=null \
      trainer.max_steps=10 \
      trainer.val_check_interval=10 \
      trainer.accumulate_grad_batches=1 \
      trainer.precision=bf16 \
      model.megatron_amp_O2=True \
      exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
      exp_manager.resume_if_exists=True \
      model.mcore_t5=True \
      model.transformer_engine=True \
      model.tensor_model_parallel_size=2 \
      model.micro_batch_size=4 \
      model.global_batch_size=4 \
      model.seq_length=128 \
      model.encoder.num_layers=4 \
      model.encoder.hidden_size=64 \
      model.encoder.num_attention_heads=8 \
      model.decoder.num_layers=4 \
      model.decoder.hidden_size=64 \
      model.decoder.num_attention_heads=8 \
      model.encoder.transformer_block_type='pre_ln' \
      model.decoder.transformer_block_type='pre_ln' \
      model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
      model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
      model.data.data_impl=text_mmap \
      +model.data.data_impl_kwargs.newline_int=10 \
      +model.data.data_impl_kwargs.header_lines=0 \
      +model.data.data_impl_kwargs.workers=null \
      +model.data.data_impl_kwargs.sort_dataset_paths=False
    AFTER_SCRIPT: |
      rm -rf examples/nlp/language_modeling/t5_pretrain_results
      rm -rf examples/nlp/language_modeling/t5_index_mappings
L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
Expand Down Expand Up @@ -4475,6 +4549,7 @@ jobs:
- L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2
- L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2
- L2_Megatron_T5_Pretraining_and_Resume_Training_TP2
- L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_Pretraining_and_Resume_Training_PP2
Expand Down
8 changes: 0 additions & 8 deletions examples/multimodal/convert_ckpt_to_nemo.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,14 +165,6 @@ def convert(local_rank, rank, world_size, args):
model = MegatronControlNet.load_from_checkpoint(
checkpoint_path, hparams_file=args.hparams_file, trainer=trainer
)
elif args.model_type == 'kosmos':
model = MegatronKosmosModel.load_from_checkpoint(
checkpoint_path, hparams_file=args.hparams_file, trainer=trainer
)
elif args.model_type == 'neva':
model = MegatronNevaModel.load_from_checkpoint(
checkpoint_path, hparams_file=args.hparams_file, trainer=trainer
)
else:
raise ValueError(f"Unrecognized model_type {args.model_type}.")

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,50 @@
# Trainer settings for the run (single node, single GPU, fp32 by default).
# The inline comments refer to "PTL" defaults, so these keys are presumably
# forwarded to the PyTorch Lightning Trainer - confirm against the consumer.
# Logging and checkpointing are disabled here because exp_manager provides
# them (see the `logger` comment below).
trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  precision: 32
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
  max_steps: 375000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10
  val_check_interval: 100
  check_val_every_n_epoch: null
  limit_val_batches: 50
  limit_test_batches: 500
  accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
  gradient_clip_val: 1.0
  benchmark: False
  enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually

# Experiment-manager settings: log/checkpoint locations, optional W&B
# logging, resume behaviour, checkpoint callback parameters and EMA options.
#
# NOTE(review): nesting was reconstructed from a flattened listing. The two
# `name` keys cannot both live at one level, so `project`/`name` are placed
# under `wandb_logger_kwargs`; the extents of `checkpoint_callback_params`
# and `ema` follow the usual exp_manager layout - confirm against upstream.
exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: megatron_clip
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  # Interpolated from the model section so the path is set in one place.
  resume_from_checkpoint: ${model.resume_from_checkpoint}
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
    save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
    filename: 'megatron_clip--{val_loss:.2f}-{step}-{consumed_samples}'
    # Product of tensor- and pipeline-parallel sizes via the `multiply` resolver.
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
  ema:
    enable: False
    decay: 0.9999
    validate_original_weights: False
    every_n_steps: 1
    cpu_offload: False

model:
precision: 32
# specify micro_batch_size, global_batch_size, and model parallelism
Expand All @@ -19,6 +66,9 @@ model:
local_loss: False # calculate loss w/ local features @ global (instead of realizing full global @ global matrix)
gather_with_grad: True # enable full distributed gradient for feature gather, set this to False may cause convergence issue

mcore_gpt: False
transformer_engine: False

vision:
precision: 32
# vision configs
Expand Down Expand Up @@ -135,7 +185,6 @@ model:
bias_activation_fusion: False
megatron_legacy: True

transformer_engine: False
fp8: False # enables fp8 in TransformerLayer forward
fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ model:
# numerical results as the naïve method.
local_loss: False # calculate loss w/ local features @ global (instead of realizing full global @ global matrix)
gather_with_grad: True # enable full distributed gradient for feature gather, set this to False may cause convergence issue
mcore_gpt: True
transformer_engine: True

vision:
precision: ${trainer.precision}
Expand Down Expand Up @@ -183,7 +185,6 @@ model:
bias_activation_fusion: False
megatron_legacy: False

transformer_engine: False
fp8: False # enables fp8 in TransformerLayer forward
fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ trainer:
num_nodes: 1
accelerator: gpu
logger: False # logger provided by exp_manager
precision: 16 # 16, 32, or bf16
precision: 32 # 16, 32, or bf16

model:
restore_from_path: null # Path to a trained ViT .nemo file
Expand Down
Loading

0 comments on commit 7ef94a1

Please sign in to comment.