Remove explicit config of align_param_gather in mixtral recipe and use default

guyueh1 committed Jan 9, 2025
1 parent 88c16c3 commit 2b8114b
Showing 1 changed file with 1 addition and 11 deletions.
nemo/collections/llm/recipes/mixtral_8x7b.py

@@ -210,20 +210,10 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
     It may not be suitable for all hardware configurations or use cases.
     """
 
-    # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically
-    # by MegatronCommOverlapCallback. They are added here for user's knowledge.
-    # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step.
-    # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else
-    # each PP stage launches independently as needed.
-
     recipe.trainer.callbacks.extend(
         [
             run.Config(MegatronTokenDropCallback),
-            run.Config(
-                MegatronCommOverlapCallback,
-                overlap_param_gather_with_optimizer_step=False,  # Currently disabled due to issue with checkpointing.
-                align_param_gather=True,
-            ),
+            run.Config(MegatronCommOverlapCallback),
         ]
     )
     recipe.trainer.strategy.expert_model_parallel_size = 1
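Note: after this change the recipe relies on MegatronCommOverlapCallback's own defaults for these two parameters. A user who wants to pin the previous explicit behavior could still pass the arguments directly. A minimal sketch, assuming the surrounding context of pretrain_performance_optimizations in mixtral_8x7b.py (where run, MegatronTokenDropCallback, and MegatronCommOverlapCallback are already imported):

    # Sketch only: re-adds the explicit arguments this commit removes,
    # pinning the old behavior instead of relying on callback defaults.
    recipe.trainer.callbacks.extend(
        [
            run.Config(MegatronTokenDropCallback),
            run.Config(
                MegatronCommOverlapCallback,
                # Disabled in the old recipe due to a checkpointing issue.
                overlap_param_gather_with_optimizer_step=False,
                # True: all PP stages launch param all-gathers simultaneously;
                # False: each PP stage launches independently as needed.
                align_param_gather=True,
            ),
        ]
    )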
