Fix the init hanging problem and other issues
aoyulong committed Aug 18, 2024
1 parent e46aa71 commit da50f42
Showing 3 changed files with 6 additions and 5 deletions.
megatron/megatron/core/parallel_state.py: 2 changes (1 addition, 1 deletion)
@@ -768,7 +768,7 @@ def generator_wrapper(group_type, **kwargs):
 _POSITION_EMBEDDING_GROUP = group
 _POSITION_EMBEDDING_GLOBAL_RANKS = position_embedding_ranks

-_LAST_RANK_WHEN_USING_PIPELINE = generator_wrapper('pp')[-1][-1]
+_LAST_RANK_WHEN_USING_PIPELINE = decoder_rank_generator.get_ranks('pp')[-1][-1]

 # Build the tensor + data parallel groups.
 global _TENSOR_AND_DATA_PARALLEL_GROUP
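The change above takes the last pipeline-parallel rank directly from `decoder_rank_generator.get_ranks('pp')` instead of invoking `generator_wrapper('pp')` a second time. The sketch below only illustrates the `[-1][-1]` indexing convention; the group layout is a made-up example, not the real `RankGenerator` output.

```python
# Hypothetical pipeline-parallel groups for 8 ranks and 2 pipeline stages.
# The grouping below is an assumption for illustration only.
pp_group_ranks = [
    [0, 4],  # one pipeline: rank 0 on stage 0, rank 4 on the last stage
    [1, 5],
    [2, 6],
    [3, 7],
]

# get_ranks('pp')[-1][-1]: last rank of the last pipeline group,
# i.e. a rank that lives on the final pipeline stage.
last_rank_when_using_pipeline = pp_group_ranks[-1][-1]
print(last_rank_when_using_pipeline)  # 7
```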
@@ -29,7 +29,7 @@ def get_te_version():
     def get_te_version_str():
         if hasattr(te, '__version__'):
-            return str(te.__version__)
+            return str(te.__version__).split('+')[0]
         else:
             return version("transformer-engine")
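The hunk above keeps only the public part of the Transformer Engine version string, dropping any local label after '+'. A small illustration, with made-up version strings, of why that matters when the result is compared against plain release numbers via `packaging`:

```python
from packaging import version

# Example strings only; not taken from the commit.
raw = "1.9.0+untagged.1234.gabcdef"  # e.g. a source build reporting a local label

print(version.Version(raw) > version.Version("1.9.0"))                # True: a local label sorts above the bare release
print(version.Version(raw.split('+')[0]) > version.Version("1.9.0"))  # False once the label is stripped
```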

@@ -895,7 +895,7 @@ def te_checkpoint(
 def get_cpu_offload_context(
     enabled, num_layers, model_layers, activation_offloading, weight_offloading
 ):
-    if _te_version > packaging.version.Version("1.8.0"):
+    if _te_version > packaging.version.Version("1.9.0"):
         context, sync_func = _get_cpu_offload_context(
             enabled, num_layers, model_layers, activation_offloading, weight_offloading
         )
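Raising the gate from 1.8.0 to 1.9.0 works together with the version-string cleanup above: a 1.9.0 build, including a local 1.9.0 build once its '+' suffix is stripped, no longer takes the five-argument call shown in this hunk. A hedged illustration of how the two changes interact (version strings are examples, not from the commit):

```python
import packaging.version

for raw in ("1.8.0", "1.9.0+devbuild", "1.10.0"):
    te_version = packaging.version.Version(raw.split('+')[0])
    use_new_call = te_version > packaging.version.Version("1.9.0")
    print(f"{raw:>16} -> {'new' if use_new_call else 'old'} cpu-offload call path")
```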
megatron/megatron/training/initialize.py: 5 changes (3 additions, 2 deletions)
@@ -263,8 +263,9 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
     'rank': args.rank,
     'timeout': timedelta(minutes=args.distributed_timeout_minutes),
 }
-if packaging.version.Version(torch.__version__) >= packaging.version.Version("2.3.0"):
-    init_process_group_kwargs['device_id'] = device_id
+# TODO: @aoyulong the init_process_group will be hanging if the device_id is set
+# if packaging.version.Version(torch.__version__) >= packaging.version.Version("2.3.0"):
+#     init_process_group_kwargs['device_id'] = device_id

 torch.distributed.init_process_group(**init_process_group_kwargs)

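The initialize.py change keeps the process-group setup but stops binding it to a device, since passing `device_id` was observed to hang. Below is a minimal, self-contained sketch of that init path; the environment-variable plumbing, the `nccl` backend, and the 10-minute timeout are assumptions standing in for Megatron's parsed args.

```python
import os
from datetime import timedelta

import torch

rank = int(os.environ.get("RANK", "0"))
world_size = int(os.environ.get("WORLD_SIZE", "1"))
local_rank = int(os.environ.get("LOCAL_RANK", "0"))

init_process_group_kwargs = {
    "backend": "nccl",
    "world_size": world_size,
    "rank": rank,
    "timeout": timedelta(minutes=10),
}

# Newer torch releases (the original gate here was 2.3.0) accept a device_id,
# which binds the default group to one GPU so NCCL can set up its communicator
# eagerly. That is exactly the path this commit disables because it hangs:
# init_process_group_kwargs["device_id"] = torch.device("cuda", local_rank)

torch.distributed.init_process_group(**init_process_group_kwargs)
```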
