Commit

fix ci
li126com committed Jul 9, 2024
1 parent 4b41bb5 commit 97cb5d1
Showing 6 changed files with 9 additions and 7 deletions.
1 change: 0 additions & 1 deletion internlm/core/context/parallel_context.py
@@ -159,7 +159,6 @@ def __init__(self):
         self.virtual_pipeline_parallel_rank = None
         self._expert_parallel_group_names = []
         self.is_evaluating = False
-        self.recompute_forward_no_comm = False
 
     @property
     def config(self):
2 changes: 2 additions & 0 deletions internlm/initialize/launch.py
@@ -294,6 +294,8 @@ def args_sanity_check():
         "torch.tf32",
     ]
 
+    gpc.config._add_item("recompute_forward_no_comm", False)
+
     if "checkpoint" in model:
         if "checkpoint_tp_no_comm" not in model:
             gpc.config.model._add_item("checkpoint_tp_no_comm", True)
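The hunk above pairs with the deletion in parallel_context.py: the flag now lives as a default item on gpc.config rather than as an attribute initialized on the parallel context. A rough sketch of that idea, assuming a hypothetical dict-backed Config stand-in (not InternEvo's actual Config class):

class Config(dict):
    """Hypothetical stand-in for a dict-backed config with attribute access."""

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError as exc:
            raise AttributeError(name) from exc

    def __setattr__(self, name, value):
        self[name] = value

    def _add_item(self, key, value):
        # Register a default so later attribute reads never fail.
        self[key] = value


config = Config()
config._add_item("recompute_forward_no_comm", False)
assert config.recompute_forward_no_comm is False  # safe to read everywhere

# The flag can later be flipped temporarily, e.g. around a recomputed forward.
config.recompute_forward_no_comm = True

Registering the default in args_sanity_check() means every consumer can read gpc.config.recompute_forward_no_comm without first checking whether the attribute exists.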
2 changes: 1 addition & 1 deletion internlm/model/modeling_internlm.py
@@ -216,7 +216,7 @@ def _dropout_and_norm_ffn(_residual, _hidden_states):
         hidden_states = self.mlp(hidden_states)
 
         # pad residual
-        if gpc.recompute_forward_no_comm and is_using_sequence_parallel():
+        if gpc.config.recompute_forward_no_comm and is_using_sequence_parallel():
             residual = padding_residual(residual)
 
         return hidden_states + residual
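The guarded branch above pads the residual only when the recomputed forward skipped communication under sequence parallelism, presumably to reconcile a shape mismatch between the locally sharded residual and the un-reduced MLP output; that reading is an assumption, since padding_residual's body is not part of this diff. A zero-padding illustration of such a mismatch (pad_residual_to is a made-up helper, not the real padding_residual):

import torch
import torch.nn.functional as F


def pad_residual_to(residual: torch.Tensor, full_seq_len: int) -> torch.Tensor:
    """Zero-pad `residual` along the sequence dim (dim 1) up to `full_seq_len`."""
    pad_len = full_seq_len - residual.shape[1]
    if pad_len <= 0:
        return residual
    # F.pad pads the last dims first: (hidden_left, hidden_right, seq_left, seq_right).
    return F.pad(residual, (0, 0, 0, pad_len))


hidden_states = torch.randn(2, 8, 16)  # full sequence length
residual = torch.randn(2, 4, 16)       # local shard under sequence parallelism
out = hidden_states + pad_residual_to(residual, hidden_states.shape[1])
print(out.shape)  # torch.Size([2, 8, 16])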
2 changes: 1 addition & 1 deletion internlm/model/modeling_internlm2.py
@@ -261,7 +261,7 @@ def _dropout_and_norm_ffn(_residual, _hidden_states):
         hidden_states = self.feed_forward(hidden_states)
 
         # pad residual
-        if gpc.recompute_forward_no_comm and is_using_sequence_parallel():
+        if gpc.config.recompute_forward_no_comm and is_using_sequence_parallel():
             residual = padding_residual(residual)
 
         return hidden_states + residual
2 changes: 1 addition & 1 deletion internlm/model/modules/mlp.py
@@ -99,7 +99,7 @@ def forward(self, x):
         else:
             fussed_out = self.fused_w1_w3(x)
             w1_o, w3_o = torch.split(fussed_out, fussed_out.shape[-1] // 2, dim=-1)
-            out = self.w2(Silu(w1_o, w3_o), no_communication=gpc.recompute_forward_no_comm)
+            out = self.w2(Silu(w1_o, w3_o), no_communication=gpc.config.recompute_forward_no_comm)
         return out
 
 
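For readers skimming the hunk above: fused_w1_w3 produces the concatenated gate/up activations in one matmul, which are split and gated before the down-projection w2, with no_communication forwarded to InternEvo's parallel linear layer. A self-contained sketch of the split-and-gate step, assuming Silu(a, b) is the usual gated activation F.silu(a) * b and omitting the parallel-specific keyword:

import torch
import torch.nn as nn
import torch.nn.functional as F


class FusedGateMLP(nn.Module):
    """Hypothetical minimal gated MLP with a fused w1/w3 projection."""

    def __init__(self, dim: int, hidden: int):
        super().__init__()
        self.fused_w1_w3 = nn.Linear(dim, 2 * hidden, bias=False)
        self.w2 = nn.Linear(hidden, dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        fused_out = self.fused_w1_w3(x)
        w1_o, w3_o = torch.split(fused_out, fused_out.shape[-1] // 2, dim=-1)
        # Gate the up-projection with SiLU, then project back down.
        return self.w2(F.silu(w1_o) * w3_o)


x = torch.randn(2, 16)
print(FusedGateMLP(16, 32)(x).shape)  # torch.Size([2, 16])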
7 changes: 4 additions & 3 deletions internlm/solver/activation_checkpoint.py
@@ -47,8 +47,8 @@ def recompute_forward_context(args, no_communication):
     handle = None
     try:
         # Set True when entering the context
-        if no_communication:
-            gpc.recompute_forward_no_comm = True
+        if no_communication and hasattr(gpc.config, "recompute_forward_no_comm"):
+            gpc.config.recompute_forward_no_comm = True
             if is_using_sequence_parallel():
                 # overlap all_gather
                 grad_output = args[0]
@@ -58,7 +58,8 @@
         yield
     finally:
         # Set False when exiting the context
-        gpc.recompute_forward_no_comm = False
+        if hasattr(gpc.config, "recompute_forward_no_comm"):
+            gpc.config.recompute_forward_no_comm = False
 
         if handle:
             handle.wait()
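The two hunks above wrap the flag toggle in hasattr checks so the context manager degrades gracefully if the config key was never registered. A condensed sketch of that set-on-enter, reset-in-finally pattern (flag_scope is an illustrative name; the real recompute_forward_context also overlaps an all-gather for sequence parallelism, elided here):

from contextlib import contextmanager


@contextmanager
def flag_scope(config, name, enabled):
    """Temporarily set config.<name> to True, always restoring False on exit."""
    try:
        if enabled and hasattr(config, name):
            setattr(config, name, True)
        yield
    finally:
        # Reset even if the wrapped forward recomputation raises.
        if hasattr(config, name):
            setattr(config, name, False)


# Hypothetical usage mirroring the diff:
# with flag_scope(gpc.config, "recompute_forward_no_comm", no_communication):
#     outputs = run_recomputed_forward(*args)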
