diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index f0cbbd9278345f..bbac67dde4d44b 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -49,7 +49,7 @@ def __init__(self, clip, hcg): self.not_sharding_stage1 = True self._vpp_chunk_num = None self._force_align_vpp_grad_sum_order = distutils.util.strtobool( - os.getenv('FLAGS_force_align_vpp_grad_sum_order', '1') + os.getenv('FLAGS_force_align_vpp_grad_sum_order', '0') ) def _get_vpp_chunk_num(self, params_grads): @@ -168,9 +168,10 @@ def _add_sum_squares(self, sum_squares): @no_grad() def _dygraph_clip(self, params_grads): - chunk_num = self._get_vpp_chunk_num(params_grads) - if chunk_num > 0 and self._force_align_vpp_grad_sum_order: - return self._vpp_dygraph_clip(params_grads, chunk_num) + if self._force_align_vpp_grad_sum_order: + chunk_num = self._get_vpp_chunk_num(params_grads) + if chunk_num > 0: + return self._vpp_dygraph_clip(params_grads, chunk_num) sum_square_dist_fp16 = [] sum_square_dist_bf16 = []