From b761e44c884e06d061a5c1f1034ca19dad5f93ab Mon Sep 17 00:00:00 2001 From: ForFishes <2282912238@qq.com> Date: Wed, 6 Dec 2023 17:49:25 +0800 Subject: [PATCH] add pp bug report --- .../paddle/distributed/fleet/utils/tensor_fusion_helper.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py index 1ca1b8dfbdfac..d93e8e7220fed 100644 --- a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py +++ b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py @@ -372,8 +372,9 @@ def add_grad(self, param, use_comm=True): raise ValueError( "The address of the grad/main_grad of the param has been changed during training, " "which is not allowed for dp/sharding overlap with pp. " - "This may be caused by some non-inplace operations on the grad/main_grad. " - "Please use the inplace version of the operations or disable the overlapping." + "This may be caused by some non-inplace operations on the grad/main_grad. Here are some examples: " + "1. The grad/main_grad of the param is changed by other operations, such as: clear_grad, " + "2. Using non-inplace operations on the grad/main_grad, such as: add, sub, mul, div, etc. " ) self._params_step_dict[param.name] += 1