Optimizing data parallel Fuse-Allreduce-Overlapping #48092

Merged: 58 commits, Nov 29, 2022

Changes from all commits
Commits (58)
d73a5eb
add depend
JZ-LIANG Nov 9, 2022
6414f04
add depend
JZ-LIANG Nov 9, 2022
9565b84
add depend
JZ-LIANG Nov 9, 2022
ca8696c
add depend
JZ-LIANG Nov 9, 2022
7234931
add depend
JZ-LIANG Nov 9, 2022
e711b4f
add depend
JZ-LIANG Nov 9, 2022
075eabf
add depend
JZ-LIANG Nov 9, 2022
11e394c
add depend
JZ-LIANG Nov 9, 2022
5c050ca
add depend
JZ-LIANG Nov 9, 2022
eb13147
add depend
JZ-LIANG Nov 9, 2022
c7ca20f
add depend
JZ-LIANG Nov 9, 2022
d29a95c
add depend
JZ-LIANG Nov 9, 2022
e96a93d
add depend
JZ-LIANG Nov 9, 2022
79b2e77
add depend
JZ-LIANG Nov 9, 2022
29a60ab
add origin amp files
JZ-LIANG Nov 10, 2022
b8f3f69
fp16 distinguish None & False
JZ-LIANG Nov 10, 2022
f07fd8a
engine log
JZ-LIANG Nov 14, 2022
e6b1995
engine log
JZ-LIANG Nov 14, 2022
8b4f299
engine log
JZ-LIANG Nov 14, 2022
4ea87d6
engine log
JZ-LIANG Nov 14, 2022
97d490f
engine log
JZ-LIANG Nov 14, 2022
187e8a9
Merge remote-tracking branch 'upstream/develop' into AutoParallel/new…
JZ-LIANG Nov 14, 2022
7149e09
engine log
JZ-LIANG Nov 14, 2022
d994cb8
Merge remote-tracking branch 'upstream/develop' into AutoParallel/new…
JZ-LIANG Nov 14, 2022
34d09b0
log
JZ-LIANG Nov 14, 2022
0a09420
log
JZ-LIANG Nov 14, 2022
0e6a1f6
profile
JZ-LIANG Nov 16, 2022
b6c097b
issued order comm first calc later
JZ-LIANG Nov 16, 2022
6963024
disable comm op seq dep
JZ-LIANG Nov 16, 2022
f0aab8c
dp add deps for graph exe
JZ-LIANG Nov 17, 2022
e80f3e3
dp add deps for graph exe
JZ-LIANG Nov 17, 2022
1a682f1
dp add deps for graph exe
JZ-LIANG Nov 17, 2022
8057858
bugfix in recompute
JZ-LIANG Nov 17, 2022
d0cb4a5
bugfix in recompute
JZ-LIANG Nov 18, 2022
ae44626
bugfix
JZ-LIANG Nov 18, 2022
5f967ce
bugfix
JZ-LIANG Nov 18, 2022
3d96184
bugfix
JZ-LIANG Nov 18, 2022
e0592e0
bugfix
JZ-LIANG Nov 18, 2022
48912f3
bugfix
JZ-LIANG Nov 18, 2022
8c99fb4
bugfix
JZ-LIANG Nov 18, 2022
809c27b
bugfix
JZ-LIANG Nov 18, 2022
aefff08
bugfix
JZ-LIANG Nov 18, 2022
22fccd9
bugfix
JZ-LIANG Nov 18, 2022
bd0483d
bugfix
JZ-LIANG Nov 18, 2022
eefe981
bugfix
JZ-LIANG Nov 18, 2022
14fdb51
bugfix
JZ-LIANG Nov 18, 2022
80c53e0
bugfix
JZ-LIANG Nov 18, 2022
4cec50d
add deps for clip
JZ-LIANG Nov 18, 2022
927079f
add deps for clip
JZ-LIANG Nov 18, 2022
d13c3cc
add deps for clip
JZ-LIANG Nov 18, 2022
53ca0e3
add dep for grad clip
JZ-LIANG Nov 21, 2022
5e8df91
add dep for grad clip
JZ-LIANG Nov 21, 2022
00d8e3b
local
JZ-LIANG Nov 22, 2022
6d5d25b
clean code
JZ-LIANG Nov 22, 2022
5e774ee
Merge remote-tracking branch 'upstream/develop' into AutoParallel/new…
JZ-LIANG Nov 22, 2022
aa846e2
dep ops in comm stream
JZ-LIANG Nov 28, 2022
57e0132
Merge remote-tracking branch 'upstream/develop' into AutoParallel/new…
JZ-LIANG Nov 28, 2022
b9b8755
unitest
JZ-LIANG Nov 28, 2022
70 changes: 60 additions & 10 deletions python/paddle/distributed/auto_parallel/utils.py
@@ -1410,6 +1410,9 @@ def naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
def naive_set_dist_op_attr_for_program_by_mesh(
new_op, process_mesh, ctx, is_recompute=False
):
# hack to skip coalesce var for dist attr
if not is_recompute:
return
assert process_mesh is not None

new_op_dist_attr = OperatorDistributedAttribute()
@@ -2129,13 +2132,13 @@ def insert_dependencies_for_two_ops(
block,
idx,
prior_op,
posterior,
posterior_op,
dist_context,
is_recompute=False,
sync=False,
):
"""
dependency: prior_op should be run before posterior
dependency: prior_op should be run before posterior_op
"""

assert (
@@ -2144,15 +2147,15 @@
str(prior_op)
)
assert (
len(posterior.input_arg_names) >= 1
len(posterior_op.input_arg_names) >= 1
), "second op of dependency should at least have one input. [{}]".format(
str(posterior)
str(posterior_op)
)
prior_op_mesh = dist_context.get_op_dist_attr_for_program(
prior_op
).process_mesh
posterior_mesh = dist_context.get_op_dist_attr_for_program(
posterior
posterior_op
).process_mesh
assert (
prior_op_mesh == posterior_mesh
@@ -2171,25 +2174,72 @@ def _select_best_depend_var(vars):
[block.var(name) for name in prior_op.output_arg_names]
)
second_var = _select_best_depend_var(
[block.var(name) for name in posterior.input_arg_names]
[block.var(name) for name in posterior_op.input_arg_names]
)

return insert_dependencies_for_two_vars(
block,
idx,
first_var,
second_var,
dist_context,
OpRole.Backward,
prior_op_mesh,
is_recompute,
sync,
)


def insert_dependencies_for_two_vars(
block,
idx,
prior_var,
post_var,
dist_context,
oprole,
process_mesh=None,
is_recompute=False,
sync=False,
):
"""
dependency: op that generates prior_var should be run before op that generates post_var
"""
assert block.has_var(prior_var.name)
assert block.has_var(post_var.name)
if process_mesh is None:
process_mesh = dist_context.get_tensor_dist_attr_for_program(
post_var
).process_mesh
assert process_mesh is not None

depend_op = block._insert_op_without_sync(
idx,
type='nop',
inputs={
"X": first_var,
"X": prior_var,
},
outputs={"Out": second_var},
outputs={"Out": post_var},
)
# depend_op.desc.set_type("depend")
depend_op._set_attr(OP_ROLE_KEY, OpRole.Backward)
depend_op._set_attr(OP_ROLE_KEY, oprole)
# depend_op.desc.set_input("Dep", [first_var.name])
# self.desc.set_output(out_proto.name, out_arg_names)

naive_set_dist_op_attr_for_program_by_mesh(
depend_op, prior_op_mesh, dist_context, is_recompute
depend_op, process_mesh, dist_context, is_recompute
)

if sync:
block._sync_with_cpp()

return depend_op


def use_standalone_executor():
return os.environ.get('FLAGS_CONVERT_GRAPH_TO_PROGRAM', None) in [
1,
'1',
True,
'True',
'true',
]
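For orientation, the snippet below is a minimal usage sketch of the new insert_dependencies_for_two_vars helper; it is not part of this PR's diff. It assumes a Paddle build that already includes this patch, and the tiny program, the insertion index, and the OpRole import path are illustrative assumptions rather than anything mandated by the PR.

import paddle
from paddle.distributed.auto_parallel.dist_context import (
    get_default_distributed_context,
)
from paddle.distributed.auto_parallel.utils import (
    insert_dependencies_for_two_vars,
    use_standalone_executor,
)
from paddle.distributed.fleet.meta_optimizers.common import OpRole

paddle.enable_static()
main_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog):
    x = paddle.static.data(name="x", shape=[4, 8], dtype="float32")
    y = paddle.matmul(x, x, transpose_y=True)

block = main_prog.global_block()
dist_ctx = get_default_distributed_context()

# Insert a 'nop' op that reads x and writes y; this extra read/write edge is
# what the pass relies on to make the executor order x's producer before any
# op scheduled after the nop that touches y.
dep_op = insert_dependencies_for_two_vars(
    block,
    len(block.ops),        # insertion index, illustrative
    block.var(x.name),     # prior_var
    block.var(y.name),     # post_var
    dist_ctx,
    OpRole.Backward,
    process_mesh=[-1],     # same trick as the pass: skip real dist attr init
    is_recompute=False,
    sync=True,             # sync the Python block with the C++ desc
)

# Under the standalone executor the dependency op can be pinned to a side
# stream, mirroring what the data parallel pass does for gradient sync.
if use_standalone_executor():
    dep_op.dist_attr.execution_stream = "gradient_sync_stream"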
Changes to the data parallel optimization pass
@@ -27,8 +27,11 @@
find_higher_order_backward_op,
is_loss_grad_op,
is_optimize_op,
is_forward_op,
ring_id_to_process_group,
get_var_numel,
use_standalone_executor,
insert_dependencies_for_two_vars,
)

# add new optimizers supporting rescale_grad here
@@ -87,16 +90,20 @@ def _apply_single_impl(self, main_program, startup_program, context):
self.dist_context = self.get_attr("dist_context")
self.global_rank = int(self.get_attr("global_rank"))
self.use_sharding = self.get_attr("use_sharding")
self.coalesce_prefix = 'coalesce_grad'
if use_standalone_executor():
self.gradient_sync_stream = "gradient_sync_stream"

with paddle.static.program_guard(main_program, startup_program):
self._analyze_program()

# TODO refactor here to first fuse then overlap
if self.is_data_parallel_applied():
self._prune_grad_scaling()
self._calc_comm_overlap()
grad_group = self._fuse_allreduce()

# self.summary(grad_group)
self._add_dependencies(grad_group)
self.summary(grad_group)

def _prune_grad_scaling(self):

@@ -284,7 +291,6 @@ def _comms_overlap_calc(self):
# InterpreterCore has a different logic for overlapping
# which is different from use_calc_stream
block = default_main_program().global_block()
ops = block.ops

# comm wait calc to finish
for idx, op in reversed(list(enumerate(block.ops))):
@@ -294,7 +300,6 @@

op._set_attr('use_calc_stream', False)
ring_id = op.attr("ring_id")

block._insert_op_without_sync(
idx,
type='c_wait_compute',
@@ -307,8 +312,10 @@

def _calc_wait_comms(self):

if use_standalone_executor():
return

block = default_main_program().global_block()
ops = block.ops

# NOTE the naive overlap implement in static hybird parallel only sync comm stream
# at the end of Backward phase, based on a strong constraint that
@@ -325,7 +332,7 @@
ring_id_to_un_sync_grad_map[group.id] = []

# analyze the where need to sync
for i, op in enumerate(ops):
for i, op in enumerate(block.ops):
if is_data_parallel_reduce_op(op):
ring_id = op.attr("ring_id")
grad_name = op.output_arg_names[0]
Expand Down Expand Up @@ -365,6 +372,7 @@ def _calc_wait_comms(self):
outputs={'Out': []},
attrs={'op_role': OpRole.Backward, 'ring_id': ring_id},
)
block._sync_with_cpp()

def _could_be_fuse(self):
# TODO support gradient fuse higher order gradient.
@@ -404,8 +412,6 @@ def _group_grads(self):
def collect_group(cur_group, grad_var, ring_id, i):
if len(cur_group.gradients) == 0:
cur_group = None
elif len(cur_group.gradients) == 1:
grouped_grad_names.remove(cur_group.gradients[0].name)
else:
cur_group.finalize()
grad_groups.append(cur_group)
@@ -451,9 +457,16 @@ def _update_program(self, grad_groups):

for i, group in enumerate(grad_groups[::-1]):

# skip unfused big tensor
if len(group.gradients) <= 1:
group.coalesce_var = group.gradients[0]
continue

# create coalecse tensor
group.coalesce_var = block.create_var(
name=unique_name.generate('coalecse_grad_{}'.format(i)),
name=unique_name.generate(
self.coalesce_prefix + '_{}'.format(i)
),
dtype=group.dtype,
persistable=False,
stop_gradient=True,
@@ -497,7 +510,7 @@ def _update_program(self, grad_groups):
), "Unexception: try to remove op {}".format(
str(block.ops[idx])
)
block._remove_op(idx)
block._remove_op(idx, False)

# insert coalecse op
concated_shapes = []
Expand Down Expand Up @@ -529,6 +542,141 @@ def _update_program(self, grad_groups):
block._sync_with_cpp()
# TODO update dist attr

def _add_dependencies(self, grad_groups):
# NOTE Currently, auto_parallel has to support two executors: the sequential executor (old exe) and the graph-based
# multi-stream executor (standalone exe). This function is only for the standalone exe. Refactor here
# in the future when only one executor remains.

if not use_standalone_executor() or len(grad_groups) == 0:
return
block = default_main_program().global_block()

# Build maps
vars_to_coalesce_map = {}
coalesce_to_vars_map = {}

for group in grad_groups:
grad_names = []
coalesce_name = group.coalesce_var.name
for grad in group.gradients:
vars_to_coalesce_map[grad.name] = coalesce_name
grad_names.append(grad.name)
coalesce_to_vars_map[coalesce_name] = grad_names

# analyze dependencies
# Record ONLY the last grad generated before the allreduce
# NOTE needs to be updated once multiple calc streams are allowed for backward computation
not_sync_coalesces = []
prior_allreduce_deps = {}
for idx, op in reversed(list(enumerate(block.ops))):
if is_forward_op(op):
break
if is_optimize_op(op):
continue

if is_data_parallel_reduce_op(op):
coalesce_var_name = op.output_arg_names[0]

# NOTE only add extra deps for the fused tensor; other tensors rely on
# the executor's data-flow analysis.
if self.coalesce_prefix in coalesce_var_name:
prior_allreduce_deps[coalesce_var_name] = [
idx,
None,
coalesce_var_name,
]
not_sync_coalesces.append(coalesce_var_name)
continue

for out_name in op.output_arg_names:
var_name = vars_to_coalesce_map.get(out_name, None)
if var_name in not_sync_coalesces:
prior_allreduce_deps[var_name][1] = out_name
not_sync_coalesces.remove(var_name)
assert (
len(not_sync_coalesces) == 0
), "Unexception: {} has NOT been add prior Dep before allreduce.".format(
not_sync_coalesces
)

# Record ONLY the first grad used after the allreduce
# NOTE needs to be updated once multiple calc streams are allowed for backward computation
not_sync_coalesces = []
post_allreduce_deps = {}
for idx, op in enumerate(block.ops):
if is_forward_op(op):
continue

if is_data_parallel_reduce_op(op):
coalesce_var_name = op.input_arg_names[0]
if self.coalesce_prefix in coalesce_var_name:
post_allreduce_deps[coalesce_var_name] = [
None,
coalesce_var_name,
None,
]
not_sync_coalesces.append(coalesce_var_name)
continue

for out_name in op.input_arg_names:
var_name = vars_to_coalesce_map.get(out_name, None)
if var_name in not_sync_coalesces:
post_allreduce_deps[var_name][0] = idx
post_allreduce_deps[var_name][2] = out_name
not_sync_coalesces.remove(var_name)

assert (
len(not_sync_coalesces) == 0
), "Unexception: {} has NOT been add post Dep after allreduce.".format(
not_sync_coalesces
)

# Update program IR: insert dependency ops
dep_var_pairs = []
for deps in [prior_allreduce_deps, post_allreduce_deps]:
for pair in deps.values():
dep_var_pairs.append(pair)

dep_var_pairs.sort(key=lambda x: x[0], reverse=True)
for idx, prior_name, post_name in dep_var_pairs:
prior_var = block.var(prior_name)
post_var = block.var(post_name)
depend_op = insert_dependencies_for_two_vars(
block,
idx,
prior_var,
post_var,
self.dist_context,
OpRole.Backward,
process_mesh=[
-1
],  # hack to avoid initializing the dist attr for the coalesce var
is_recompute=False,
sync=False,
)
depend_op.dist_attr.execution_stream = self.gradient_sync_stream
block._sync_with_cpp()

# remove naive synchronization & assign allreduce stream
def remove_cond(op):
if op.type != "c_wait_compute":
return False
if len(op.input_arg_names) != 0:
return False
if len(op.output_arg_names) != 0:
return False
return True

for idx, op in reversed(list(enumerate(block.ops))):
if is_data_parallel_reduce_op(op):
op._set_attr('use_calc_stream', True)
op.dist_attr.execution_stream = self.gradient_sync_stream

if remove_cond(op):
block._remove_op(idx, sync=False)

block._sync_with_cpp()

def summary(self, grad_groups=[]):
# TODO: add logger module
import logging
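As a closing note, the use_standalone_executor() switch added to utils.py is driven entirely by an environment flag; the check below is an illustrative sketch built from the values that function accepts, not something prescribed by the PR.

import os

# Assumption: the flag has to be set before the auto-parallel passes run so
# that they take the standalone-executor (graph) code paths shown above.
os.environ["FLAGS_CONVERT_GRAPH_TO_PROGRAM"] = "1"

from paddle.distributed.auto_parallel.utils import use_standalone_executor

# use_standalone_executor() returns True for '1', 'True' or 'true'
# (environment values are always strings).
assert use_standalone_executor()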