Skip to content

Commit

Permalink
[GPU] Fuse more eltwises for horizontally fused FC (#26599)
Browse files Browse the repository at this point in the history
### Details:
- Target pattern: FCs that will be fused by the horizontal fusing pass and that
have Add users which can be regarded as bias adds. If we fused the
FCs as-is, the resulting pattern would be fused_fc -> VariadicSplit -> Add, so
the Adds could not be fused into the FCs.
- This PR sets such Add users as the FCs' bias inputs so that the fused
FC can handle them as a fused bias.

### Tickets:
 - CVS-151841
  • Loading branch information
yeonbok authored Sep 19, 2024
1 parent 327f8e2 commit bdc0110
Show file tree
Hide file tree
Showing 2 changed files with 192 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "transformations/utils/utils.hpp"
#include "intel_gpu/op/placeholder.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"

namespace ov {
namespace intel_gpu {
Expand Down Expand Up @@ -40,13 +41,12 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
return std::dynamic_pointer_cast<op::Placeholder>(node);
};
// Three FCs connected to the same input
const int num_fcs_to_fuse = 3;
const int min_num_fcs_to_fuse = 3;
const int max_num_fcs_to_fuse = 3;
const auto& fc = std::dynamic_pointer_cast<op::FullyConnectedCompressed>(output.get_node_shared_ptr());
const auto& input = fc->get_input_node_shared_ptr(0);
if (!fc->get_input_partial_shape(0).is_dynamic())
return false;
if (input->get_users().size() < num_fcs_to_fuse)
return false;
size_t user_fc_count = 0;
int32_t nodes_with_bias = 0;
int32_t nodes_with_zp = 0;
Expand All @@ -67,8 +67,9 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
}
user_fc_count++;
}
return (user_fc_count == num_fcs_to_fuse) && (nodes_with_bias == num_fcs_to_fuse || nodes_with_bias == 0) &&
(nodes_with_zp == num_fcs_to_fuse || nodes_with_zp == 0);
return (user_fc_count >= min_num_fcs_to_fuse) && (user_fc_count <= max_num_fcs_to_fuse) &&
(nodes_with_bias == static_cast<int32_t>(user_fc_count) || nodes_with_bias == 0) &&
(nodes_with_zp == static_cast<int32_t>(user_fc_count) || nodes_with_zp == 0);
};

auto target_fc = wrap_type<op::FullyConnectedCompressed>(is_target_pattern);
Expand All @@ -78,6 +79,7 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
auto m_fc = pattern_map.at(target_fc).get_node_shared_ptr();
auto input_node = m_fc->get_input_node_shared_ptr(0);
std::vector<std::shared_ptr<op::FullyConnectedCompressed>> fc_nodes;
ov::NodeVector fc_nodes_vec;
ov::NodeVector weight_nodes;
ov::NodeVector scale_nodes;
ov::NodeVector bias_nodes;
Expand All @@ -87,6 +89,7 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
if (fc_user) {
OPENVINO_ASSERT(fc_user->inputs().size() >= 4, "Compressed FC should have at least 4 inputs");
fc_nodes.push_back(fc_user);
fc_nodes_vec.push_back(fc_user);
weight_nodes.push_back(fc_user->get_input_node_shared_ptr(1));
if (!std::dynamic_pointer_cast<op::Placeholder>(fc_user->get_input_node_shared_ptr(2)))
bias_nodes.push_back(fc_user->get_input_node_shared_ptr(2));
Expand All @@ -95,33 +98,109 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
zp_nodes.push_back(fc_user->get_input_node_shared_ptr(4));
}
}
auto weight_dtype = fc_nodes[0]->get_input_element_type(1);
auto k_size = fc_nodes[0]->get_input_shape(1)[fc_nodes[0]->get_input_shape(1).size() - 1];
// fc weight is already transposed to [N, K]
const size_t weight_idx = 1;
if (fc_nodes[0]->get_input_shape(weight_idx).size() != 2)
return false;
const size_t n_axis = 0;
const size_t k_axis = 1;
auto weight_dtype = fc_nodes[0]->get_input_element_type(weight_idx);
auto k_size = fc_nodes[0]->get_input_shape(weight_idx)[k_axis];
std::vector<int64_t> orig_n_sizes;
// merge weights, scale, zp
for (auto fc : fc_nodes) {
if (k_size != fc->get_input_shape(1)[fc->get_input_shape(1).size() - 1])
if (k_size != fc->get_input_shape(weight_idx)[k_axis])
return false;
if (weight_dtype != fc->get_input_element_type(1))
if (weight_dtype != fc->get_input_element_type(weight_idx))
return false;
orig_n_sizes.push_back(fc->get_input_shape(1)[fc->get_input_shape(1).size() - 2]);
orig_n_sizes.push_back(fc->get_input_shape(weight_idx)[n_axis]);
}
ov::OutputVector weight_nodes_as_output_vector;
for (size_t i = 0; i < weight_nodes.size(); ++i) {
weight_nodes_as_output_vector.push_back(weight_nodes[i]);
}
auto weight_nodes_as_output_vector = ov::OutputVector{weight_nodes[0], weight_nodes[1], weight_nodes[2]};
auto fused_weight = std::make_shared<ov::op::v0::Concat>(weight_nodes_as_output_vector, 0);
fused_weight->set_friendly_name(weight_nodes[0]->get_friendly_name() + "_fused");
ov::copy_runtime_info({weight_nodes[0], weight_nodes[1], weight_nodes[2]}, fused_weight);
fused_weight->set_friendly_name(weight_nodes[0]->get_friendly_name() + "_fused_weight");
ov::copy_runtime_info(weight_nodes, fused_weight);

ov::OutputVector scales_as_output_vector;
for (size_t i = 0; i < scale_nodes.size(); ++i) {
scales_as_output_vector.push_back(scale_nodes[i]);
}

auto scale_nodes_as_output_vector = ov::OutputVector{scale_nodes[0], scale_nodes[1], scale_nodes[2]};
auto fused_scale = std::make_shared<ov::op::v0::Concat>(scale_nodes_as_output_vector, 0);
fused_scale->set_friendly_name(scale_nodes[0]->get_friendly_name() + "_fused");
ov::copy_runtime_info({scale_nodes[0], scale_nodes[1], scale_nodes[2]}, fused_scale);
auto fused_scale = std::make_shared<ov::op::v0::Concat>(scales_as_output_vector, 0);
fused_scale->set_friendly_name(scale_nodes[0]->get_friendly_name() + "_fused_scale");
ov::copy_runtime_info(scale_nodes, fused_scale);
// check if all of the fc has a bias user, set it as bias input
size_t n_bias_users = 0;
for (auto fc : fc_nodes) {
if (fc->get_users().size() == 1
&& fc->get_users()[0]->get_type_info() == ov::opset1::Add::get_type_info_static()
&& ov::is_type<ov::op::v0::Constant>(fc->get_users()[0]->inputs()[1].get_source_output().get_node())) {
n_bias_users++;
}
}

size_t bias_concat_axis = 0;
if (bias_nodes.empty() && n_bias_users == fc_nodes.size()) {
// Set Add user as bias input to FC
for (size_t i = 0; i < fc_nodes.size(); ++i) {
auto orig_fc = fc_nodes[i];
auto bias_node = orig_fc->get_users()[0];
auto bias_const_ptr = orig_fc->get_users()[0]->get_input_node_shared_ptr(1);
bias_nodes.push_back(bias_const_ptr);
}
// Check shape and find axis
const auto bias_rank = bias_nodes[0]->get_output_partial_shape(0).size();
size_t non_zero_diffs = 0;
for (size_t i = 0; i < bias_rank; ++i) {
std::unordered_set<size_t> dims;
for (size_t j = 0; j < bias_nodes.size(); ++j) {
dims.insert(bias_nodes[j]->get_output_partial_shape(0)[i].get_length());
}
if (dims.size() > 1) {
bias_concat_axis = i;
non_zero_diffs++;
}
}
if (non_zero_diffs <= 1) {
for (size_t i = 0; i < fc_nodes.size(); ++i) {
auto orig_fc = fc_nodes[i];
auto bias_node = orig_fc->get_users()[0];
GPU_DEBUG_TRACE_DETAIL << "Set Add op user " << bias_node->get_friendly_name() << " as the FC "
<< orig_fc->get_friendly_name() << "'s bias input" << std::endl;
auto bias_const = orig_fc->get_users()[0]->input_value(1);
auto orig_users_of_bias_user = bias_node->get_users();
ov::OutputVector fc_inputs = orig_fc->input_values();
fc_inputs[2] = bias_const;
auto new_fc = orig_fc->clone_with_new_inputs(fc_inputs);
new_fc->set_friendly_name(orig_fc->get_friendly_name() + "_with_bias");
ov::copy_runtime_info(orig_fc, new_fc);
for (auto u : orig_users_of_bias_user) {
for (size_t idx = 0; idx < u->inputs().size(); ++idx) {
if (u->get_input_node_shared_ptr(idx) == bias_node) {
u->input(idx).replace_source_output(new_fc->output(0));
}
}
}
fc_nodes[i] = std::dynamic_pointer_cast<op::FullyConnectedCompressed>(new_fc);
bias_node->clear_control_dependencies();
orig_fc->clear_control_dependencies();
}
} else {
// biases cannot be fusable. Not to set users as bias input
bias_nodes.clear();
}
}
std::shared_ptr<ov::Node> fused_bias;
if (bias_nodes.size() == 3) {
auto bias_nodes_as_output_vector = ov::OutputVector{bias_nodes[0], bias_nodes[1], bias_nodes[2]};
fused_bias = std::make_shared<ov::op::v0::Concat>(bias_nodes_as_output_vector, 0);
fused_bias->set_friendly_name(bias_nodes[0]->get_friendly_name() + "_fused");
ov::copy_runtime_info({bias_nodes[0], bias_nodes[1], bias_nodes[2]}, fused_bias);
if (bias_nodes.size() == fc_nodes.size()) {
ov::OutputVector bias_nodes_as_output_vector;
for (size_t i = 0; i < bias_nodes.size(); ++i) {
bias_nodes_as_output_vector.push_back(bias_nodes[i]);
}
fused_bias = std::make_shared<ov::op::v0::Concat>(bias_nodes_as_output_vector, bias_concat_axis);
fused_bias->set_friendly_name(bias_nodes[0]->get_friendly_name() + "_fused_bias");
ov::copy_runtime_info(bias_nodes, fused_bias);
} else {
fused_bias = std::make_shared<op::Placeholder>();
}
Expand Down Expand Up @@ -161,9 +240,12 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
return false;
}
} else {
auto zp_nodes_as_output_vector = ov::OutputVector{zp_nodes[0], zp_nodes[1], zp_nodes[2]};
ov::OutputVector zp_nodes_as_output_vector;
for (size_t i = 0; i < zp_nodes.size(); ++i) {
zp_nodes_as_output_vector.push_back(zp_nodes[i]);
}
fused_zps = std::make_shared<ov::op::v0::Concat>(zp_nodes_as_output_vector, 0);
fused_zps->set_friendly_name(zp_nodes[0]->get_friendly_name() + "_fused");
fused_zps->set_friendly_name(zp_nodes[0]->get_friendly_name() + "_fused_zps");
}
}
// Create new fc with merged weights, bias, scale, zp
Expand All @@ -182,16 +264,17 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
fused_scale,
fc_nodes[0]->get_output_type());

auto new_fc_name = fc_nodes[0]->get_friendly_name() + "_fused";
auto new_fc_name = fc_nodes[0]->get_friendly_name() + "_fused_" + std::to_string(fc_nodes.size()) + "FCs";
new_fc->set_friendly_name(new_fc_name);
copy_runtime_info({fc_nodes[0], fc_nodes[1], fc_nodes[2]}, new_fc);
copy_runtime_info(fc_nodes_vec, new_fc);

// Split output and connect to the orig users
auto split_name = fc_nodes[0]->get_friendly_name() + "_split";
auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {new_fc->get_output_partial_shape(0).size() - 1});
auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, orig_n_sizes);
auto split_size = fc_nodes.size();
auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{split_size}, orig_n_sizes);
auto output_split = std::make_shared<ov::op::v1::VariadicSplit>(new_fc, axis_const, split_const);
copy_runtime_info({fc_nodes[0], fc_nodes[1], fc_nodes[2]}, output_split);
copy_runtime_info(fc_nodes_vec, output_split);
output_split->set_friendly_name(split_name);
for (size_t i = 0; i < fc_nodes.size(); ++i) {
auto org_fc = fc_nodes[i];
Expand All @@ -204,6 +287,7 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
}
org_fc->clear_control_dependencies();
}
GPU_DEBUG_TRACE_DETAIL << "Created a new fused FC " << new_fc_name << std::endl;
return true;
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "openvino/op/concat.hpp"
#include "openvino/op/variadic_split.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/op/add.hpp"
#include "openvino/pass/manager.hpp"

#include <transformations/utils/utils.hpp>
Expand Down Expand Up @@ -162,6 +163,85 @@ TEST_F(TransformationTestsF, FullyConnectedHorizontalFusion_bias_zp) {
comparator.enable(FunctionsComparator::ATTRIBUTES);
}
}

// Verifies that per-FC eltwise Add users (each adding a Constant that acts as a bias)
// are absorbed into the horizontally fused FC as a fused bias input.
// Graph before the pass: three FullyConnectedCompressed ops sharing one input, each
// followed by Add(Constant) -> Reshape. Expected graph after the pass: a single fused
// FC whose bias is the three Add constants concatenated, followed by
// VariadicSplit -> Reshape per original branch.
// NOTE(review): despite "_zp" in the test name, no zero-point inputs are constructed here.
TEST_F(TransformationTestsF, FullyConnectedHorizontalFusion_eltwise_bias_zp) {
// Target shape shared by the trailing Reshape of every branch.
std::vector<int64_t> pattern = {7, -1};
{
// --- Model to be transformed ---
// Dynamic batch, sequence length 7, hidden size (K) 4096.
auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 7, 4096});
// Three u4-compressed weights with distinct output sizes N = 1024/512/128 and shared K = 4096.
auto weight1 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{1024, 4096});
weight1->set_friendly_name("weight1_1");
auto weight2 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{512, 4096});
weight2->set_friendly_name("weight1_2");
auto weight3 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{128, 4096});
weight3->set_friendly_name("weight1_3");

// Placeholder biases: the FCs carry no bias input themselves; the bias comes
// from the downstream Add ops instead.
auto bias1 = std::make_shared<ov::intel_gpu::op::Placeholder>();
auto bias2 = std::make_shared<ov::intel_gpu::op::Placeholder>();
auto bias3 = std::make_shared<ov::intel_gpu::op::Placeholder>();

// Per-FC decompression scales (one scale per group; N x 32 groups).
auto scale1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1024, 32});
auto scale2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{512, 32});
auto scale3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{128, 32});
auto fc1 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight1, bias1, scale1);
fc1->set_friendly_name("fc1");
auto fc2 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight2, bias2, scale2);
auto fc3 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight3, bias3, scale3);

// Each FC's sole user is an Add with a Constant second input — the "bias add"
// pattern the pass is expected to fold into the fused FC's bias.
auto add_input1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 1024});
auto add1 = std::make_shared<ov::op::v1::Add>(fc1, add_input1);

auto add_input2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 512});
auto add2 = std::make_shared<ov::op::v1::Add>(fc2, add_input2);

auto add_input3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 128});
auto add3 = std::make_shared<ov::op::v1::Add>(fc3, add_input3);

auto reshape_pattern = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, pattern);
auto reshape1 = std::make_shared<ov::op::v1::Reshape>(add1, reshape_pattern, true);
auto reshape2 = std::make_shared<ov::op::v1::Reshape>(add2, reshape_pattern, true);
auto reshape3 = std::make_shared<ov::op::v1::Reshape>(add3, reshape_pattern, true);
auto result1 = std::make_shared<ov::op::v0::Result>(reshape1);
auto result2 = std::make_shared<ov::op::v0::Result>(reshape2);
auto result3 = std::make_shared<ov::op::v0::Result>(reshape3);
model = std::make_shared<ov::Model>(ov::ResultVector{result1, result2, result3}, ov::ParameterVector{input});
manager.register_pass<FullyConnectedHorizontalFusion>();
}
{
// --- Reference (expected) model after fusion ---
auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 7, 4096});
auto weight1 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{1024, 4096});
weight1->set_friendly_name("weight2_1");
auto weight2 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{512, 4096});
weight2->set_friendly_name("weight2_2");
auto weight3 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{128, 4096});
weight3->set_friendly_name("weight2_3");
// Weights are concatenated along the N axis (axis 0 of the [N, K] layout).
auto weights = ov::OutputVector{weight1, weight2, weight3};
auto weight_fused = std::make_shared<ov::op::v0::Concat>(weights, 0);
// Biases come from the original Add constants; their shapes ({1, N}) differ
// only at axis 1, so the expected bias concat axis is 1.
auto bias1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 1024});
auto bias2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 512});
auto bias3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 128});
auto biases = ov::OutputVector{bias1, bias2, bias3};
auto bias_fused = std::make_shared<ov::op::v0::Concat>(biases, 1);
// Scales concatenate along N (axis 0), mirroring the weight concat.
auto scale1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1024, 32});
auto scale2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{512, 32});
auto scale3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{128, 32});
auto scales = ov::OutputVector{scale1, scale2, scale3};
auto scale_fused = std::make_shared<ov::op::v0::Concat>(scales, 0);
auto fc_fused = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight_fused, bias_fused, scale_fused);
// Split the fused output back into the three original branches along the last
// (feature) axis, with chunk sizes equal to the original N sizes.
auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {fc_fused->get_output_partial_shape(0).size() - 1});
std::vector<int64_t> orig_n_sizes = {1024, 512, 128};
auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, orig_n_sizes);
auto split = std::make_shared<ov::op::v1::VariadicSplit>(fc_fused, axis_const, split_const);
auto reshape_pattern = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, pattern);
auto reshape1 = std::make_shared<ov::op::v1::Reshape>(split->output(0), reshape_pattern, true);
auto reshape2 = std::make_shared<ov::op::v1::Reshape>(split->output(1), reshape_pattern, true);
auto reshape3 = std::make_shared<ov::op::v1::Reshape>(split->output(2), reshape_pattern, true);
auto result1 = std::make_shared<ov::op::v0::Result>(reshape1);
auto result2 = std::make_shared<ov::op::v0::Result>(reshape2);
auto result3 = std::make_shared<ov::op::v0::Result>(reshape3);
model_ref = std::make_shared<ov::Model>(ov::ResultVector{result1, result2, result3}, ov::ParameterVector{input});
// Also compare node attributes, not just graph topology.
comparator.enable(FunctionsComparator::ATTRIBUTES);
}
}
} // namespace intel_gpu
} // namespace test
} // namespace ov

0 comments on commit bdc0110

Please sign in to comment.