Skip to content

Commit

Permalink
[GPU] Fuse more eltwises for horizontally fused FC (#26599)
Browse files Browse the repository at this point in the history
### Details:
- Target pattern: FCs that will be fused by the horizontal fusing pass and that
have Add users which can be regarded as bias adds. If we fused the
FCs as-is, the resulting pattern would be fused_fc -> VariadicSplit -> Add, so
the Adds could not be fused into the FCs.
- This PR sets such Add users as the FCs' bias inputs so that the fused
FC can handle them as a fused bias.

### Tickets:
 - CVS-151841
  • Loading branch information
yeonbok authored Sep 19, 2024
1 parent 327f8e2 commit bdc0110
Show file tree
Hide file tree
Showing 2 changed files with 192 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "transformations/utils/utils.hpp"
#include "intel_gpu/op/placeholder.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"

namespace ov {
namespace intel_gpu {
Expand Down Expand Up @@ -40,13 +41,12 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
return std::dynamic_pointer_cast<op::Placeholder>(node);
};
// Three FCs connected to the same input
const int num_fcs_to_fuse = 3;
const int min_num_fcs_to_fuse = 3;
const int max_num_fcs_to_fuse = 3;
const auto& fc = std::dynamic_pointer_cast<op::FullyConnectedCompressed>(output.get_node_shared_ptr());
const auto& input = fc->get_input_node_shared_ptr(0);
if (!fc->get_input_partial_shape(0).is_dynamic())
return false;
if (input->get_users().size() < num_fcs_to_fuse)
return false;
size_t user_fc_count = 0;
int32_t nodes_with_bias = 0;
int32_t nodes_with_zp = 0;
Expand All @@ -67,8 +67,9 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
}
user_fc_count++;
}
return (user_fc_count == num_fcs_to_fuse) && (nodes_with_bias == num_fcs_to_fuse || nodes_with_bias == 0) &&
(nodes_with_zp == num_fcs_to_fuse || nodes_with_zp == 0);
return (user_fc_count >= min_num_fcs_to_fuse) && (user_fc_count <= max_num_fcs_to_fuse) &&
(nodes_with_bias == static_cast<int32_t>(user_fc_count) || nodes_with_bias == 0) &&
(nodes_with_zp == static_cast<int32_t>(user_fc_count) || nodes_with_zp == 0);
};

auto target_fc = wrap_type<op::FullyConnectedCompressed>(is_target_pattern);
Expand All @@ -78,6 +79,7 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
auto m_fc = pattern_map.at(target_fc).get_node_shared_ptr();
auto input_node = m_fc->get_input_node_shared_ptr(0);
std::vector<std::shared_ptr<op::FullyConnectedCompressed>> fc_nodes;
ov::NodeVector fc_nodes_vec;
ov::NodeVector weight_nodes;
ov::NodeVector scale_nodes;
ov::NodeVector bias_nodes;
Expand All @@ -87,6 +89,7 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
if (fc_user) {
OPENVINO_ASSERT(fc_user->inputs().size() >= 4, "Compressed FC should have at least 4 inputs");
fc_nodes.push_back(fc_user);
fc_nodes_vec.push_back(fc_user);
weight_nodes.push_back(fc_user->get_input_node_shared_ptr(1));
if (!std::dynamic_pointer_cast<op::Placeholder>(fc_user->get_input_node_shared_ptr(2)))
bias_nodes.push_back(fc_user->get_input_node_shared_ptr(2));
Expand All @@ -95,33 +98,109 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
zp_nodes.push_back(fc_user->get_input_node_shared_ptr(4));
}
}
auto weight_dtype = fc_nodes[0]->get_input_element_type(1);
auto k_size = fc_nodes[0]->get_input_shape(1)[fc_nodes[0]->get_input_shape(1).size() - 1];
// fc weight is already transposed to [N, K]
const size_t weight_idx = 1;
if (fc_nodes[0]->get_input_shape(weight_idx).size() != 2)
return false;
const size_t n_axis = 0;
const size_t k_axis = 1;
auto weight_dtype = fc_nodes[0]->get_input_element_type(weight_idx);
auto k_size = fc_nodes[0]->get_input_shape(weight_idx)[k_axis];
std::vector<int64_t> orig_n_sizes;
// merge weights, scale, zp
for (auto fc : fc_nodes) {
if (k_size != fc->get_input_shape(1)[fc->get_input_shape(1).size() - 1])
if (k_size != fc->get_input_shape(weight_idx)[k_axis])
return false;
if (weight_dtype != fc->get_input_element_type(1))
if (weight_dtype != fc->get_input_element_type(weight_idx))
return false;
orig_n_sizes.push_back(fc->get_input_shape(1)[fc->get_input_shape(1).size() - 2]);
orig_n_sizes.push_back(fc->get_input_shape(weight_idx)[n_axis]);
}
ov::OutputVector weight_nodes_as_output_vector;
for (size_t i = 0; i < weight_nodes.size(); ++i) {
weight_nodes_as_output_vector.push_back(weight_nodes[i]);
}
auto weight_nodes_as_output_vector = ov::OutputVector{weight_nodes[0], weight_nodes[1], weight_nodes[2]};
auto fused_weight = std::make_shared<ov::op::v0::Concat>(weight_nodes_as_output_vector, 0);
fused_weight->set_friendly_name(weight_nodes[0]->get_friendly_name() + "_fused");
ov::copy_runtime_info({weight_nodes[0], weight_nodes[1], weight_nodes[2]}, fused_weight);
fused_weight->set_friendly_name(weight_nodes[0]->get_friendly_name() + "_fused_weight");
ov::copy_runtime_info(weight_nodes, fused_weight);

ov::OutputVector scales_as_output_vector;
for (size_t i = 0; i < scale_nodes.size(); ++i) {
scales_as_output_vector.push_back(scale_nodes[i]);
}

auto scale_nodes_as_output_vector = ov::OutputVector{scale_nodes[0], scale_nodes[1], scale_nodes[2]};
auto fused_scale = std::make_shared<ov::op::v0::Concat>(scale_nodes_as_output_vector, 0);
fused_scale->set_friendly_name(scale_nodes[0]->get_friendly_name() + "_fused");
ov::copy_runtime_info({scale_nodes[0], scale_nodes[1], scale_nodes[2]}, fused_scale);
auto fused_scale = std::make_shared<ov::op::v0::Concat>(scales_as_output_vector, 0);
fused_scale->set_friendly_name(scale_nodes[0]->get_friendly_name() + "_fused_scale");
ov::copy_runtime_info(scale_nodes, fused_scale);
// check if all of the fc has a bias user, set it as bias input
size_t n_bias_users = 0;
for (auto fc : fc_nodes) {
if (fc->get_users().size() == 1
&& fc->get_users()[0]->get_type_info() == ov::opset1::Add::get_type_info_static()
&& ov::is_type<ov::op::v0::Constant>(fc->get_users()[0]->inputs()[1].get_source_output().get_node())) {
n_bias_users++;
}
}

size_t bias_concat_axis = 0;
if (bias_nodes.empty() && n_bias_users == fc_nodes.size()) {
// Set Add user as bias input to FC
for (size_t i = 0; i < fc_nodes.size(); ++i) {
auto orig_fc = fc_nodes[i];
auto bias_node = orig_fc->get_users()[0];
auto bias_const_ptr = orig_fc->get_users()[0]->get_input_node_shared_ptr(1);
bias_nodes.push_back(bias_const_ptr);
}
// Check shape and find axis
const auto bias_rank = bias_nodes[0]->get_output_partial_shape(0).size();
size_t non_zero_diffs = 0;
for (size_t i = 0; i < bias_rank; ++i) {
std::unordered_set<size_t> dims;
for (size_t j = 0; j < bias_nodes.size(); ++j) {
dims.insert(bias_nodes[j]->get_output_partial_shape(0)[i].get_length());
}
if (dims.size() > 1) {
bias_concat_axis = i;
non_zero_diffs++;
}
}
if (non_zero_diffs <= 1) {
for (size_t i = 0; i < fc_nodes.size(); ++i) {
auto orig_fc = fc_nodes[i];
auto bias_node = orig_fc->get_users()[0];
GPU_DEBUG_TRACE_DETAIL << "Set Add op user " << bias_node->get_friendly_name() << " as the FC "
<< orig_fc->get_friendly_name() << "'s bias input" << std::endl;
auto bias_const = orig_fc->get_users()[0]->input_value(1);
auto orig_users_of_bias_user = bias_node->get_users();
ov::OutputVector fc_inputs = orig_fc->input_values();
fc_inputs[2] = bias_const;
auto new_fc = orig_fc->clone_with_new_inputs(fc_inputs);
new_fc->set_friendly_name(orig_fc->get_friendly_name() + "_with_bias");
ov::copy_runtime_info(orig_fc, new_fc);
for (auto u : orig_users_of_bias_user) {
for (size_t idx = 0; idx < u->inputs().size(); ++idx) {
if (u->get_input_node_shared_ptr(idx) == bias_node) {
u->input(idx).replace_source_output(new_fc->output(0));
}
}
}
fc_nodes[i] = std::dynamic_pointer_cast<op::FullyConnectedCompressed>(new_fc);
bias_node->clear_control_dependencies();
orig_fc->clear_control_dependencies();
}
} else {
// biases cannot be fusable. Not to set users as bias input
bias_nodes.clear();
}
}
std::shared_ptr<ov::Node> fused_bias;
if (bias_nodes.size() == 3) {
auto bias_nodes_as_output_vector = ov::OutputVector{bias_nodes[0], bias_nodes[1], bias_nodes[2]};
fused_bias = std::make_shared<ov::op::v0::Concat>(bias_nodes_as_output_vector, 0);
fused_bias->set_friendly_name(bias_nodes[0]->get_friendly_name() + "_fused");
ov::copy_runtime_info({bias_nodes[0], bias_nodes[1], bias_nodes[2]}, fused_bias);
if (bias_nodes.size() == fc_nodes.size()) {
ov::OutputVector bias_nodes_as_output_vector;
for (size_t i = 0; i < bias_nodes.size(); ++i) {
bias_nodes_as_output_vector.push_back(bias_nodes[i]);
}
fused_bias = std::make_shared<ov::op::v0::Concat>(bias_nodes_as_output_vector, bias_concat_axis);
fused_bias->set_friendly_name(bias_nodes[0]->get_friendly_name() + "_fused_bias");
ov::copy_runtime_info(bias_nodes, fused_bias);
} else {
fused_bias = std::make_shared<op::Placeholder>();
}
Expand Down Expand Up @@ -161,9 +240,12 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
return false;
}
} else {
auto zp_nodes_as_output_vector = ov::OutputVector{zp_nodes[0], zp_nodes[1], zp_nodes[2]};
ov::OutputVector zp_nodes_as_output_vector;
for (size_t i = 0; i < zp_nodes.size(); ++i) {
zp_nodes_as_output_vector.push_back(zp_nodes[i]);
}
fused_zps = std::make_shared<ov::op::v0::Concat>(zp_nodes_as_output_vector, 0);
fused_zps->set_friendly_name(zp_nodes[0]->get_friendly_name() + "_fused");
fused_zps->set_friendly_name(zp_nodes[0]->get_friendly_name() + "_fused_zps");
}
}
// Create new fc with merged weights, bias, scale, zp
Expand All @@ -182,16 +264,17 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
fused_scale,
fc_nodes[0]->get_output_type());

auto new_fc_name = fc_nodes[0]->get_friendly_name() + "_fused";
auto new_fc_name = fc_nodes[0]->get_friendly_name() + "_fused_" + std::to_string(fc_nodes.size()) + "FCs";
new_fc->set_friendly_name(new_fc_name);
copy_runtime_info({fc_nodes[0], fc_nodes[1], fc_nodes[2]}, new_fc);
copy_runtime_info(fc_nodes_vec, new_fc);

// Split output and connect to the orig users
auto split_name = fc_nodes[0]->get_friendly_name() + "_split";
auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {new_fc->get_output_partial_shape(0).size() - 1});
auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, orig_n_sizes);
auto split_size = fc_nodes.size();
auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{split_size}, orig_n_sizes);
auto output_split = std::make_shared<ov::op::v1::VariadicSplit>(new_fc, axis_const, split_const);
copy_runtime_info({fc_nodes[0], fc_nodes[1], fc_nodes[2]}, output_split);
copy_runtime_info(fc_nodes_vec, output_split);
output_split->set_friendly_name(split_name);
for (size_t i = 0; i < fc_nodes.size(); ++i) {
auto org_fc = fc_nodes[i];
Expand All @@ -204,6 +287,7 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
}
org_fc->clear_control_dependencies();
}
GPU_DEBUG_TRACE_DETAIL << "Created a new fused FC " << new_fc_name << std::endl;
return true;
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "openvino/op/concat.hpp"
#include "openvino/op/variadic_split.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/op/add.hpp"
#include "openvino/pass/manager.hpp"

#include <transformations/utils/utils.hpp>
Expand Down Expand Up @@ -162,6 +163,85 @@ TEST_F(TransformationTestsF, FullyConnectedHorizontalFusion_bias_zp) {
comparator.enable(FunctionsComparator::ATTRIBUTES);
}
}

// Verifies that per-FC eltwise Add users (each adding a Constant that acts as a bias)
// are absorbed into the horizontally fused FC as a fused bias input.
// Graph before the pass: three FullyConnectedCompressed ops sharing one input, each
// followed by Add(Constant) -> Reshape. Expected graph after the pass: a single fused
// FC whose bias is the three Add constants concatenated, followed by
// VariadicSplit -> Reshape per original branch.
// NOTE(review): despite "_zp" in the test name, no zero-point inputs are constructed here.
TEST_F(TransformationTestsF, FullyConnectedHorizontalFusion_eltwise_bias_zp) {
// Target shape shared by the trailing Reshape of every branch.
std::vector<int64_t> pattern = {7, -1};
{
// --- Model to be transformed ---
// Dynamic batch, sequence length 7, hidden size (K) 4096.
auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 7, 4096});
// Three u4-compressed weights with distinct output sizes N = 1024/512/128 and shared K = 4096.
auto weight1 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{1024, 4096});
weight1->set_friendly_name("weight1_1");
auto weight2 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{512, 4096});
weight2->set_friendly_name("weight1_2");
auto weight3 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{128, 4096});
weight3->set_friendly_name("weight1_3");

// Placeholder biases: the FCs carry no bias input themselves; the bias comes
// from the downstream Add ops instead.
auto bias1 = std::make_shared<ov::intel_gpu::op::Placeholder>();
auto bias2 = std::make_shared<ov::intel_gpu::op::Placeholder>();
auto bias3 = std::make_shared<ov::intel_gpu::op::Placeholder>();

// Per-FC decompression scales (one scale per group; N x 32 groups).
auto scale1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1024, 32});
auto scale2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{512, 32});
auto scale3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{128, 32});
auto fc1 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight1, bias1, scale1);
fc1->set_friendly_name("fc1");
auto fc2 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight2, bias2, scale2);
auto fc3 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight3, bias3, scale3);

// Each FC's sole user is an Add with a Constant second input — the "bias add"
// pattern the pass is expected to fold into the fused FC's bias.
auto add_input1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 1024});
auto add1 = std::make_shared<ov::op::v1::Add>(fc1, add_input1);

auto add_input2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 512});
auto add2 = std::make_shared<ov::op::v1::Add>(fc2, add_input2);

auto add_input3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 128});
auto add3 = std::make_shared<ov::op::v1::Add>(fc3, add_input3);

auto reshape_pattern = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, pattern);
auto reshape1 = std::make_shared<ov::op::v1::Reshape>(add1, reshape_pattern, true);
auto reshape2 = std::make_shared<ov::op::v1::Reshape>(add2, reshape_pattern, true);
auto reshape3 = std::make_shared<ov::op::v1::Reshape>(add3, reshape_pattern, true);
auto result1 = std::make_shared<ov::op::v0::Result>(reshape1);
auto result2 = std::make_shared<ov::op::v0::Result>(reshape2);
auto result3 = std::make_shared<ov::op::v0::Result>(reshape3);
model = std::make_shared<ov::Model>(ov::ResultVector{result1, result2, result3}, ov::ParameterVector{input});
manager.register_pass<FullyConnectedHorizontalFusion>();
}
{
// --- Reference (expected) model after fusion ---
auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 7, 4096});
auto weight1 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{1024, 4096});
weight1->set_friendly_name("weight2_1");
auto weight2 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{512, 4096});
weight2->set_friendly_name("weight2_2");
auto weight3 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{128, 4096});
weight3->set_friendly_name("weight2_3");
// Weights are concatenated along the N axis (axis 0 of the [N, K] layout).
auto weights = ov::OutputVector{weight1, weight2, weight3};
auto weight_fused = std::make_shared<ov::op::v0::Concat>(weights, 0);
// Biases come from the original Add constants; their shapes ({1, N}) differ
// only at axis 1, so the expected bias concat axis is 1.
auto bias1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 1024});
auto bias2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 512});
auto bias3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 128});
auto biases = ov::OutputVector{bias1, bias2, bias3};
auto bias_fused = std::make_shared<ov::op::v0::Concat>(biases, 1);
// Scales concatenate along N (axis 0), mirroring the weight concat.
auto scale1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1024, 32});
auto scale2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{512, 32});
auto scale3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{128, 32});
auto scales = ov::OutputVector{scale1, scale2, scale3};
auto scale_fused = std::make_shared<ov::op::v0::Concat>(scales, 0);
auto fc_fused = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight_fused, bias_fused, scale_fused);
// Split the fused output back into the three original branches along the last
// (feature) axis, with chunk sizes equal to the original N sizes.
auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {fc_fused->get_output_partial_shape(0).size() - 1});
std::vector<int64_t> orig_n_sizes = {1024, 512, 128};
auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, orig_n_sizes);
auto split = std::make_shared<ov::op::v1::VariadicSplit>(fc_fused, axis_const, split_const);
auto reshape_pattern = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, pattern);
auto reshape1 = std::make_shared<ov::op::v1::Reshape>(split->output(0), reshape_pattern, true);
auto reshape2 = std::make_shared<ov::op::v1::Reshape>(split->output(1), reshape_pattern, true);
auto reshape3 = std::make_shared<ov::op::v1::Reshape>(split->output(2), reshape_pattern, true);
auto result1 = std::make_shared<ov::op::v0::Result>(reshape1);
auto result2 = std::make_shared<ov::op::v0::Result>(reshape2);
auto result3 = std::make_shared<ov::op::v0::Result>(reshape3);
model_ref = std::make_shared<ov::Model>(ov::ResultVector{result1, result2, result3}, ov::ParameterVector{input});
// Also compare node attributes, not just graph topology.
comparator.enable(FunctionsComparator::ATTRIBUTES);
}
}
} // namespace intel_gpu
} // namespace test
} // namespace ov

0 comments on commit bdc0110

Please sign in to comment.