Skip to content

Commit

Permalink
Revert "[GPU] Fuse more eltwises for horizontally fused FC (openvinotoolkit#26599)"
Browse files Browse the repository at this point in the history

This reverts commit bdc0110.
  • Loading branch information
songbell committed Sep 26, 2024
1 parent e35479b commit 3a86765
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 192 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "transformations/utils/utils.hpp"
#include "intel_gpu/op/placeholder.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"

namespace ov {
namespace intel_gpu {
Expand Down Expand Up @@ -41,12 +40,13 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
return std::dynamic_pointer_cast<op::Placeholder>(node);
};
// Three FCs connected to the same input
const int min_num_fcs_to_fuse = 3;
const int max_num_fcs_to_fuse = 3;
const int num_fcs_to_fuse = 3;
const auto& fc = std::dynamic_pointer_cast<op::FullyConnectedCompressed>(output.get_node_shared_ptr());
const auto& input = fc->get_input_node_shared_ptr(0);
if (!fc->get_input_partial_shape(0).is_dynamic())
return false;
if (input->get_users().size() < num_fcs_to_fuse)
return false;
size_t user_fc_count = 0;
int32_t nodes_with_bias = 0;
int32_t nodes_with_zp = 0;
Expand All @@ -67,9 +67,8 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
}
user_fc_count++;
}
return (user_fc_count >= min_num_fcs_to_fuse) && (user_fc_count <= max_num_fcs_to_fuse) &&
(nodes_with_bias == static_cast<int32_t>(user_fc_count) || nodes_with_bias == 0) &&
(nodes_with_zp == static_cast<int32_t>(user_fc_count) || nodes_with_zp == 0);
return (user_fc_count == num_fcs_to_fuse) && (nodes_with_bias == num_fcs_to_fuse || nodes_with_bias == 0) &&
(nodes_with_zp == num_fcs_to_fuse || nodes_with_zp == 0);
};

auto target_fc = wrap_type<op::FullyConnectedCompressed>(is_target_pattern);
Expand All @@ -79,7 +78,6 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
auto m_fc = pattern_map.at(target_fc).get_node_shared_ptr();
auto input_node = m_fc->get_input_node_shared_ptr(0);
std::vector<std::shared_ptr<op::FullyConnectedCompressed>> fc_nodes;
ov::NodeVector fc_nodes_vec;
ov::NodeVector weight_nodes;
ov::NodeVector scale_nodes;
ov::NodeVector bias_nodes;
Expand All @@ -89,7 +87,6 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
if (fc_user) {
OPENVINO_ASSERT(fc_user->inputs().size() >= 4, "Compressed FC should have at least 4 inputs");
fc_nodes.push_back(fc_user);
fc_nodes_vec.push_back(fc_user);
weight_nodes.push_back(fc_user->get_input_node_shared_ptr(1));
if (!std::dynamic_pointer_cast<op::Placeholder>(fc_user->get_input_node_shared_ptr(2)))
bias_nodes.push_back(fc_user->get_input_node_shared_ptr(2));
Expand All @@ -98,109 +95,33 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
zp_nodes.push_back(fc_user->get_input_node_shared_ptr(4));
}
}
// fc weight is already transposed to [N, K]
const size_t weight_idx = 1;
if (fc_nodes[0]->get_input_shape(weight_idx).size() != 2)
return false;
const size_t n_axis = 0;
const size_t k_axis = 1;
auto weight_dtype = fc_nodes[0]->get_input_element_type(weight_idx);
auto k_size = fc_nodes[0]->get_input_shape(weight_idx)[k_axis];
auto weight_dtype = fc_nodes[0]->get_input_element_type(1);
auto k_size = fc_nodes[0]->get_input_shape(1)[fc_nodes[0]->get_input_shape(1).size() - 1];
std::vector<int64_t> orig_n_sizes;
// merge weights, scale, zp
for (auto fc : fc_nodes) {
if (k_size != fc->get_input_shape(weight_idx)[k_axis])
if (k_size != fc->get_input_shape(1)[fc->get_input_shape(1).size() - 1])
return false;
if (weight_dtype != fc->get_input_element_type(weight_idx))
if (weight_dtype != fc->get_input_element_type(1))
return false;
orig_n_sizes.push_back(fc->get_input_shape(weight_idx)[n_axis]);
}
ov::OutputVector weight_nodes_as_output_vector;
for (size_t i = 0; i < weight_nodes.size(); ++i) {
weight_nodes_as_output_vector.push_back(weight_nodes[i]);
orig_n_sizes.push_back(fc->get_input_shape(1)[fc->get_input_shape(1).size() - 2]);
}
auto weight_nodes_as_output_vector = ov::OutputVector{weight_nodes[0], weight_nodes[1], weight_nodes[2]};
auto fused_weight = std::make_shared<ov::op::v0::Concat>(weight_nodes_as_output_vector, 0);
fused_weight->set_friendly_name(weight_nodes[0]->get_friendly_name() + "_fused_weight");
ov::copy_runtime_info(weight_nodes, fused_weight);

ov::OutputVector scales_as_output_vector;
for (size_t i = 0; i < scale_nodes.size(); ++i) {
scales_as_output_vector.push_back(scale_nodes[i]);
}
fused_weight->set_friendly_name(weight_nodes[0]->get_friendly_name() + "_fused");
ov::copy_runtime_info({weight_nodes[0], weight_nodes[1], weight_nodes[2]}, fused_weight);

auto fused_scale = std::make_shared<ov::op::v0::Concat>(scales_as_output_vector, 0);
fused_scale->set_friendly_name(scale_nodes[0]->get_friendly_name() + "_fused_scale");
ov::copy_runtime_info(scale_nodes, fused_scale);
// Check whether every FC has a single Add user with a constant operand; if so, that operand can be used as the FC's bias input
size_t n_bias_users = 0;
for (auto fc : fc_nodes) {
if (fc->get_users().size() == 1
&& fc->get_users()[0]->get_type_info() == ov::opset1::Add::get_type_info_static()
&& ov::is_type<ov::op::v0::Constant>(fc->get_users()[0]->inputs()[1].get_source_output().get_node())) {
n_bias_users++;
}
}
auto scale_nodes_as_output_vector = ov::OutputVector{scale_nodes[0], scale_nodes[1], scale_nodes[2]};
auto fused_scale = std::make_shared<ov::op::v0::Concat>(scale_nodes_as_output_vector, 0);
fused_scale->set_friendly_name(scale_nodes[0]->get_friendly_name() + "_fused");
ov::copy_runtime_info({scale_nodes[0], scale_nodes[1], scale_nodes[2]}, fused_scale);

size_t bias_concat_axis = 0;
if (bias_nodes.empty() && n_bias_users == fc_nodes.size()) {
// Set Add user as bias input to FC
for (size_t i = 0; i < fc_nodes.size(); ++i) {
auto orig_fc = fc_nodes[i];
auto bias_node = orig_fc->get_users()[0];
auto bias_const_ptr = orig_fc->get_users()[0]->get_input_node_shared_ptr(1);
bias_nodes.push_back(bias_const_ptr);
}
// Check shape and find axis
const auto bias_rank = bias_nodes[0]->get_output_partial_shape(0).size();
size_t non_zero_diffs = 0;
for (size_t i = 0; i < bias_rank; ++i) {
std::unordered_set<size_t> dims;
for (size_t j = 0; j < bias_nodes.size(); ++j) {
dims.insert(bias_nodes[j]->get_output_partial_shape(0)[i].get_length());
}
if (dims.size() > 1) {
bias_concat_axis = i;
non_zero_diffs++;
}
}
if (non_zero_diffs <= 1) {
for (size_t i = 0; i < fc_nodes.size(); ++i) {
auto orig_fc = fc_nodes[i];
auto bias_node = orig_fc->get_users()[0];
GPU_DEBUG_TRACE_DETAIL << "Set Add op user " << bias_node->get_friendly_name() << " as the FC "
<< orig_fc->get_friendly_name() << "'s bias input" << std::endl;
auto bias_const = orig_fc->get_users()[0]->input_value(1);
auto orig_users_of_bias_user = bias_node->get_users();
ov::OutputVector fc_inputs = orig_fc->input_values();
fc_inputs[2] = bias_const;
auto new_fc = orig_fc->clone_with_new_inputs(fc_inputs);
new_fc->set_friendly_name(orig_fc->get_friendly_name() + "_with_bias");
ov::copy_runtime_info(orig_fc, new_fc);
for (auto u : orig_users_of_bias_user) {
for (size_t idx = 0; idx < u->inputs().size(); ++idx) {
if (u->get_input_node_shared_ptr(idx) == bias_node) {
u->input(idx).replace_source_output(new_fc->output(0));
}
}
}
fc_nodes[i] = std::dynamic_pointer_cast<op::FullyConnectedCompressed>(new_fc);
bias_node->clear_control_dependencies();
orig_fc->clear_control_dependencies();
}
} else {
// The biases cannot be fused (their shapes differ on more than one axis), so do not set the Add users as bias inputs
bias_nodes.clear();
}
}
std::shared_ptr<ov::Node> fused_bias;
if (bias_nodes.size() == fc_nodes.size()) {
ov::OutputVector bias_nodes_as_output_vector;
for (size_t i = 0; i < bias_nodes.size(); ++i) {
bias_nodes_as_output_vector.push_back(bias_nodes[i]);
}
fused_bias = std::make_shared<ov::op::v0::Concat>(bias_nodes_as_output_vector, bias_concat_axis);
fused_bias->set_friendly_name(bias_nodes[0]->get_friendly_name() + "_fused_bias");
ov::copy_runtime_info(bias_nodes, fused_bias);
if (bias_nodes.size() == 3) {
auto bias_nodes_as_output_vector = ov::OutputVector{bias_nodes[0], bias_nodes[1], bias_nodes[2]};
fused_bias = std::make_shared<ov::op::v0::Concat>(bias_nodes_as_output_vector, 0);
fused_bias->set_friendly_name(bias_nodes[0]->get_friendly_name() + "_fused");
ov::copy_runtime_info({bias_nodes[0], bias_nodes[1], bias_nodes[2]}, fused_bias);
} else {
fused_bias = std::make_shared<op::Placeholder>();
}
Expand Down Expand Up @@ -240,12 +161,9 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
return false;
}
} else {
ov::OutputVector zp_nodes_as_output_vector;
for (size_t i = 0; i < zp_nodes.size(); ++i) {
zp_nodes_as_output_vector.push_back(zp_nodes[i]);
}
auto zp_nodes_as_output_vector = ov::OutputVector{zp_nodes[0], zp_nodes[1], zp_nodes[2]};
fused_zps = std::make_shared<ov::op::v0::Concat>(zp_nodes_as_output_vector, 0);
fused_zps->set_friendly_name(zp_nodes[0]->get_friendly_name() + "_fused_zps");
fused_zps->set_friendly_name(zp_nodes[0]->get_friendly_name() + "_fused");
}
}
// Create new fc with merged weights, bias, scale, zp
Expand All @@ -264,17 +182,16 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
fused_scale,
fc_nodes[0]->get_output_type());

auto new_fc_name = fc_nodes[0]->get_friendly_name() + "_fused_" + std::to_string(fc_nodes.size()) + "FCs";
auto new_fc_name = fc_nodes[0]->get_friendly_name() + "_fused";
new_fc->set_friendly_name(new_fc_name);
copy_runtime_info(fc_nodes_vec, new_fc);
copy_runtime_info({fc_nodes[0], fc_nodes[1], fc_nodes[2]}, new_fc);

// Split output and connect to the orig users
auto split_name = fc_nodes[0]->get_friendly_name() + "_split";
auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {new_fc->get_output_partial_shape(0).size() - 1});
auto split_size = fc_nodes.size();
auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{split_size}, orig_n_sizes);
auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, orig_n_sizes);
auto output_split = std::make_shared<ov::op::v1::VariadicSplit>(new_fc, axis_const, split_const);
copy_runtime_info(fc_nodes_vec, output_split);
copy_runtime_info({fc_nodes[0], fc_nodes[1], fc_nodes[2]}, output_split);
output_split->set_friendly_name(split_name);
for (size_t i = 0; i < fc_nodes.size(); ++i) {
auto org_fc = fc_nodes[i];
Expand All @@ -287,7 +204,6 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() {
}
org_fc->clear_control_dependencies();
}
GPU_DEBUG_TRACE_DETAIL << "Created a new fused FC " << new_fc_name << std::endl;
return true;
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#include "openvino/op/concat.hpp"
#include "openvino/op/variadic_split.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/op/add.hpp"
#include "openvino/pass/manager.hpp"

#include <transformations/utils/utils.hpp>
Expand Down Expand Up @@ -163,85 +162,6 @@ TEST_F(TransformationTestsF, FullyConnectedHorizontalFusion_bias_zp) {
comparator.enable(FunctionsComparator::ATTRIBUTES);
}
}

// Builds two graphs for FullyConnectedHorizontalFusion:
//   * `model`     - three compressed FCs sharing one input, each followed by an Add with a
//                   constant second operand and a Reshape;
//   * `model_ref` - one compressed FC over concatenated weights/biases/scales, followed by a
//                   VariadicSplit that feeds the three Reshapes.
// NOTE(review): presumably the TransformationTestsF fixture runs the registered pass on `model`
// and compares the result against `model_ref` - confirm against the fixture.
TEST_F(TransformationTestsF, FullyConnectedHorizontalFusion_eltwise_bias_zp) {
// Target shape shared by all Reshape nodes in both graphs.
std::vector<int64_t> pattern = {7, -1};
{
// Input graph: the first dimension of the activation is dynamic.
auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 7, 4096});
// u4 weights; the second dimension (4096) is the same for all three, the first dimension differs.
auto weight1 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{1024, 4096});
weight1->set_friendly_name("weight1_1");
auto weight2 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{512, 4096});
weight2->set_friendly_name("weight1_2");
auto weight3 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{128, 4096});
weight3->set_friendly_name("weight1_3");

// Placeholder nodes occupy the bias input of each FC; the constant addends are attached via the Add nodes below.
auto bias1 = std::make_shared<ov::intel_gpu::op::Placeholder>();
auto bias2 = std::make_shared<ov::intel_gpu::op::Placeholder>();
auto bias3 = std::make_shared<ov::intel_gpu::op::Placeholder>();

// Scales: first dimension matches the corresponding weight's first dimension.
auto scale1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1024, 32});
auto scale2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{512, 32});
auto scale3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{128, 32});
auto fc1 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight1, bias1, scale1);
fc1->set_friendly_name("fc1");
auto fc2 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight2, bias2, scale2);
auto fc3 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight3, bias3, scale3);

// Each FC has exactly one user: an Add whose second input is a {1, N} constant.
auto add_input1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 1024});
auto add1 = std::make_shared<ov::op::v1::Add>(fc1, add_input1);

auto add_input2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 512});
auto add2 = std::make_shared<ov::op::v1::Add>(fc2, add_input2);

auto add_input3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 128});
auto add3 = std::make_shared<ov::op::v1::Add>(fc3, add_input3);

auto reshape_pattern = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, pattern);
auto reshape1 = std::make_shared<ov::op::v1::Reshape>(add1, reshape_pattern, true);
auto reshape2 = std::make_shared<ov::op::v1::Reshape>(add2, reshape_pattern, true);
auto reshape3 = std::make_shared<ov::op::v1::Reshape>(add3, reshape_pattern, true);
auto result1 = std::make_shared<ov::op::v0::Result>(reshape1);
auto result2 = std::make_shared<ov::op::v0::Result>(reshape2);
auto result3 = std::make_shared<ov::op::v0::Result>(reshape3);
model = std::make_shared<ov::Model>(ov::ResultVector{result1, result2, result3}, ov::ParameterVector{input});
manager.register_pass<FullyConnectedHorizontalFusion>();
}
{
// Reference graph: a single FC built from concatenated operands.
auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 7, 4096});
auto weight1 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{1024, 4096});
weight1->set_friendly_name("weight2_1");
auto weight2 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{512, 4096});
weight2->set_friendly_name("weight2_2");
auto weight3 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{128, 4096});
weight3->set_friendly_name("weight2_3");
auto weights = ov::OutputVector{weight1, weight2, weight3};
// Weights are concatenated along axis 0 (the dimension that differs between them).
auto weight_fused = std::make_shared<ov::op::v0::Concat>(weights, 0);
// The former Add constants ({1, N}) become the FC bias; they differ only on axis 1.
auto bias1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 1024});
auto bias2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 512});
auto bias3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 128});
auto biases = ov::OutputVector{bias1, bias2, bias3};
// Biases are therefore concatenated along axis 1, unlike weights and scales.
auto bias_fused = std::make_shared<ov::op::v0::Concat>(biases, 1);
auto scale1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1024, 32});
auto scale2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{512, 32});
auto scale3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{128, 32});
auto scales = ov::OutputVector{scale1, scale2, scale3};
auto scale_fused = std::make_shared<ov::op::v0::Concat>(scales, 0);
auto fc_fused = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight_fused, bias_fused, scale_fused);
// Split the fused output along its last axis.
auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {fc_fused->get_output_partial_shape(0).size() - 1});
// Split lengths equal the first dimensions of the three original weights.
std::vector<int64_t> orig_n_sizes = {1024, 512, 128};
auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, orig_n_sizes);
auto split = std::make_shared<ov::op::v1::VariadicSplit>(fc_fused, axis_const, split_const);
auto reshape_pattern = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, pattern);
// Each Reshape consumes one split output directly; there are no Add nodes in the reference graph.
auto reshape1 = std::make_shared<ov::op::v1::Reshape>(split->output(0), reshape_pattern, true);
auto reshape2 = std::make_shared<ov::op::v1::Reshape>(split->output(1), reshape_pattern, true);
auto reshape3 = std::make_shared<ov::op::v1::Reshape>(split->output(2), reshape_pattern, true);
auto result1 = std::make_shared<ov::op::v0::Result>(reshape1);
auto result2 = std::make_shared<ov::op::v0::Result>(reshape2);
auto result3 = std::make_shared<ov::op::v0::Result>(reshape3);
model_ref = std::make_shared<ov::Model>(ov::ResultVector{result1, result2, result3}, ov::ParameterVector{input});
// Enable the comparator's ATTRIBUTES check for this test.
comparator.enable(FunctionsComparator::ATTRIBUTES);
}
}
} // namespace intel_gpu
} // namespace test
} // namespace ov

0 comments on commit 3a86765

Please sign in to comment.