[NVIDIA] Adds tests for fuse matmul with add transformation. Fixes bug when input node has several output ports. (#690)
nkogteva authored Jul 24, 2023
1 parent 149de40 commit 8e60111
Showing 5 changed files with 276 additions and 88 deletions.
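The new pass is exposed as ov::nvidia_gpu::pass::FullyConnectedTransformation. A minimal usage sketch, mirroring how the unit tests below drive it (the helper function name is hypothetical; the plugin's actual pipeline file is not part of this diff):

#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/pass/manager.hpp"
#include "transformations/init_node_info.hpp"
#include "transformer/fuse_matmul_add.hpp"

// Hypothetical helper: runs the MatMul + Add fusion on a model, as the tests do.
void run_fuse_matmul_add(const std::shared_ptr<ov::Model>& model) {
    ov::pass::Manager pass_manager;
    pass_manager.register_pass<ov::pass::InitNodeInfo>();
    pass_manager.register_pass<ov::nvidia_gpu::pass::FullyConnectedTransformation>();
    pass_manager.run_passes(model);
}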

This file was deleted.

@@ -27,7 +27,7 @@

#include "bidirectional_lstm_sequence_composition.hpp"
#include "concat_transformation.hpp"
#include "cuda_fullyconnected_transformation.hpp"
#include "fuse_matmul_add.hpp"
#include "matmul_transformations.hpp"
#include "noop_broadcast_transformation.hpp"
#include "remove_duplicated_results_transformation.hpp"
104 changes: 104 additions & 0 deletions modules/nvidia_plugin/src/transformer/fuse_matmul_add.cpp
@@ -0,0 +1,104 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "openvino/cc/pass/itt.hpp"
#include "fuse_matmul_add.hpp"

#include <exec_graph_info.hpp>
#include "openvino/core/rt_info.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "openvino/pass/pattern/op/or.hpp"
#include <openvino/op/add.hpp>
#include <openvino/op/matmul.hpp>
#include <ops/matmul.hpp>

using namespace ov::pass::pattern;

namespace {
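// Returns the MatMul node and the constant bias feeding the Add, in whichever order they appear on the Add inputs; returns {nullptr, nullptr} if the Add does not match that pattern.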
std::pair<std::shared_ptr<ov::op::v0::MatMul>, std::shared_ptr<ov::op::v0::Constant>> get_matmul_constant_nodes(const std::shared_ptr<ov::Node>& add_node) {
if (std::dynamic_pointer_cast<ov::op::v0::Constant>(add_node->get_input_node_shared_ptr(1))) {
return {std::dynamic_pointer_cast<ov::op::v0::MatMul>(add_node->get_input_node_shared_ptr(0)),
std::dynamic_pointer_cast<ov::op::v0::Constant>(add_node->get_input_node_shared_ptr(1))};
} else if (std::dynamic_pointer_cast<ov::op::v0::Constant>(add_node->get_input_node_shared_ptr(0))) {
return {std::dynamic_pointer_cast<ov::op::v0::MatMul>(add_node->get_input_node_shared_ptr(1)),
std::dynamic_pointer_cast<ov::op::v0::Constant>(add_node->get_input_node_shared_ptr(0))};
}
return {nullptr, nullptr};
}

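// An Add qualifies for fusion only when it pairs a MatMul with a constant bias whose (broadcast) shape covers the MatMul output exactly once and whose batch does not exceed the MatMul batch.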
bool is_add_to_be_fused(const ov::Output<ov::Node>& output) {
auto add_node = std::dynamic_pointer_cast<ov::op::v1::Add>(output.get_node_shared_ptr());
if (!add_node) {
return false;
}
std::shared_ptr<ov::op::v0::MatMul> matmul_node;
std::shared_ptr<ov::op::v0::Constant> constant_node;
std::tie(matmul_node, constant_node) = get_matmul_constant_nodes(add_node);
if (!matmul_node || !constant_node) {
return false;
}

auto matrix_A_shape = matmul_node->get_input_shape(0);
auto matrix_B_shape = matmul_node->get_input_shape(1);
const auto matrix_shape = matmul_node->get_output_shape(0);
ov::nvidia_gpu::MatMulOp::BroadcastToMatrix(matrix_A_shape);
ov::nvidia_gpu::MatMulOp::BroadcastToMatrix(matrix_B_shape);
const auto matmul_batch = std::max(ov::nvidia_gpu::MatMulOp::GetMatrixNumBatches(matrix_A_shape),
ov::nvidia_gpu::MatMulOp::GetMatrixNumBatches(matrix_B_shape));

auto const_shape = constant_node->get_output_shape(0);
ov::nvidia_gpu::MatMulOp::BroadcastToMatrix(const_shape);
const auto const_batch = ov::nvidia_gpu::MatMulOp::GetMatrixNumBatches(const_shape);
const auto const_shape_size = ov::shape_size(const_shape);
const auto matrix_shape_size = ov::shape_size(matrix_shape);
const auto num_auto_const_batch = matrix_shape_size / const_shape_size;
const auto matmul_shape_dividable = matrix_shape_size % const_shape_size;
if (matmul_batch < const_batch || matmul_shape_dividable != 0 || num_auto_const_batch > 1) {
return false;
}
return true;
}
} // namespace

namespace ov::nvidia_gpu::pass {
bool fuse_matmul_and_add(Matcher &m) {
// Replace the matched MatMul + Add(constant bias) pair with a single FullyConnected node
auto add_node = std::dynamic_pointer_cast<ov::op::v1::Add>(m.get_match_root());
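// Remember the Add consumers now so they can be re-attached to the fused node after it is created.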
auto consumers = add_node->output(0).get_target_inputs();
std::shared_ptr<ov::op::v0::MatMul> matmul_node;
std::shared_ptr<ov::op::v0::Constant> constant_node;
std::tie(matmul_node, constant_node) = get_matmul_constant_nodes(add_node);
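// get_input_source_output() preserves the producer's output port index, so a MatMul input coming from a node with several output ports (e.g. VariadicSplit) stays connected to the correct port.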
const auto fully_connected_node =
std::make_shared<ov::nvidia_gpu::nodes::FullyConnected>(matmul_node->get_input_source_output(0),
matmul_node->get_input_source_output(1),
constant_node,
matmul_node->get_transpose_a(),
matmul_node->get_transpose_b());
fully_connected_node->set_friendly_name(add_node->get_friendly_name());
ov::copy_runtime_info({matmul_node, add_node}, fully_connected_node);

const std::string original_layers = matmul_node->get_friendly_name() + "," + add_node->get_friendly_name();
fully_connected_node->get_rt_info()[ExecGraphInfoSerialization::ORIGINAL_NAMES] = original_layers;

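// Re-point every consumer of the Add output to the new FullyConnected node.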
for (auto input : consumers) {
input.replace_source_output(fully_connected_node);
}
return true;
}

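// Match MatMul -> Add with a constant bias on either Add input and fuse the pair into a single FullyConnected node.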
FullyConnectedTransformation::FullyConnectedTransformation() {
MATCHER_SCOPE(FullyConnectedTransformation);
auto matmul = wrap_type<ov::op::v0::MatMul>(consumers_count(1));
auto bias = wrap_type<ov::op::v0::Constant>();
auto add0 = wrap_type<ov::op::v1::Add>({matmul, bias}, is_add_to_be_fused);
auto add1 = wrap_type<ov::op::v1::Add>({bias, matmul}, is_add_to_be_fused);
auto result = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{add0, add1});

matcher_pass_callback callback = [](Matcher &m) { return fuse_matmul_and_add(m); };

auto m = std::make_shared<Matcher>(result, matcher_name);
register_matcher(m, callback);
}

} // namespace ov::nvidia_gpu::pass
171 changes: 171 additions & 0 deletions modules/nvidia_plugin/tests/unit/transformations/fuse_matmul_add.cpp
@@ -0,0 +1,171 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <gtest/gtest.h>

#include <tuple>

#include "transformer/fuse_matmul_add.hpp"
#include "transformer/nodes/fully_connected.hpp"

#include "common_test_utils/ngraph_test_utils.hpp"
#include "openvino/core/model.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/pass/manager.hpp"
#include "transformations/init_node_info.hpp"
#include "transformations/utils/utils.hpp"

using ov::nvidia_gpu::nodes::FullyConnected;
using namespace ov;
using namespace std;

namespace testing {

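// Each test builds a model, runs FullyConnectedTransformation via a pass manager, and compares the result against a manually constructed reference model.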
TEST(fuse_matmul_add, parameters_matmul_add_constant) {
shared_ptr<ov::Model> model, model_ref;
{
auto input0 = make_shared<op::v0::Parameter>(element::f32, Shape{1, 512});
auto input1 = make_shared<op::v0::Parameter>(element::f32, Shape{1024, 512});
auto matmul = make_shared<op::v0::MatMul>(input0, input1, false, true);
auto const_node = op::v0::Constant::create(element::f32, Shape{1, 1024}, {1});
auto add = make_shared<op::v1::Add>(matmul, const_node);
model = make_shared<Model>(add, ParameterVector{input0, input1});

pass::Manager pass_manager;
pass_manager.register_pass<pass::InitNodeInfo>();
pass_manager.register_pass<nvidia_gpu::pass::FullyConnectedTransformation>();
pass_manager.run_passes(model);

ASSERT_EQ(count_ops_of_type<op::v0::MatMul>(model), 0);
}
{
auto input0 = make_shared<op::v0::Parameter>(element::f32, Shape{1, 512});
auto input1 = make_shared<op::v0::Parameter>(element::f32, Shape{1024, 512});
auto const_node = op::v0::Constant::create(element::f32, Shape{1, 1024}, {1});
auto fc = make_shared<FullyConnected>(input0, input1, const_node, false, true);
model_ref = make_shared<Model>(fc, ParameterVector{input0, input1});
}

auto res = compare_functions(model, model_ref);
ASSERT_TRUE(res.first) << res.second;
}

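// Negative case: the bias comes from a Parameter rather than a Constant, so the MatMul and Add must stay unfused.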
TEST(fuse_matmul_add, parameters_matmul_add_parameter_fail) {
shared_ptr<ov::Model> model, model_ref;
{
auto input0 = make_shared<op::v0::Parameter>(element::f32, Shape{1, 512});
auto input1 = make_shared<op::v0::Parameter>(element::f32, Shape{512, 1024});
auto matmul = make_shared<op::v0::MatMul>(input0, input1, false, false);
auto input3 = make_shared<op::v0::Parameter>(element::f32, Shape{1, 1024});
auto add = make_shared<op::v1::Add>(matmul, input3);
model = make_shared<Model>(add, ParameterVector{input0, input1, input3});
}
model_ref = model->clone();

pass::Manager pass_manager;
pass_manager.register_pass<pass::InitNodeInfo>();
pass_manager.register_pass<nvidia_gpu::pass::FullyConnectedTransformation>();
pass_manager.run_passes(model);

ASSERT_EQ(count_ops_of_type<op::v0::MatMul>(model), 1);

auto res = compare_functions(model, model_ref);
ASSERT_TRUE(res.first) << res.second;
}

TEST(fuse_matmul_add, parameter_constant_matmul_add_constant) {
shared_ptr<ov::Model> model, model_ref;
{
auto input = make_shared<op::v0::Parameter>(element::f32, Shape{512, 1});
auto const_node0 = op::v0::Constant::create(element::f32, Shape{1024, 512}, {1});
auto matmul = make_shared<op::v0::MatMul>(input, const_node0, true, true);
auto const_node1 = op::v0::Constant::create(element::f32, Shape{1, 1024}, {2});
auto add = make_shared<op::v1::Add>(matmul, const_node1);
model = make_shared<Model>(add, ParameterVector{input});

pass::Manager pass_manager;
pass_manager.register_pass<pass::InitNodeInfo>();
pass_manager.register_pass<nvidia_gpu::pass::FullyConnectedTransformation>();
pass_manager.run_passes(model);

ASSERT_EQ(count_ops_of_type<op::v0::MatMul>(model), 0);
}
{
auto input = make_shared<op::v0::Parameter>(element::f32, Shape{512, 1});
auto const_node0 = op::v0::Constant::create(element::f32, Shape{1024, 512}, {1});
auto const_node1 = op::v0::Constant::create(element::f32, Shape{1, 1024}, {2});
auto fc = make_shared<FullyConnected>(input, const_node0, const_node1, true, true);
model_ref = make_shared<Model>(fc, ParameterVector{input});
}

auto res = compare_functions(model, model_ref);
ASSERT_TRUE(res.first) << res.second;
}

TEST(fuse_matmul_add, constant_parameter_matmul_add_constant) {
shared_ptr<ov::Model> model, model_ref;
{
auto const_node0 = op::v0::Constant::create(element::f32, Shape{1024, 512}, {1});
auto input = make_shared<op::v0::Parameter>(element::f32, Shape{1024, 1});
auto matmul = make_shared<op::v0::MatMul>(const_node0, input, true, false);
auto const_node1 = op::v0::Constant::create(element::f32, Shape{1, 512}, {2});
auto add = make_shared<op::v1::Add>(matmul, const_node1);
model = make_shared<Model>(add, ParameterVector{input});

pass::Manager pass_manager;
pass_manager.register_pass<pass::InitNodeInfo>();
pass_manager.register_pass<nvidia_gpu::pass::FullyConnectedTransformation>();
pass_manager.run_passes(model);

ASSERT_EQ(count_ops_of_type<op::v0::MatMul>(model), 0);
}
{
auto const_node0 = op::v0::Constant::create(element::f32, Shape{1024, 512}, {1});
auto input = make_shared<op::v0::Parameter>(element::f32, Shape{1024, 1});
auto const_node1 = op::v0::Constant::create(element::f32, Shape{1, 512}, {2});
auto fc = make_shared<FullyConnected>(const_node0, input, const_node1, true, false);
model_ref = make_shared<Model>(fc, ParameterVector{input});
}

auto res = compare_functions(model, model_ref);
ASSERT_TRUE(res.first) << res.second;
}

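// Regression test for the bug fixed in this commit: the MatMul input is produced by output port 1 of a VariadicSplit, i.e. a node with several output ports.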
TEST(fuse_matmul_add, parameter_variadic_split_matmul_add_constant) {
shared_ptr<ov::Model> model, model_ref;
{
auto input = make_shared<op::v0::Parameter>(element::f32, Shape{197, 128});
auto split = make_shared<op::v1::VariadicSplit>(input,
op::v0::Constant::create(element::i32, {}, {0}),
op::v0::Constant::create(element::i32, Shape{2}, {196, 1}));
auto const_node0 = op::v0::Constant::create(element::f32, Shape{128, 128}, {1});
auto matmul = make_shared<op::v0::MatMul>(split->output(1), const_node0, false, true);
auto const_node1 = op::v0::Constant::create(element::f32, Shape{1, 128}, {2});
auto add = make_shared<op::v1::Add>(matmul, const_node1);
auto concat = make_shared<op::v0::Concat>(OutputVector{split->output(0), add}, 0);
model = make_shared<Model>(concat, ParameterVector{input});

pass::Manager pass_manager;
pass_manager.register_pass<pass::InitNodeInfo>();
pass_manager.register_pass<nvidia_gpu::pass::FullyConnectedTransformation>();
pass_manager.run_passes(model);

ASSERT_EQ(count_ops_of_type<op::v0::MatMul>(model), 0);
}
{
auto input = make_shared<op::v0::Parameter>(element::f32, Shape{197, 128});
auto split = make_shared<op::v1::VariadicSplit>(input,
op::v0::Constant::create(element::i32, {}, {0}),
op::v0::Constant::create(element::i32, Shape{2}, {196, 1}));
auto const_node0 = op::v0::Constant::create(element::f32, Shape{128, 128}, {1});
auto const_node1 = op::v0::Constant::create(element::f32, Shape{1, 128}, {2});
auto fc = make_shared<FullyConnected>(split->output(1), const_node0, const_node1, false, true);
auto concat = make_shared<op::v0::Concat>(OutputVector{split->output(0), fc}, 0);
model_ref = make_shared<Model>(concat, ParameterVector{input});
}

auto res = compare_functions(model, model_ref);
ASSERT_TRUE(res.first) << res.second;
}
} // namespace testing
