From 433e44e37d3df3ee2db1ebbf2cbf0651ad86f6f6 Mon Sep 17 00:00:00 2001
From: Alexey Smirnov
Date: Thu, 24 Oct 2024 20:56:32 +0100
Subject: [PATCH] [NPUW] Add Slice before last MatMul (#27229)

Based on https://github.com/openvinotoolkit/openvino.genai/pull/814
---
 .../src/al/include/intel_npu/config/npuw.hpp  |   1 +
 .../intel_npu/npuw_private_properties.hpp     |   8 +
 .../intel_npu/src/al/src/config/npuw.cpp      |   1 +
 .../src/plugin/npuw/compiled_model.cpp        |  11 ++
 .../plugin/npuw/partitioning/patterns/opt.cpp | 158 ++++++++++++++++--
 .../plugin/npuw/partitioning/patterns/opt.hpp |  21 +++
 6 files changed, 188 insertions(+), 12 deletions(-)

diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
index 3eb7d3df218b41..7b0dab3d16da3c 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
@@ -43,6 +43,7 @@ DEFINE_OPT(NPUW_FOLD, bool, false, npuw::partitioning::fold, CompileTime);
 DEFINE_OPT(NPUW_CWAI, bool, false, npuw::partitioning::cwai, CompileTime);
 DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, CompileTime);
 DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims, CompileTime);
+DEFINE_OPT(NPUW_SLICE_OUT, bool, false, npuw::partitioning::slice_out, CompileTime);
 DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, CompileTime);
 DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, CompileTime);
 DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 128, npuw::partitioning::spatial_nway, CompileTime);
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
index a3eb4ecfa8cb63..5d6c6da22eb994 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
+++
b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
@@ -194,6 +194,14 @@ static constexpr ov::Property<bool> dyn_quant{"NPUW_DQ"};
  */
 static constexpr ov::Property<std::string> par_matmul_merge_dims{"NPUW_PMM"};
 
+/**
+ * @brief
+ * Type: bool.
+ * Add Slice before the last MatMul reducing output's dimension.
+ * Default value: false.
+ */
+static constexpr ov::Property<bool> slice_out{"NPUW_SLICE_OUT"};
+
 /**
  * @brief
  * Type: boolean.
diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
index 3b108c2068b70d..6a519a0f754a32 100644
--- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp
+++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -28,6 +28,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
     desc.add<NPUW_CWAI>();
     desc.add<NPUW_DQ>();
     desc.add<NPUW_PMM>();
+    desc.add<NPUW_SLICE_OUT>();
     desc.add<NPUW_HOST_GATHER>();
     desc.add<NPUW_SPATIAL>();
     desc.add<NPUW_SPATIAL_NWAY>();
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index c6ef93ff1044be..69d68e020b887b 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -146,6 +146,16 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
         rewr.run_on_model(model);
     }
 
+    if (m_cfg.get<::intel_npu::NPUW_SLICE_OUT>()) {
+        // Add Slice before last MatMul for the prefill model
+        ov::pass::GraphRewrite rewr;
+        rewr.add_matcher<ov::npuw::patterns::opt::SliceLastMatmul>();
+        rewr.add_matcher<ov::npuw::patterns::opt::SliceLastMatmulAdd>();
+        rewr.add_matcher<ov::npuw::patterns::opt::SliceLastMatmulTranspose>();
+        rewr.add_matcher<ov::npuw::patterns::opt::SliceLastMatmulMultiply>();
+        rewr.run_on_model(model);
+    }
+
     auto partitioning = getPartitioning(model, m_cfg);
     m_total_stat.gflops = partitioning.total_gflops;
     m_total_stat.ops = partitioning.total_ops;
@@ -906,6 +916,7 @@ void ov::npuw::CompiledModel::implement_properties() {
           BIND(npuw::partitioning::cwai, NPUW_CWAI),
           BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
           BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM),
+          BIND(npuw::partitioning::slice_out, NPUW_SLICE_OUT),
           BIND(npuw::partitioning::spatial,
NPUW_SPATIAL),
           BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY),
           BIND(npuw::partitioning::spatial_dyn, NPUW_SPATIAL_DYN),
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
index ddf1449adb9d59..6040e1e112e894 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
@@ -6,18 +6,7 @@
 #include "../../logging.hpp"
 #include "../../util.hpp"
-#include "openvino/op/add.hpp"
-#include "openvino/op/broadcast.hpp"
-#include "openvino/op/concat.hpp"
-#include "openvino/op/convert.hpp"
-#include "openvino/op/gather.hpp"
-#include "openvino/op/matmul.hpp"
-#include "openvino/op/multiply.hpp"
-#include "openvino/op/reduce_sum.hpp"
-#include "openvino/op/reshape.hpp"
-#include "openvino/op/slice.hpp"
-#include "openvino/op/split.hpp"
-#include "openvino/op/subtract.hpp"
+#include "openvino/op/ops.hpp"
 #include "openvino/op/util/op_types.hpp"
 #include "openvino/pass/pattern/op/label.hpp"  // any_input
 #include "openvino/pass/pattern/op/optional.hpp"
@@ -1296,6 +1285,151 @@ CompressDictMatMulf32::CompressDictMatMulf32(Context::Ref ctx) {
     register_matcher(std::make_shared<opp::Matcher>(res, "OptCompressDictMatMulf32"), std::move(callback));
 }
 
+SliceLastMatmul::SliceLastMatmul() {
+    auto matmul = opp::wrap_type<ov::op::v0::MatMul>({opp::any_input(), opp::any_input()});
+    auto res = opp::wrap_type<ov::op::v0::Result>({matmul});
+
+    // Note: Use [=] to make sure the above objects stay alive in the callback
+    auto callback = [=](ov::pass::pattern::Matcher& m) {
+        auto& node_to_output = m.get_pattern_value_map();
+
+        auto matched_out_matmul = node_to_output.at(matmul);
+
+        auto shape = matched_out_matmul.get_node()->input(0).get_shape();
+
+        if (shape.size() == 3 && shape[1] > 1) {
+            auto start = std::make_shared<ov::op::v0::Constant>(ov::element::i32,
+                                                                ov::Shape{3},
+                                                                std::vector<int32_t>{0, int32_t(shape[1] - 1), 0});
+            auto stop =
+                std::make_shared<ov::op::v0::Constant>(ov::element::i32,
+                                                       ov::Shape{3},
+                                                       std::vector<int32_t>{1, int32_t(shape[1]), int32_t(shape[2])});
+            auto step =
+                std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, std::vector<int32_t>{1, 1, 1});
+
+            auto slice =
+                std::make_shared<ov::op::v8::Slice>(matched_out_matmul.get_node()->input_value(0), start, stop, step);
+
+            matched_out_matmul.get_node()->input(0).replace_source_output(slice);
+
+            return true;  // root was changed
+        }
+        return false;  // root hasn't changed
+    };
+    register_matcher(std::make_shared<opp::Matcher>(res, "SliceLastMatmul"), std::move(callback));
+}
+
+SliceLastMatmulAdd::SliceLastMatmulAdd() {
+    auto matmul = opp::wrap_type<ov::op::v0::MatMul>({opp::any_input(), opp::any_input()});
+    auto add = opp::wrap_type<ov::op::v1::Add>({matmul, opp::any_input()});
+    auto res = opp::wrap_type<ov::op::v0::Result>({add});
+
+    // Note: Use [=] to make sure the above objects stay alive in the callback
+    auto callback = [=](ov::pass::pattern::Matcher& m) {
+        auto& node_to_output = m.get_pattern_value_map();
+
+        auto matched_out_matmul = node_to_output.at(matmul);
+
+        auto shape = matched_out_matmul.get_node()->input(0).get_shape();
+
+        if (shape.size() == 3 && shape[1] > 1) {
+            auto start = std::make_shared<ov::op::v0::Constant>(ov::element::i32,
+                                                                ov::Shape{3},
+                                                                std::vector<int32_t>{0, int32_t(shape[1] - 1), 0});
+            auto stop =
+                std::make_shared<ov::op::v0::Constant>(ov::element::i32,
+                                                       ov::Shape{3},
+                                                       std::vector<int32_t>{1, int32_t(shape[1]), int32_t(shape[2])});
+            auto step =
+                std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, std::vector<int32_t>{1, 1, 1});
+
+            auto slice =
+                std::make_shared<ov::op::v8::Slice>(matched_out_matmul.get_node()->input_value(0), start, stop, step);
+
+            matched_out_matmul.get_node()->input(0).replace_source_output(slice);
+
+            return true;  // root was changed
+        }
+        return false;  // root hasn't changed
+    };
+    register_matcher(std::make_shared<opp::Matcher>(res, "SliceLastMatmulAdd"), std::move(callback));
+}
+
+SliceLastMatmulTranspose::SliceLastMatmulTranspose() {
+    auto matmul = opp::wrap_type<ov::op::v0::MatMul>({opp::any_input(), opp::any_input()});
+    auto add = opp::wrap_type<ov::op::v1::Transpose>({matmul, opp::any_input()});
+    auto res =
opp::wrap_type<ov::op::v0::Result>({matmul});
+
+    // Note: Use [=] to make sure the above objects stay alive in the callback
+    auto callback = [=](ov::pass::pattern::Matcher& m) {
+        auto& node_to_output = m.get_pattern_value_map();
+
+        auto matched_out_matmul = node_to_output.at(matmul);
+
+        auto shape = matched_out_matmul.get_node()->input(0).get_shape();
+
+        if (shape.size() == 3 && shape[1] > 1) {
+            auto start = std::make_shared<ov::op::v0::Constant>(ov::element::i32,
+                                                                ov::Shape{3},
+                                                                std::vector<int32_t>{0, int32_t(shape[1] - 1), 0});
+            auto stop =
+                std::make_shared<ov::op::v0::Constant>(ov::element::i32,
+                                                       ov::Shape{3},
+                                                       std::vector<int32_t>{1, int32_t(shape[1]), int32_t(shape[2])});
+            auto step =
+                std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, std::vector<int32_t>{1, 1, 1});
+
+            auto slice =
+                std::make_shared<ov::op::v8::Slice>(matched_out_matmul.get_node()->input_value(0), start, stop, step);
+
+            matched_out_matmul.get_node()->input(0).replace_source_output(slice);
+
+            return true;  // root was changed
+        }
+        return false;  // root hasn't changed
+    };
+    register_matcher(std::make_shared<opp::Matcher>(res, "SliceLastMatmulTranspose"), std::move(callback));
+}
+
+SliceLastMatmulMultiply::SliceLastMatmulMultiply() {
+    auto matmul = opp::wrap_type<ov::op::v0::MatMul>({opp::any_input(), opp::any_input()});
+    auto div = opp::wrap_type<ov::op::v1::Divide>({matmul, opp::any_input()});
+    auto tanh = opp::wrap_type<ov::op::v0::Tanh>({div});
+    auto multiply = opp::wrap_type<ov::op::v1::Multiply>({tanh, opp::any_input()});
+    auto res = opp::wrap_type<ov::op::v0::Result>({multiply});
+
+    // Note: Use [=] to make sure the above objects stay alive in the callback
+    auto callback = [=](ov::pass::pattern::Matcher& m) {
+        auto& node_to_output = m.get_pattern_value_map();
+
+        auto matched_out_matmul = node_to_output.at(matmul);
+
+        auto shape = matched_out_matmul.get_node()->input(0).get_shape();
+
+        if (shape.size() == 3 && shape[1] > 1) {
+            auto start = std::make_shared<ov::op::v0::Constant>(ov::element::i32,
+                                                                ov::Shape{3},
+                                                                std::vector<int32_t>{0, int32_t(shape[1] - 1), 0});
+            auto stop =
+                std::make_shared<ov::op::v0::Constant>(ov::element::i32,
+                                                       ov::Shape{3},
+                                                       std::vector<int32_t>{1, int32_t(shape[1]), int32_t(shape[2])});
+            auto step =
+                std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, std::vector<int32_t>{1, 1, 1});
+
+            auto slice =
+                std::make_shared<ov::op::v8::Slice>(matched_out_matmul.get_node()->input_value(0), start, stop, step);
+
+            matched_out_matmul.get_node()->input(0).replace_source_output(slice);
+
+            return true;  // root was changed
+        }
+        return false;  // root hasn't changed
+    };
+    register_matcher(std::make_shared<opp::Matcher>(res, "SliceLastMatmulMultiply"), std::move(callback));
+}
+
 }  // namespace opt
 }  // namespace patterns
 }  // namespace npuw
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp
index b649f6a136c2e7..a66012d4a85fb8 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp
@@ -149,6 +149,27 @@ class CompressDictMatMulf32 : public ov::pass::MatcherPass {
 public:
     CompressDictMatMulf32(Context::Ref ctx);
 };
 
+// Slice last Matmul
+class SliceLastMatmul : public ov::pass::MatcherPass {
+public:
+    SliceLastMatmul();
+};
+
+class SliceLastMatmulAdd : public ov::pass::MatcherPass {
+public:
+    SliceLastMatmulAdd();
+};
+
+class SliceLastMatmulTranspose : public ov::pass::MatcherPass {
+public:
+    SliceLastMatmulTranspose();
+};
+
+class SliceLastMatmulMultiply : public ov::pass::MatcherPass {
+public:
+    SliceLastMatmulMultiply();
+};
+
 }  // namespace opt
 }  // namespace patterns
 }  // namespace npuw