diff --git a/src/frontends/onnx/frontend/src/op/com.microsoft/matmulnbits.cpp b/src/frontends/onnx/frontend/src/op/com.microsoft/matmulnbits.cpp
new file mode 100644
index 00000000000000..5b8a439933efd1
--- /dev/null
+++ b/src/frontends/onnx/frontend/src/op/com.microsoft/matmulnbits.cpp
@@ -0,0 +1,236 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <cmath>
+
+#include "core/operator_set.hpp"
+#include "exceptions.hpp"
+#include "openvino/frontend/exception.hpp"
+#include "openvino/op/add.hpp"
+#include "openvino/op/broadcast.hpp"
+#include "openvino/op/constant.hpp"
+#include "openvino/op/convert_like.hpp"
+#include "openvino/op/matmul.hpp"
+#include "openvino/op/multiply.hpp"
+#include "openvino/op/shape_of.hpp"
+#include "openvino/op/slice.hpp"
+#include "openvino/op/subtract.hpp"
+#include "openvino/op/transpose.hpp"
+#include "utils/reshape.hpp"
+
+using namespace ov::op;
+
+namespace ov {
+namespace frontend {
+namespace onnx {
+namespace com_microsoft {
+namespace opset_1 {
+ov::OutputVector matmulnbits(const ov::frontend::onnx::Node& node) {
+    // Original documentation:
+    // https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#com.microsoft.MatMulNBits
+    const auto inputs = node.get_ov_inputs();
+    FRONT_END_OP_CONVERSION_CHECK(inputs.size() >= 3, "Minimum 3 inputs are required. Got: ", inputs.size());
+    const auto& a = inputs[0];  // required
+    ov::Output<ov::Node> b;
+    const auto& b_quantized = inputs[1];                                      // required
+    const auto& scales = inputs[2];                                           // required
+    ov::Output<ov::Node> zero_points;                                         // optional, input[3]
+    ov::Output<ov::Node> group_idx;                                           // optional, input[4]
+    ov::Output<ov::Node> bias;                                                // optional, input[5]
+    const auto K = node.get_attribute_value<int64_t>("K");                    // required
+    const auto N = node.get_attribute_value<int64_t>("N");                    // required
+    const auto accuracy_level = node.get_attribute_value<int64_t>("accuracy_level", 0);  // optional, default unset (0)
+    const auto block_size = node.get_attribute_value<int64_t>("block_size");  // required
+    const auto bits = node.get_attribute_value<int64_t>(
+        "bits",
+        4);  // required, in docs: number of bits used for weight quantization (default 4)
+
+    const uint64_t n_blocks_per_col = (K + block_size - 1) / block_size;
+    const auto blob_size = static_cast<uint64_t>(ceil(block_size * bits / 8));
+
+    CHECK_VALID_NODE(node, n_blocks_per_col > 0, "Wrong blocks count: ", n_blocks_per_col);
+    CHECK_VALID_NODE(node, blob_size > 0, "Wrong blob size: ", blob_size);
+    // in documentation: ...Input B is a 2D constant Matrix.
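+    // The logical weight is a [K, N] matrix, but it arrives packed as [N][n_blocks_per_col][blob_size];
+    // e.g. for the 3x17 test model added below (K = 17, N = 3, bits = 4, block_size = 16) this gives
+    // n_blocks_per_col = (17 + 16 - 1) / 16 = 2 and blob_size = 16 * 4 / 8 = 8, i.e. a [3][2][8] u8 constant.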
+    CHECK_VALID_NODE(node,
+                     dynamic_cast<v0::Constant*>(b_quantized.get_node()) != nullptr,
+                     "MatMulNBits limitation: accepting only a constant as a B input");
+    CHECK_VALID_NODE(node,
+                     b_quantized.get_partial_shape().rank() == 3,
+                     "Expected rank of quantized weights is 3 [N][n_blocks_per_col][blob_size], got: ",
+                     b_quantized.get_partial_shape().rank());
+    CHECK_VALID_NODE(node,
+                     a.get_element_type() == ov::element::f16 || a.get_element_type() == ov::element::f32,
+                     "Unsupported input A type, accepted FP16, FP32, got: ",
+                     a.get_element_type());
+    CHECK_VALID_NODE(
+        node,
+        b_quantized.get_element_type() == ov::element::u8 || b_quantized.get_element_type() == ov::element::i32,
+        "Unsupported input B type, accepted U8, I32, got: ",
+        b_quantized.get_element_type());
+
+    CHECK_VALID_NODE(node,
+                     block_size >= 16 && ((block_size & (block_size - 1)) == 0),
+                     "Wrong block size, should be >=16 and be a power of 2, got: ",
+                     block_size);
+    CHECK_VALID_NODE(node, accuracy_level >= 0 && accuracy_level <= 4, "Unsupported accuracy level: ", accuracy_level);
+
+    if (inputs.size() > 3) {
+        zero_points = inputs[3];
+        CHECK_VALID_NODE(node,
+                         zero_points.get_element_type() == ov::element::u8 ||
+                             zero_points.get_element_type() == ov::element::i32 ||
+                             zero_points.get_element_type() == ov::element::f32 ||
+                             zero_points.get_element_type() == ov::element::f16,
+                         "Unsupported input zero_points type, accepted U8, I32, FP16, FP32, got: ",
+                         zero_points.get_element_type());
+    }
+
+    if (inputs.size() > 4) {
+        group_idx = inputs[4];
+        CHECK_VALID_NODE(node,
+                         group_idx.get_element_type() == ov::element::i32,
+                         "Unsupported input group_idx type, accepted I32, got: ",
+                         group_idx.get_element_type());
+    }
+
+    if (inputs.size() > 5) {
+        bias = inputs[5];
+        CHECK_VALID_NODE(node,
+                         bias.get_element_type() == a.get_element_type(),
+                         "Unsupported input bias type, must be equal to input A type, got: ",
+                         bias.get_element_type());
+        CHECK_VALID_NODE(node,
+                         bias.get_partial_shape() == PartialShape{N},
+                         "Wrong bias shape, expected [",
+                         N,
+                         "], got: ",
+                         bias.get_partial_shape());
+    }
+
+    {
+        const auto b_const = std::dynamic_pointer_cast<v0::Constant>(b_quantized.get_node_shared_ptr());
+
+        ov::Output<ov::Node> casted_b;
+        ov::Shape casted_b_shape;
+        ov::Output<ov::Node> default_zp;
+        // Casting/converting data of the source constant.
+        // For further calculations (sub and/or multiply) we need to reshape it from
+        // [N][n_blocks_per_col][blob_size * X] to [N * n_blocks_per_col][blob_size * X] (where X is the amount of
+        // values in 1 byte), because scale and zero_point are represented with shape like [N * n_blocks_per_col].
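+        // X is 4 for 2-bit, 2 for 4-bit and 1 for 8-bit weights, so e.g. the [3][2][8] u8 constant from the
+        // 3x17 test model is reinterpreted as a u4 constant of shape [6][16] by the 4-bit branch below.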
+        switch (bits) {
+        case 2:
+            casted_b_shape = ov::Shape{static_cast<size_t>(N * n_blocks_per_col), static_cast<size_t>(blob_size * 4)};
+            casted_b = std::make_shared<v0::Constant>(ov::element::u2, casted_b_shape, b_const->get_data_ptr());
+            default_zp = std::make_shared<v0::Constant>(a.get_element_type(), Shape{}, 2);
+            break;
+        case 4:
+            casted_b_shape = ov::Shape{static_cast<size_t>(N * n_blocks_per_col), static_cast<size_t>(blob_size * 2)};
+            casted_b = std::make_shared<v0::Constant>(ov::element::u4, casted_b_shape, b_const->get_data_ptr());
+            default_zp = std::make_shared<v0::Constant>(a.get_element_type(), Shape{}, 8);
+            break;
+        case 8:
+            casted_b_shape = ov::Shape{static_cast<size_t>(N * n_blocks_per_col), static_cast<size_t>(blob_size)};
+            casted_b = op::util::reshape(b_const, casted_b_shape);
+            default_zp = std::make_shared<v0::Constant>(a.get_element_type(), Shape{}, 128);
+            break;
+        default:
+            FRONT_END_THROW("Unsupported bits count");
+            break;
+        }
+
+        // Possible issue with the Slice implementation, had to move the conversion before the Slice
+        // instead of slicing uint4
+        // TODO: Ticket
+        const auto converted_b = std::make_shared<v1::ConvertLike>(casted_b, a);
+
+        // TODO: Need to collect performance data in case constant folding is applied. Possibly some perf/mem gap
+
+        // Simple case
+        if (n_blocks_per_col == 1) {
+            // Removing unused items in case the block is bigger than the column count.
+            // For example, if the data is (uint8)[1,2,3,4,5,6] then the block will be
+            // (uint8)[1,2,3,4,5,6,0,0,0,0,0,0,0,0,0,0], and the trailing zeros are unused.
+            const auto zero_const = std::make_shared<v0::Constant>(ov::element::i32, Shape{1}, 0);
+            const auto one_const = std::make_shared<v0::Constant>(ov::element::i32, Shape{1}, 1);
+            const auto elements_const =
+                std::make_shared<v0::Constant>(ov::element::i32, Shape{1}, static_cast<int32_t>(K));
+            const auto axis_const = std::make_shared<v0::Constant>(ov::element::i32, Shape{1}, 1);
+            const auto slice_b =
+                std::make_shared<v8::Slice>(converted_b, zero_const, elements_const, one_const, axis_const);
+
+            // Transpose the matrix
+            const auto transposed_shape =
+                std::make_shared<v0::Constant>(ov::element::i64, Shape{2}, std::vector<int64_t>{1, 0});
+            const auto transposed_b = std::make_shared<v1::Transpose>(slice_b, transposed_shape);
+
+            // If no zero points are provided, generate a default that depends on the quantization bit width
+            if (!zero_points.get_node_shared_ptr()) {
+                zero_points = default_zp;
+            }
+            const auto sub_b = std::make_shared<v1::Subtract>(transposed_b, zero_points);
+
+            // Scaling
+            const auto scaled_b = std::make_shared<v1::Multiply>(sub_b, scales);
+
+            // Adding bias if required
+            if (!bias.get_node_shared_ptr()) {
+                b = scaled_b;
+            } else {
+                b = std::make_shared<v1::Add>(scaled_b, bias);
+            }
+        } else {
+            // Transpose the matrix. The quantized B matrix is transposed and has a shape like [N, K].
+            // To apply the further operations, whose operands have shape [N], we do this
+            // transpose to get a [K, N] matrix...
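+            // The remaining steps: transpose -> subtract zero point and multiply by scale (applied per block
+            // via broadcasting) -> transpose back -> reshape to [N, n_blocks_per_col * block elements] ->
+            // slice away the per-block padding so that only the first K columns remain.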
+            const auto transposed_shape =
+                std::make_shared<v0::Constant>(ov::element::i64, Shape{2}, std::vector<int64_t>{1, 0});
+            ov::Output<ov::Node> transposed_b = std::make_shared<v1::Transpose>(converted_b, transposed_shape);
+
+            // If no zero points are provided, generate a default that depends on the quantization bit width
+            if (!zero_points.get_node_shared_ptr()) {
+                zero_points = default_zp;
+            }
+            const auto sub_b = std::make_shared<v1::Subtract>(transposed_b, zero_points);
+
+            // Scaling
+            const auto scaled_b = std::make_shared<v1::Multiply>(sub_b, scales);
+
+            // Transpose again to allow reshaping and slicing
+            transposed_b = std::make_shared<v1::Transpose>(scaled_b, transposed_shape);
+
+            const auto reshaped_b =
+                op::util::reshape(transposed_b,
+                                  ov::Shape{static_cast<size_t>(casted_b_shape[0] / n_blocks_per_col),
+                                            static_cast<size_t>(casted_b_shape[1] * n_blocks_per_col)});
+
+            // Removing unused items in case the block is bigger than the column count (see the description of
+            // Slice above)
+            const auto zero_const = std::make_shared<v0::Constant>(ov::element::i32, Shape{1}, 0);
+            const auto one_const = std::make_shared<v0::Constant>(ov::element::i32, Shape{1}, 1);
+            const auto elements_const =
+                std::make_shared<v0::Constant>(ov::element::i32, Shape{1}, static_cast<int32_t>(K));
+            const auto axis_const = std::make_shared<v0::Constant>(ov::element::i32, Shape{1}, 1);
+            const auto slice_b =
+                std::make_shared<v8::Slice>(reshaped_b, zero_const, elements_const, one_const, axis_const);
+
+            // Adding bias if required
+            if (!bias.get_node_shared_ptr()) {
+                return {std::make_shared<v0::MatMul>(a, slice_b, false, true)};
+            } else {
+                // Transpose again
+                transposed_b = std::make_shared<v1::Transpose>(slice_b, transposed_shape);
+
+                b = std::make_shared<v1::Add>(transposed_b, bias);
+            }
+        }
+    }
+
+    return {std::make_shared<v0::MatMul>(a, b)};
+}
+
+ONNX_OP("MatMulNBits", OPSET_SINCE(1), com_microsoft::opset_1::matmulnbits, MICROSOFT_DOMAIN);
+
+}  // namespace opset_1
+}  // namespace com_microsoft
+}  // namespace onnx
+}  // namespace frontend
+}  // namespace ov
diff --git a/src/frontends/onnx/tests/models/com.microsoft/matmulnbits_3x17.prototxt b/src/frontends/onnx/tests/models/com.microsoft/matmulnbits_3x17.prototxt
new file mode 100644
index 00000000000000..18c97a15c0f48a
--- /dev/null
+++ b/src/frontends/onnx/tests/models/com.microsoft/matmulnbits_3x17.prototxt
@@ -0,0 +1,92 @@
+ir_version: 3
+producer_name: "OpenVINO ONNX Frontend"
+producer_version: ""
+model_version: 0
+graph {
+  name: "test_matmul_2d"
+  node {
+    input: "a"
+    input: "b_Q4"
+    input: "b_scales"
+    output: "c"
+    op_type: "MatMulNBits"
+    attribute {
+      name: "K"
+      i: 17
+      type: INT
+    }
+    attribute {
+      name: "N"
+      i: 3
+      type: INT
+    }
+    attribute {
+      name: "accuracy_level"
+      i: 4
+      type: INT
+    }
+    attribute {
+      name: "bits"
+      i: 4
+      type: INT
+    }
+    attribute {
+      name: "block_size"
+      i: 16
+      type: INT
+    }
+    domain: "com.microsoft"
+  }
+  initializer {
+    dims: 3
+    dims: 2
+    dims: 8
+    data_type: 2
+    name: "b_Q4"
+    raw_data: "G\2025`\024G\2025\200\000\000\000\000\000\000\000Fq$X\003Fq$\210\000\000\000\000\000\000\0005`\024G\2025`\024\200\000\000\000\000\000\000\000"
+  }
+  initializer {
+    dims: 6
+    data_type: 1
+    name: "b_scales"
+    raw_data: "\000\000\220\277\000\000\220\277\000\000\220\277\000\000\000\200\000\000\220\277\000\000\000\276"
+  }
+  input {
+    name: "a"
+    type {
+      tensor_type {
+        elem_type: 1
+        shape {
+          dim {
+            dim_value: 3
+          }
+          dim {
+            dim_value: 17
+          }
+        }
+      }
+    }
+  }
+  output {
+    name: "c"
+    type {
+      tensor_type {
+        elem_type: 1
+        shape {
+          dim {
+            dim_value: 3
+          }
+          dim {
+            dim_value: 3
+          }
+        }
+      }
+    }
+  }
+}
+opset_import {
+  version: 7
+}
+opset_import {
+  version: 1
+}
diff --git a/src/frontends/onnx/tests/models/com.microsoft/matmulnbits_3x4.prototxt b/src/frontends/onnx/tests/models/com.microsoft/matmulnbits_3x4.prototxt
new file mode 100644
index 00000000000000..1db034a3caaf1f
--- /dev/null
+++ b/src/frontends/onnx/tests/models/com.microsoft/matmulnbits_3x4.prototxt
@@ -0,0 +1,92 @@
+ir_version: 3
+producer_name: "OpenVINO ONNX Frontend"
+producer_version: ""
+model_version: 0
+graph {
+  name: "test_matmul_2d"
+  node {
+    input: "a"
+    input: "b_Q4"
+    input: "b_scales"
+    output: "c"
+    op_type: "MatMulNBits"
+    attribute {
+      name: "K"
+      i: 4
+      type: INT
+    }
+    attribute {
+      name: "N"
+      i: 3
+      type: INT
+    }
+    attribute {
+      name: "accuracy_level"
+      i: 4
+      type: INT
+    }
+    attribute {
+      name: "bits"
+      i: 4
+      type: INT
+    }
+    attribute {
+      name: "block_size"
+      i: 16
+      type: INT
+    }
+    domain: "com.microsoft"
+  }
+  initializer {
+    dims: 3
+    dims: 1
+    dims: 8
+    data_type: 2
+    name: "b_Q4"
+    raw_data: "&P\000\000\000\000\000\000\005b\000\000\000\000\000\000\004\204\000\000\000\000\000\000"
+  }
+  initializer {
+    dims: 3
+    data_type: 1
+    name: "b_scales"
+    raw_data: "\000\000 \277\000\000 \277\000\000@\277"
+  }
+  input {
+    name: "a"
+    type {
+      tensor_type {
+        elem_type: 1
+        shape {
+          dim {
+            dim_value: 3
+          }
+          dim {
+            dim_value: 4
+          }
+        }
+      }
+    }
+  }
+  output {
+    name: "c"
+    type {
+      tensor_type {
+        elem_type: 1
+        shape {
+          dim {
+            dim_value: 3
+          }
+          dim {
+            dim_value: 3
+          }
+        }
+      }
+    }
+  }
+}
+opset_import {
+  version: 7
+}
+opset_import {
+  version: 1
+}
diff --git a/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp b/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp
index 9c017378611f4f..da8189926a4546 100644
--- a/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp
+++ b/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp
@@ -1300,3 +1300,33 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_pad_1d) {
 
     test_case.run();
 }
+
+OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_matmulnbits_3x4) {
+    const auto model = convert_model("com.microsoft/matmulnbits_3x4.onnx");
+    auto test_case = ov::test::TestCase(model, s_device);
+
+    test_case.add_input<float>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    test_case.add_expected_output<float>(Shape{3, 3},
+                                         {31.25f, 28.125f, 24.f, 78.75f, 75.625f, 72.f, 126.25f, 123.125f, 120.f});
+
+    test_case.run();
+}
+
+OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_matmulnbits_3x17) {
+    const auto model = convert_model("com.microsoft/matmulnbits_3x17.onnx");
+    auto test_case = ov::test::TestCase(model, s_device);
+
+    test_case.add_input<float>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6,
+                                7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1});
+
+    if (std::string("${BACKEND_NAME}") == std::string("IE_GPU")) {
+        test_case.add_expected_output<float>(
+            Shape{3, 3},
+            {425.25f, 372.5f, 352.25f, 446.5f, 448.75f, 476.5f, 400.25f, 480.5f, 533.f});
+    } else {
+        test_case.add_expected_output<float>(
+            Shape{3, 3},
+            {425.25f, 372.375f, 352.375f, 446.625f, 448.875f, 476.5f, 400.5f, 480.375f, 533.125f});
+    }
+    test_case.run();
+}