diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp
index 39423980521042..f4495fb5dd1645 100644
--- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp
@@ -50,7 +50,7 @@ struct FullyConnectedImplementationManager : public ImplementationManager {
         bool compressed_case = fc_prim->compressed_weights &&
                                one_of(in0_dt, {data_types::f16, data_types::f32, data_types::i8}) &&
                                one_of(wei_dt, {data_types::u8, data_types::i8, data_types::u4, data_types::i4}) &&
-                               one_of(out_dt, {data_types::f16, data_types::f32, data_types::u8, data_types::i8});
+                               one_of(out_dt, {data_types::f16, data_types::f32});
 
         if (!f16f16_case && !f32f32_case && !u8s8_case && !compressed_case)
             return false;
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index 770aa387da8a60..305e21a5000149 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -15,11 +15,8 @@
 #include "intel_gpu/plugin/transformations_pipeline.hpp"
 #include "intel_gpu/runtime/debug_configuration.hpp"
 #include "intel_gpu/runtime/itt.hpp"
-#include "low_precision/add.hpp"
 #include "low_precision/convolution.hpp"
 #include "low_precision/convolution_backprop_data.hpp"
-#include "low_precision/fold_convert.hpp"
-#include "low_precision/fuse_convert.hpp"
 #include "low_precision/group_convolution.hpp"
 #include "low_precision/low_precision.hpp"
 #include "low_precision/mat_mul.hpp"
@@ -28,9 +25,7 @@
 #include "low_precision/pull_reshape_through_dequantization.hpp"
 #include "low_precision/pull_transpose_through_dequantization.hpp"
 #include "low_precision/recurrent_cell.hpp"
-#include "low_precision/rt_info/bias_attribute.hpp"
 #include "low_precision/strided_slice.hpp"
-#include "low_precision/transpose.hpp"
 #include "openvino/core/deprecated.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/core/validation_util.hpp"
@@ -51,7 +46,6 @@
 #include "openvino/op/reshape.hpp"
 #include "openvino/op/rnn_cell.hpp"
 #include "openvino/op/rnn_sequence.hpp"
-#include "openvino/op/scaled_dot_product_attention.hpp"
 #include "openvino/op/squeeze.hpp"
 #include "openvino/op/unsqueeze.hpp"
 #include "openvino/op/util/sub_graph_base.hpp"
@@ -319,9 +313,13 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         // it expects to have the same data type for weights and zero points (apply it only for u8 data type, since other compression
         // types are not supported by oneDNN)
         manager.register_pass<ov::pass::MarkDequantizationSubgraph>(supported_woq_types, !device_info.supports_immad);
-        pass_config->set_callback<ov::pass::MarkDequantizationSubgraph>([&](const std::shared_ptr<const ov::Node> node) {
-            return !is_decompression_multiply(node);
-        });
+
+        // Need to check if transformations work correctly for mixed models with both compression and quantization at the same time.
+        if (!is_model_quantized) {
+            pass_config->set_callback<ov::pass::MarkDequantizationSubgraph>([&](const std::shared_ptr<const ov::Node> node) {
+                return !is_decompression_multiply(node);
+            });
+        }
 
         const bool keep_precision_sensitive_in_fp32_1 = true;
         const bool convert_input_output_precision = false;
@@ -690,6 +688,12 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         auto lptPassConfig = lptManager.get_pass_config();
         // quantized LSTMSequence / GRUSequence are not supported yet. Avoid extra transformation
         lptPassConfig->disable<RecurrentCellTransformation>();
+        lptPassConfig->set_callback<MarkupPrecisions>([](const_node_ptr& node) -> bool {
+            if (const auto multiply = std::dynamic_pointer_cast<const ov::op::v1::Multiply>(node)) {
+                return !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(multiply);
+            }
+            return false;
+        });
         lptPassConfig->set_callback<ConvolutionBackpropDataTransformation>([func, defaultPrecisions](const_node_ptr& node) -> bool {
             auto fillStaticChannel = [func](const ov::PartialShape& shape, size_t& channel) -> bool {
                 const auto rank = shape.rank();
@@ -726,43 +730,6 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
                || WeightableLayerTransformation::isAsymmetricOnWeights(node, defaultPrecisions);
         });
 
-        lptPassConfig->set_callback<TransposeTransformation>([&](const_node_ptr& node) -> bool {
-            for (auto& user : node->get_users()) {
-                if (ov::is_type<ov::op::v13::ScaledDotProductAttention>(user))
-                    return true;
-            }
-
-            return false;
-        });
-
-        lptPassConfig->set_callback<MarkupPrecisions>([](const_node_ptr& node) -> bool {
-            return ov::is_type<ov::op::v1::Multiply>(node) && !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(node);
-        });
-
-        lptPassConfig->set_callback<AddTransformation>([](const_node_ptr& node) -> bool {
-            return ov::marked_as_bias(node);
-        });
-        lptPassConfig->set_callback<FoldConvertTransformation>([](const_node_ptr& node) -> bool {
-            const auto& consumers = node->get_output_target_inputs(0);
-            if (consumers.size() == 1) {
-                const auto consumer = consumers.begin()->get_node()->shared_from_this();
-                return ov::is_type<ov::op::v1::Multiply>(consumer) && is_decompression_multiply(consumer);
-            }
-            return false;
-        });
-        lptPassConfig->set_callback<FuseConvertTransformation>([](const_node_ptr& node) -> bool {
-            if (ov::is_type<ov::op::v1::Multiply>(node)) {
-                return ov::is_type<ov::op::v1::Multiply>(node) && is_decompression_multiply(node);
-            } else if (ov::is_type<ov::op::v1::Subtract>(node)) {
-                const auto& consumers = node->get_output_target_inputs(0);
-                if (consumers.size() == 1) {
-                    const auto consumer = consumers.begin()->get_node()->shared_from_this();
-                    return ov::is_type<ov::op::v1::Multiply>(consumer) && is_decompression_multiply(consumer);
-                }
-            }
-            return false;
-        });
-
         lptPassConfig->set_callback<MultiplyToGroupConvolutionTransformation>([&](const_node_ptr& node) -> bool {
             // disable MultiplyToGroupConvolution if Multiply with Constant can be fused
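
Not part of the diff above: for reviewers less familiar with how a per-pass callback gates a transformation, below is a minimal standalone sketch of the pattern these hunks rely on. It is a simplified stand-in, not the OpenVINO API: `PassConfig`, `Node`, and `MarkDequantization` are defined inside the snippet itself. The only behavior it mirrors is that a callback registered for a pass and returning `true` causes that pass to skip the node, which is why the diff's callback returns `!is_decompression_multiply(node)` to restrict the pass to decompression multiplies.

```cpp
// Minimal standalone sketch (NOT the OpenVINO API) of per-pass callback gating:
// a pass looks up an optional callback registered for its type, and a callback
// that returns true means "skip the transformation for this node".
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <typeindex>
#include <unordered_map>
#include <vector>

struct Node {
    std::string type;           // e.g. "Multiply"
    bool decompression = false; // stands in for is_decompression_multiply()
};
using const_node_ptr = std::shared_ptr<const Node>;

class PassConfig {
public:
    using Callback = std::function<bool(const const_node_ptr&)>;

    template <typename PassT>
    void set_callback(Callback cb) {
        callbacks_[typeid(PassT)] = std::move(cb);
    }

    // True when a callback is registered for PassT and rejects the node.
    template <typename PassT>
    bool skip(const const_node_ptr& node) const {
        auto it = callbacks_.find(typeid(PassT));
        return it != callbacks_.end() && it->second(node);
    }

private:
    std::unordered_map<std::type_index, Callback> callbacks_;
};

struct MarkDequantization {}; // tag type standing in for a real pass

int main() {
    PassConfig cfg;
    // Mirrors the callback in the diff: keep the pass away from nodes that
    // are NOT decompression multiplies (returning true skips the node).
    cfg.set_callback<MarkDequantization>([](const const_node_ptr& n) {
        return !(n->type == "Multiply" && n->decompression);
    });

    std::vector<const_node_ptr> graph = {
        std::make_shared<Node>(Node{"Multiply", true}),
        std::make_shared<Node>(Node{"Multiply", false}),
        std::make_shared<Node>(Node{"Add", false}),
    };
    for (const auto& n : graph)
        std::cout << n->type
                  << (cfg.skip<MarkDequantization>(n) ? ": skipped\n" : ": transformed\n");
}
```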