diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp
index b1a2fb004f581..a005d63034322 100644
--- a/ggml/src/ggml-qnn/backend-ops.cpp
+++ b/ggml/src/ggml-qnn/backend-ops.cpp
@@ -401,8 +401,9 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context * ctx, const ggml_
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q4_0:
             if (!(ctx->supported_types & (uint64_t(1) << tensor->type))) {
-                QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x\n", qnn::get_backend_name(ctx->device),
-                              ggml_type_name(tensor->type), (unsigned int) ctx->supported_types);
+                QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x\n",
+                              qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type),
+                              (unsigned int) ctx->supported_types);
                 return false;
             }
             break;
@@ -455,16 +456,8 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const gg
                 QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]tensor size is too large\n");
                 return false;
             }
-            // fall through, from test here, the convert op is super slow on NPU:
-            //   https://github.com/usefulsensors/qc_npu_benchmark
-        case QNN_BACKEND_GPU:
-            if (src0->type != src1->type || src0->type != op->type) {
-                // there's no convert op for GPU.
-                QNN_LOG_DEBUG("[qnn-gpu][MUL_MAT]type src0(%s), src1(%s) and op(%s) are not equal\n",
-                              ggml_type_name(src0->type), ggml_type_name(src1->type), ggml_type_name(op->type));
-                return false;
-            }
             break;
+        case QNN_BACKEND_GPU:
         default:
             break;
     }
@@ -503,9 +496,9 @@ bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor
 #ifndef NDEBUG
         std::string tensor_dims;
         append_tensor_dimensions(op, tensor_dims);
-        QNN_LOG_DEBUG("[%s][%s]unsupported tensor(%s), support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device),
-                      ggml_op_name(op->op), tensor_dims.c_str(), ctx->supported_op_count.load(),
-                      ctx->unsupported_op_count.load());
+        QNN_LOG_DEBUG("[%s][%s]unsupported tensor(%s), support/unsupported: %d/%d\n",
+                      qnn::get_backend_name(ctx->device), ggml_op_name(op->op), tensor_dims.c_str(),
+                      ctx->supported_op_count.load(), ctx->unsupported_op_count.load());
 #endif
         return false;
     }
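Net effect of the backend-ops.cpp hunks: the two `QNN_LOG_DEBUG` calls are merely rewrapped, while `ggml_qnn_supports_matmul_op` stops rejecting GPU matmuls whose `src0`/`src1`/`op` types differ. The `QNN_BACKEND_GPU` case now falls through to `default` (i.e. "supported"), since type mismatches are handled by the cast nodes built in `create_convert_nodes` (see the op-config-impl.cpp diff below); the NPU case likewise no longer falls through into the GPU branch and ends with its own `break`.

A self-contained toy of the revised dispatch shape; every name here (`device_t`, `npu_tensor_too_large`, the size limit) is a hypothetical stand-in, not code from this PR:

```cpp
#include <cstdio>

enum device_t { NPU, GPU, CPU };

// Hypothetical placeholder for the NPU-specific size check in the real code.
static bool npu_tensor_too_large(long nbytes) { return nbytes > (1L << 30); }

static bool supports_matmul(device_t dev, long nbytes) {
    switch (dev) {
        case NPU:
            if (npu_tensor_too_large(nbytes)) {
                return false;  // NPU keeps its device-specific rejection
            }
            break;             // no longer falls through into the GPU case
        case GPU:              // GPU joins the default path: mixed types are
        default:               // now left to the cast nodes in the graph
            break;
    }
    return true;
}

int main() {
    std::printf("NPU oversized: %d\n", supports_matmul(NPU, 2L << 30));  // 0
    std::printf("GPU oversized: %d\n", supports_matmul(GPU, 2L << 30));  // 1
    return 0;
}
```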
diff --git a/ggml/src/ggml-qnn/op-config-impl.cpp b/ggml/src/ggml-qnn/op-config-impl.cpp
index c6f94aa2a89ac..3778beab4b61d 100644
--- a/ggml/src/ggml-qnn/op-config-impl.cpp
+++ b/ggml/src/ggml-qnn/op-config-impl.cpp
@@ -300,11 +300,6 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic
 bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
                                                      qnn_tensor_array_t & tensor_inputs,
                                                      qnn_tensor_array_t & tensor_outputs) {
-    if (device == QNN_BACKEND_GPU) {
-        // there's no convert op for GPU, so we should create matmul nodes directly.
-        return true;
-    }
-
     // create tensors for convert node
     auto tensor_type = get_tensor_type(tensor_inputs);
     QNN_LOG_DEBUG("input tensor type: %s\n", qnn_datatype_to_string(tensor_type));
@@ -321,7 +316,7 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap
                                                              convert_in->get_dimensions(), tensor_type, rank, device,
                                                              graph_handle, _qnn_instance);
         auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
-                                        QNN_OP_CONVERT, _qnn_instance);
+                                        QNN_OP_CAST, _qnn_instance);
         convert->set_input_tensors({ convert_in });
         convert->set_output_tensors({ convert_out });
         tensor_inputs[i] = convert_out;
@@ -336,7 +331,7 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap
                                                             convert_out->get_dimensions(), tensor_type, rank, device,
                                                             graph_handle, _qnn_instance);
         auto output_convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
-                                               QNN_OP_CONVERT, _qnn_instance);
+                                               QNN_OP_CAST, _qnn_instance);
         output_convert->set_input_tensors({ convert_in });
         output_convert->set_output_tensors({ convert_out });
         tensor_outputs.front() = convert_in;
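The op-config-impl.cpp hunks make the same change from the graph-builder side: the GPU early-return in `create_convert_nodes` is gone, and the generated nodes use `QNN_OP_CAST` instead of `QNN_OP_CONVERT`. In the QTI AISW op package these macros are string names for the `Convert` and `Cast` ops; per the comment this patch removes, `Convert` was unavailable on GPU, whereas `Cast` performs a plain element-wise datatype conversion (the target type is taken from the output tensor), so the GPU can now share the conversion path instead of bypassing it.

For orientation, a minimal sketch of the raw QNN op config that one such node boils down to, assuming the `Qnn_OpConfig_t` v1 layout from the QNN SDK headers; `make_cast_op`, `cast_in` and `cast_out` are hypothetical names, not code from this PR:

```cpp
#include "QnnOpDef.h"  // QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CAST ("Cast")
#include "QnnTypes.h"  // Qnn_OpConfig_t, Qnn_Tensor_t

// Describe a single Cast node: one input, one output, no parameters; the
// cast target type is carried by the output tensor's data type.
static Qnn_OpConfig_t make_cast_op(const char * name, Qnn_Tensor_t * cast_in, Qnn_Tensor_t * cast_out) {
    Qnn_OpConfig_t op   = QNN_OPCONFIG_INIT;
    op.version          = QNN_OPCONFIG_VERSION_1;
    op.v1.name          = name;
    op.v1.packageName   = QNN_OP_PACKAGE_NAME_QTI_AISW;
    op.v1.typeName      = QNN_OP_CAST;  // where QNN_OP_CONVERT was used before this patch
    op.v1.numOfParams   = 0;
    op.v1.params        = nullptr;
    op.v1.numOfInputs   = 1;
    op.v1.inputTensors  = cast_in;
    op.v1.numOfOutputs  = 1;
    op.v1.outputTensors = cast_out;
    return op;  // registered on the graph via the interface's graphAddNode()
}
```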