diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp
index b1a2fb004f581..a005d63034322 100644
--- a/ggml/src/ggml-qnn/backend-ops.cpp
+++ b/ggml/src/ggml-qnn/backend-ops.cpp
@@ -401,8 +401,9 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context * ctx, const ggml_
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q4_0:
             if (!(ctx->supported_types & (uint64_t(1) << tensor->type))) {
-                QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x\n", qnn::get_backend_name(ctx->device),
-                              ggml_type_name(tensor->type), (unsigned int) ctx->supported_types);
+                QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x\n",
+                              qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type),
+                              (unsigned int) ctx->supported_types);
                 return false;
             }
             break;
@@ -455,16 +456,8 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const gg
                 QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]tensor size is too large\n");
                 return false;
             }
-            // fall through, from test here, the convert op is super slow on NPU:
-            //   https://github.com/usefulsensors/qc_npu_benchmark
-        case QNN_BACKEND_GPU:
-            if (src0->type != src1->type || src0->type != op->type) {
-                // there's no convert op for GPU.
-                QNN_LOG_DEBUG("[qnn-gpu][MUL_MAT]type src0(%s), src1(%s) and op(%s) are not equal\n",
-                              ggml_type_name(src0->type), ggml_type_name(src1->type), ggml_type_name(op->type));
-                return false;
-            }
             break;
+        case QNN_BACKEND_GPU:
         default:
             break;
     }
@@ -503,9 +496,9 @@ bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor
 #ifndef NDEBUG
         std::string tensor_dims;
         append_tensor_dimensions(op, tensor_dims);
-        QNN_LOG_DEBUG("[%s][%s]unsupported tensor(%s), support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device),
-                      ggml_op_name(op->op), tensor_dims.c_str(), ctx->supported_op_count.load(),
-                      ctx->unsupported_op_count.load());
+        QNN_LOG_DEBUG("[%s][%s]unsupported tensor(%s), support/unsupported: %d/%d\n",
+                      qnn::get_backend_name(ctx->device), ggml_op_name(op->op), tensor_dims.c_str(),
+                      ctx->supported_op_count.load(), ctx->unsupported_op_count.load());
 #endif
         return false;
     }
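Net effect of the backend-ops.cpp hunks: the two `QNN_LOG_DEBUG` calls are merely rewrapped, while `ggml_qnn_supports_matmul_op` stops rejecting GPU matmuls whose `src0`/`src1`/`op` types differ. The `QNN_BACKEND_GPU` case now falls through to `default` (i.e. "supported"), since type mismatches are handled by the cast nodes built in `create_convert_nodes` (see the op-config-impl.cpp diff below); the NPU case likewise no longer falls through into the GPU branch and ends with its own `break`.

A self-contained toy of the revised dispatch shape; every name here (`device_t`, `npu_tensor_too_large`, the size limit) is a hypothetical stand-in, not code from this PR:

```cpp
#include <cstdio>

enum device_t { NPU, GPU, CPU };

// Hypothetical placeholder for the NPU-specific size check in the real code.
static bool npu_tensor_too_large(long nbytes) { return nbytes > (1L << 30); }

static bool supports_matmul(device_t dev, long nbytes) {
    switch (dev) {
        case NPU:
            if (npu_tensor_too_large(nbytes)) {
                return false;  // NPU keeps its device-specific rejection
            }
            break;             // no longer falls through into the GPU case
        case GPU:              // GPU joins the default path: mixed types are
        default:               // now left to the cast nodes in the graph
            break;
    }
    return true;
}

int main() {
    std::printf("NPU oversized: %d\n", supports_matmul(NPU, 2L << 30));  // 0
    std::printf("GPU oversized: %d\n", supports_matmul(GPU, 2L << 30));  // 1
    return 0;
}
```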
diff --git a/ggml/src/ggml-qnn/op-config-impl.cpp b/ggml/src/ggml-qnn/op-config-impl.cpp
index c6f94aa2a89ac..3778beab4b61d 100644
--- a/ggml/src/ggml-qnn/op-config-impl.cpp
+++ b/ggml/src/ggml-qnn/op-config-impl.cpp
@@ -300,11 +300,6 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic
 bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
                                                      qnn_tensor_array_t & tensor_inputs,
                                                      qnn_tensor_array_t & tensor_outputs) {
-    if (device == QNN_BACKEND_GPU) {
-        // there's no convert op for GPU, so we should create matmul nodes directly.
-        return true;
-    }
-
     // create tensors for convert node
     auto tensor_type = get_tensor_type(tensor_inputs);
     QNN_LOG_DEBUG("input tensor type: %s\n", qnn_datatype_to_string(tensor_type));
@@ -321,7 +316,7 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap
                                                              convert_in->get_dimensions(), tensor_type, rank, device,
                                                              graph_handle, _qnn_instance);
         auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
-                                        QNN_OP_CONVERT, _qnn_instance);
+                                        QNN_OP_CAST, _qnn_instance);
         convert->set_input_tensors({ convert_in });
         convert->set_output_tensors({ convert_out });
         tensor_inputs[i] = convert_out;
@@ -336,7 +331,7 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap
                                                             convert_out->get_dimensions(), tensor_type, rank, device,
                                                             graph_handle, _qnn_instance);
         auto output_convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
-                                               QNN_OP_CONVERT, _qnn_instance);
+                                               QNN_OP_CAST, _qnn_instance);
         output_convert->set_input_tensors({ convert_in });
         output_convert->set_output_tensors({ convert_out });
         tensor_outputs.front() = convert_in;
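The op-config-impl.cpp hunks make the same change from the graph-builder side: the GPU early-return in `create_convert_nodes` is gone, and the generated nodes use `QNN_OP_CAST` instead of `QNN_OP_CONVERT`. In the QTI AISW op package these macros are string names for the `Convert` and `Cast` ops; per the comment this patch removes, `Convert` was unavailable on GPU, whereas `Cast` performs a plain element-wise datatype conversion (the target type is taken from the output tensor), so the GPU can now share the conversion path instead of bypassing it.

For orientation, a minimal sketch of the raw QNN op config that one such node boils down to, assuming the `Qnn_OpConfig_t` v1 layout from the QNN SDK headers; `make_cast_op`, `cast_in` and `cast_out` are hypothetical names, not code from this PR:

```cpp
#include "QnnOpDef.h"  // QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CAST ("Cast")
#include "QnnTypes.h"  // Qnn_OpConfig_t, Qnn_Tensor_t

// Describe a single Cast node: one input, one output, no parameters; the
// cast target type is carried by the output tensor's data type.
static Qnn_OpConfig_t make_cast_op(const char * name, Qnn_Tensor_t * cast_in, Qnn_Tensor_t * cast_out) {
    Qnn_OpConfig_t op   = QNN_OPCONFIG_INIT;
    op.version          = QNN_OPCONFIG_VERSION_1;
    op.v1.name          = name;
    op.v1.packageName   = QNN_OP_PACKAGE_NAME_QTI_AISW;
    op.v1.typeName      = QNN_OP_CAST;  // where QNN_OP_CONVERT was used before this patch
    op.v1.numOfParams   = 0;
    op.v1.params        = nullptr;
    op.v1.numOfInputs   = 1;
    op.v1.inputTensors  = cast_in;
    op.v1.numOfOutputs  = 1;
    op.v1.outputTensors = cast_out;
    return op;  // registered on the graph via the interface's graphAddNode()
}
```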