diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py index 34efb1d7a1624..5e0ad2796a16c 100644 --- a/python/tvm/relay/op/contrib/cmsisnn.py +++ b/python/tvm/relay/op/contrib/cmsisnn.py @@ -123,6 +123,16 @@ def check_qnn_conv2d(pattern): kernel_zp = conv2d.args[3].data.numpy() kernel_zp = [kernel_zp] if kernel_zp.ndim == 0 else kernel_zp + # check if depthwise Conv2D + kernel_layout = conv2d.attrs.kernel_layout + pos_o = kernel_layout.index("O") + groups = conv2d.attrs.groups + is_depthwise = False + if groups == int(conv2d_input.checked_type.shape[3]) and groups == int( + conv2d_weight.checked_type.shape[pos_o] + ): + is_depthwise = True + return ( conv2d.attrs.out_dtype == "int32" and conv2d.attrs.padding[2] == 0 @@ -132,6 +142,7 @@ def check_qnn_conv2d(pattern): and pattern.checked_type.dtype == "int8" and bias_dtype == "int32" and all([zp == 0 for zp in kernel_zp]) + and (not is_depthwise or bias_add is not None) ) def binary_op_pattern(op): diff --git a/src/relay/backend/contrib/cmsisnn/generate_constants.cc b/src/relay/backend/contrib/cmsisnn/generate_constants.cc index 0231e8b521178..0a94f348b55fb 100644 --- a/src/relay/backend/contrib/cmsisnn/generate_constants.cc +++ b/src/relay/backend/contrib/cmsisnn/generate_constants.cc @@ -105,11 +105,25 @@ class GenerateConstantsMutator : public MixedModeMutator { conv2d_call = requantize_input; } - // Transpose weights: HWIO -> OHWI auto* conv2d_attrs = conv2d_call->attrs.as(); - tvm::Attrs new_conv2d_attrs; - Expr transposed_kernel = - ConvertKernelLayout(conv2d_call->args[1], conv2d_attrs, &new_conv2d_attrs); + tvm::Attrs new_conv2d_attrs = conv2d_call->attrs; + Expr conv2d_kernel = conv2d_call->args[1]; + + bool is_depthwise = false; + Array input_shape = conv2d_call->args[0]->type_as()->shape; + Array kernel_shape = conv2d_call->args[1]->type_as()->shape; + std::string kernel_layout = conv2d_attrs->kernel_layout.c_str(); + int kernel_pos_o = kernel_layout.find("O"); + int groups = conv2d_attrs->groups; + if (groups == qnn::get_const_int(input_shape[3]) && + groups == qnn::get_const_int(kernel_shape[kernel_pos_o])) { + is_depthwise = true; + } + + // Transpose weights: HWIO -> OHWI for Conv2D + if (!is_depthwise) { + conv2d_kernel = ConvertKernelLayout(conv2d_call->args[1], conv2d_attrs, &new_conv2d_attrs); + } // Obtain input and output scales from Relay's Requantization int64_t out_channels = conv2d_attrs->channels.as()->value; @@ -153,11 +167,11 @@ class GenerateConstantsMutator : public MixedModeMutator { req_inp_scale = Constant(req_inp_scale_nda); } - // Replace existing weights (HWIO) with the transposed ones (OHWI) + // Replace existing weights (HWIO) with the transposed ones (OHWI) for Conv2D // Substitute Conv2D weight_zero_point with the CMSIS-NN multiplier // Substitute Requantize input_zero_point with CMSIS-NN shift // Conv2D arguments: data, weight, input_zp, weight_zp, input_sc, weight_sc - Array conv2d_args = {conv2d_call->args[0], transposed_kernel, conv2d_call->args[2], + Array conv2d_args = {conv2d_call->args[0], conv2d_kernel, conv2d_call->args[2], multiplier_const, conv2d_call->args[4], weight_scale}; Call ret_call = Call(conv2d_call->op, conv2d_args, new_conv2d_attrs, {}); if (bias_add_call) { diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc index 1b639dd36e9d5..668352700805a 100644 --- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc +++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc @@ -146,6 +146,9 @@ class RelayToTIRVisitor : public MixedModeMutator { int32_t padding_h = qnn::get_const_int(conv2d_attrs->padding[0]); int32_t dilation_w = qnn::get_const_int(conv2d_attrs->dilation[1]); int32_t dilation_h = qnn::get_const_int(conv2d_attrs->dilation[0]); + int32_t out_channels = qnn::get_const_int(conv2d_attrs->channels); + int32_t groups = conv2d_attrs->groups; + std::string kernel_layout = conv2d_attrs->kernel_layout.c_str(); int32_t clip_min, clip_max; if (clip_call) { const ClipAttrs* clip_attrs = clip_call->attrs.as(); @@ -156,14 +159,6 @@ class RelayToTIRVisitor : public MixedModeMutator { clip_max = 127; } - tvm::Array call_ext_args = {tir::StringImm("arm_convolve_wrapper_s8"), input, filter, - multiplier}; - if (bias_add_call) { - call_ext_args.push_back(bias); - } - call_ext_args.push_back(shift); - call_ext_args.push_back(output); - tvm::Array scalar_args = {ToArg(input_offset), ToArg(output_offset), ToArg(stride_w), ToArg(stride_h), ToArg(padding_w), ToArg(padding_h), ToArg(dilation_w), ToArg(dilation_h), ToArg(clip_min), @@ -173,18 +168,42 @@ class RelayToTIRVisitor : public MixedModeMutator { Array input_shape = conv2d_call->args[0]->type_as()->shape; Array input_dims = CMSISNNDimensions(input_shape); - // cmsis_nn_dims *filter_dims (OHWI) + // cmsis_nn_dims *filter_dims (OHWI for Conv2D and IHWO for depthwise) Array filter_shape = conv2d_call->args[1]->type_as()->shape; Array filter_dims = CMSISNNDimensions(filter_shape); - // cmsis_nn_dims *bias_dims (1,1,1,output_channels) - Array bias_shape{1, 1, 1, filter_shape[0]}; + // cmsis_nn_dims *bias_dims + Array bias_shape{1, 1, 1, out_channels}; Array bias_dims = CMSISNNDimensions(bias_shape); - // cmsis_nn_dims *output_dims (NHWC) + // cmsis_nn_dims *output_dims (same order as input_dims) Array output_shape = conv2d_call->type_as()->shape; Array output_dims = CMSISNNDimensions(output_shape); + int32_t depth_multiplier = -1; + int kernel_pos_o = kernel_layout.find("O"); + if (groups == qnn::get_const_int(input_shape[3]) && + groups == qnn::get_const_int(filter_shape[kernel_pos_o])) { + int kernel_pos_i = kernel_layout.find("I"); + depth_multiplier = qnn::get_const_int(filter_shape[kernel_pos_i]); + } + scalar_args.push_back(ToArg(depth_multiplier)); + + // original filter_layout for depthwise is HWOI + std::string cmsisnn_api = "arm_convolve_wrapper_s8"; + if (depth_multiplier != -1) { + cmsisnn_api = "arm_depthwise_conv_wrapper_s8"; + Array depthwise_filter_shape{1, filter_shape[0], filter_shape[1], out_channels}; + filter_dims = CMSISNNDimensions(depthwise_filter_shape); + } + + tvm::Array call_ext_args = {tir::StringImm(cmsisnn_api), input, filter, multiplier}; + if (bias_add_call) { + call_ext_args.push_back(bias); + } + call_ext_args.push_back(shift); + call_ext_args.push_back(output); + // https://github.com/ARM-software/CMSIS_5/blob/d788fd583984388553391de18afd8b4d2a146868/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c#L367 std::string context_buffer_name = "NULL"; size_t context_buffer_size = diff --git a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc index bf56770257f74..95d0519db67ba 100644 --- a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc +++ b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc @@ -69,7 +69,8 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost { if (cmsis_func_name == "arm_softmax_s8" || cmsis_func_name == "arm_elementwise_mul_s8" || cmsis_func_name == "arm_elementwise_add_s8") { CodeGenC::VisitExpr_(op, os); - } else if (cmsis_func_name == "arm_convolve_wrapper_s8") { + } else if (cmsis_func_name == "arm_convolve_wrapper_s8" || + cmsis_func_name == "arm_depthwise_conv_wrapper_s8") { EmitConv2D(op); } return; @@ -87,8 +88,12 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost { std::string EmitCMSISNNConvParams(std::ostream& os, int32_t input_offset, int32_t output_offset, int32_t stride_w, int32_t stride_h, int32_t padding_w, int32_t padding_h, int32_t dilation_w, int32_t dilation_h, - int32_t clip_min, int32_t clip_max) { - std::string struct_name = "conv_params"; + int32_t clip_min, int32_t clip_max, int32_t depth_multiplier) { + std::string struct_name = "cmsis_nn_conv_params"; + std::string instance_name = "conv_params"; + if (depth_multiplier != -1) { + struct_name = "cmsis_nn_dw_conv_params"; + } PrintIndent(); os << "cmsis_nn_tile stride = {" << stride_w << "," << stride_h << "};\n"; PrintIndent(); @@ -98,9 +103,12 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost { PrintIndent(); os << "cmsis_nn_activation activation = {" << clip_min << "," << clip_max << "};\n"; PrintIndent(); - os << "cmsis_nn_conv_params " << struct_name << " = {" << input_offset << ", " << output_offset - << ", stride, padding, dilation, activation};\n"; - return struct_name; + os << struct_name << " " << instance_name << " = {" << input_offset << ", " << output_offset; + if (depth_multiplier != -1) { + os << ", " << depth_multiplier; + } + os << ", stride, padding, dilation, activation};\n"; + return instance_name; } /*! * \brief Emits cmsis_nn_per_channel_quant_params struct */ @@ -125,7 +133,7 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost { /*! * \brief Emits CMSIS-NN APIs for every call_extern */ void EmitConv2D(const CallNode* op) { - static const int max_num_args = 35; + static const int max_num_args = 36; std::string cmsis_func_name = op->args[0].as()->value; bool bias_enabled = false; @@ -143,7 +151,7 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost { std::string input_data = get_var_name(op, ++arg_id); std::string filter_data = get_var_name(op, ++arg_id); std::string multiplier = get_var_name(op, ++arg_id); - std::string bias_data("0x0"); + std::string bias_data("NULL"); if (bias_enabled) { bias_data = get_var_name(op, ++arg_id); } @@ -162,6 +170,7 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost { int dilation_h = get_arg_value(op, ++arg_id); int clip_min = get_arg_value(op, ++arg_id); int clip_max = get_arg_value(op, ++arg_id); + int depth_multiplier = get_arg_value(op, ++arg_id); int input_n = get_arg_value(op, ++arg_id); int input_h = get_arg_value(op, ++arg_id); int input_w = get_arg_value(op, ++arg_id); @@ -180,9 +189,9 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost { int output_c = get_arg_value(op, ++arg_id); std::string context = EmitCMSISNNContext(stream, context_buffer_name, context_buffer_size); - std::string conv_params = - EmitCMSISNNConvParams(stream, input_offset, output_offset, stride_w, stride_h, padding_w, - padding_h, dilation_w, dilation_h, clip_min, clip_max); + std::string conv_params = EmitCMSISNNConvParams( + stream, input_offset, output_offset, stride_w, stride_h, padding_w, padding_h, dilation_w, + dilation_h, clip_min, clip_max, depth_multiplier); std::string quant_params = EmitCMSISNNPerChannelQuantParams(stream, multiplier, shift); std::string input_dim = EmitCMSISNNDims(stream, "input", input_n, input_h, input_w, input_c); std::string filter_dim = diff --git a/tests/python/contrib/test_cmsisnn/test_conv2d.py b/tests/python/contrib/test_cmsisnn/test_conv2d.py index c68dfb573839d..df6ae2edabc7a 100644 --- a/tests/python/contrib/test_cmsisnn/test_conv2d.py +++ b/tests/python/contrib/test_cmsisnn/test_conv2d.py @@ -71,21 +71,20 @@ def make_model( p = (0, 0, 0, 0) if padding == "SAME": p = get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides) - a = relay.nn.pad( - a, + invar = relay.nn.pad( + invar, pad_width=[(0, 0), (p[0], p[2]), (p[1], p[3]), (0, 0)], pad_value=input_zero_point, pad_mode="constant", ) shape = (shape[0], shape[1] + p[0] + p[2], shape[2] + p[1] + p[3], shape[3]) - weight_shape = (kernel_h, kernel_w, shape[3] // groups, out_channels) rng = np.random.default_rng(12321) w = tvm.nd.array( rng.integers( np.iinfo(kernel_dtype).min, high=np.iinfo(kernel_dtype).max, - size=weight_shape, + size=kernel_shape, dtype=kernel_dtype, ) ) @@ -129,8 +128,11 @@ def make_model( @pytest.mark.parametrize("kernel_size", [(3, 3)]) @pytest.mark.parametrize("padding", ["SAME", "VALID"]) @pytest.mark.parametrize("strides, dilation", [((2, 2), (1, 1)), ((1, 1), (1, 1))]) -@pytest.mark.parametrize("enable_bias", [True, False]) @pytest.mark.parametrize("relu_type", ["NONE", "RELU"]) +@pytest.mark.parametrize( + "conv_type, depth_multiplier, enable_bias", + [("conv2d", 1, True), ("conv2d", 1, False), ("depthwise", 1, True), ("depthwise", 3, True)], +) @pytest.mark.parametrize( "input_zero_point, input_scale, kernel_scale, out_channels", [(10, 0.0128, [0.11, 0.22], 2), (-64, 1, [1, 0.0256, 1.37], 3)], @@ -147,27 +149,32 @@ def test_op_int8( input_scale, kernel_scale, out_channels, + conv_type, + depth_multiplier, ): interface_api = "c" use_unpacked_api = True test_runner = AOT_CORSTONE300_RUNNER - kernel_zero_point = 0 + dtype = "int8" groups = 1 weight_format = "HWIO" kernel_h = kernel_size[0] kernel_w = kernel_size[1] - dtype = "int8" + kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) + kernel_zero_point = 0 in_min, in_max = get_range_for_dtype_str(dtype) - weight_shape = None - if weight_format == "HWIO": - weight_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) - else: - weight_shape = (kernel_h, kernel_w, ifm_shape[3], out_channels) + if conv_type == "depthwise": + groups = ifm_shape[3] + weight_format = "HWOI" + kernel_shape = (kernel_h, kernel_w, ifm_shape[3], depth_multiplier) + out_channels = ifm_shape[3] * depth_multiplier + ks_len = len(kernel_scale) + kernel_scale = [kernel_scale[i % ks_len] for i in range(out_channels)] output_scale, output_zero_point = get_conv2d_qnn_params( - weight_shape, + kernel_shape, input_scale, input_zero_point, kernel_scale, @@ -175,12 +182,12 @@ def test_op_int8( dtype, dtype, dtype, - False, + conv_type == "depthwise", ) model, params = make_model( ifm_shape, - weight_shape, + kernel_shape, input_zero_point, input_scale, kernel_zero_point, diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py index 7af2a0ee99a3d..2ef84d7f1a6f8 100644 --- a/tests/python/driver/tvmc/test_compiler.py +++ b/tests/python/driver/tvmc/test_compiler.py @@ -416,7 +416,7 @@ def test_compile_tflite_module_with_external_codegen_cmsisnn( for name in mlf_package.getnames() if re.match(r"\./codegen/host/src/\D+\d+\.c", name) ] - assert len(c_source_files) == 5 + assert len(c_source_files) == 3 @pytest.mark.skipif(