diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py
index 34efb1d7a1624..5e0ad2796a16c 100644
--- a/python/tvm/relay/op/contrib/cmsisnn.py
+++ b/python/tvm/relay/op/contrib/cmsisnn.py
@@ -123,6 +123,16 @@ def check_qnn_conv2d(pattern):
         kernel_zp = conv2d.args[3].data.numpy()
         kernel_zp = [kernel_zp] if kernel_zp.ndim == 0 else kernel_zp
 
+        # depthwise Conv2D: groups equals input dim 3 (channels, assuming NHWC) and kernel "O" dim
+        kernel_layout = conv2d.attrs.kernel_layout
+        pos_o = kernel_layout.index("O")
+        groups = conv2d.attrs.groups
+        is_depthwise = False
+        if groups == int(conv2d_input.checked_type.shape[3]) and groups == int(
+            conv2d_weight.checked_type.shape[pos_o]
+        ):
+            is_depthwise = True
+
         return (
             conv2d.attrs.out_dtype == "int32"
             and conv2d.attrs.padding[2] == 0
@@ -132,6 +142,7 @@ def check_qnn_conv2d(pattern):
             and pattern.checked_type.dtype == "int8"
             and bias_dtype == "int32"
             and all([zp == 0 for zp in kernel_zp])
+            and (not is_depthwise or bias_add is not None)
         )
 
     def binary_op_pattern(op):
diff --git a/src/relay/backend/contrib/cmsisnn/generate_constants.cc b/src/relay/backend/contrib/cmsisnn/generate_constants.cc
index 0231e8b521178..0a94f348b55fb 100644
--- a/src/relay/backend/contrib/cmsisnn/generate_constants.cc
+++ b/src/relay/backend/contrib/cmsisnn/generate_constants.cc
@@ -105,11 +105,25 @@ class GenerateConstantsMutator : public MixedModeMutator {
       conv2d_call = requantize_input;
     }
 
-    // Transpose weights: HWIO -> OHWI
     auto* conv2d_attrs = conv2d_call->attrs.as<Conv2DAttrs>();
-    tvm::Attrs new_conv2d_attrs;
-    Expr transposed_kernel =
-        ConvertKernelLayout(conv2d_call->args[1], conv2d_attrs, &new_conv2d_attrs);
+    tvm::Attrs new_conv2d_attrs = conv2d_call->attrs;
+    Expr conv2d_kernel = conv2d_call->args[1];
+
+    bool is_depthwise = false;
+    Array<PrimExpr> input_shape = conv2d_call->args[0]->type_as<TensorTypeNode>()->shape;
+    Array<PrimExpr> kernel_shape = conv2d_call->args[1]->type_as<TensorTypeNode>()->shape;
+    std::string kernel_layout = conv2d_attrs->kernel_layout.c_str();
+    int kernel_pos_o = kernel_layout.find("O");
+    int groups = conv2d_attrs->groups;
+    if (groups == qnn::get_const_int(input_shape[3]) &&
+        groups == qnn::get_const_int(kernel_shape[kernel_pos_o])) {
+      is_depthwise = true;
+    }
+
+    // Transpose weights: HWIO -> OHWI for Conv2D; depthwise kernels keep their original layout
+    if (!is_depthwise) {
+      conv2d_kernel = ConvertKernelLayout(conv2d_call->args[1], conv2d_attrs, &new_conv2d_attrs);
+    }
 
     // Obtain input and output scales from Relay's Requantization
     int64_t out_channels = conv2d_attrs->channels.as<IntImmNode>()->value;
@@ -153,11 +167,11 @@ class GenerateConstantsMutator : public MixedModeMutator {
       req_inp_scale = Constant(req_inp_scale_nda);
     }
 
-    // Replace existing weights (HWIO) with the transposed ones (OHWI)
+    // Replace existing weights (HWIO) with the transposed ones (OHWI) for Conv2D
     // Substitute Conv2D weight_zero_point with the CMSIS-NN multiplier
     // Substitute Requantize input_zero_point with CMSIS-NN shift
     // Conv2D arguments: data, weight, input_zp, weight_zp, input_sc, weight_sc
-    Array<Expr> conv2d_args = {conv2d_call->args[0], transposed_kernel,    conv2d_call->args[2],
+    Array<Expr> conv2d_args = {conv2d_call->args[0], conv2d_kernel,        conv2d_call->args[2],
                                multiplier_const,     conv2d_call->args[4], weight_scale};
     Call ret_call = Call(conv2d_call->op, conv2d_args, new_conv2d_attrs, {});
     if (bias_add_call) {
diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
index 1b639dd36e9d5..668352700805a 100644
--- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
+++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
@@ -146,6 +146,9 @@ class RelayToTIRVisitor : public MixedModeMutator {
     int32_t padding_h = qnn::get_const_int(conv2d_attrs->padding[0]);
     int32_t dilation_w = qnn::get_const_int(conv2d_attrs->dilation[1]);
     int32_t dilation_h = qnn::get_const_int(conv2d_attrs->dilation[0]);
+    int32_t out_channels = qnn::get_const_int(conv2d_attrs->channels);
+    int32_t groups = conv2d_attrs->groups;
+    std::string kernel_layout = conv2d_attrs->kernel_layout.c_str();
     int32_t clip_min, clip_max;
     if (clip_call) {
       const ClipAttrs* clip_attrs = clip_call->attrs.as<ClipAttrs>();
@@ -156,14 +159,6 @@ class RelayToTIRVisitor : public MixedModeMutator {
       clip_max = 127;
     }
 
-    tvm::Array<PrimExpr> call_ext_args = {tir::StringImm("arm_convolve_wrapper_s8"), input, filter,
-                                          multiplier};
-    if (bias_add_call) {
-      call_ext_args.push_back(bias);
-    }
-    call_ext_args.push_back(shift);
-    call_ext_args.push_back(output);
-
     tvm::Array<PrimExpr> scalar_args = {ToArg(input_offset), ToArg(output_offset), ToArg(stride_w),
                                         ToArg(stride_h),     ToArg(padding_w),     ToArg(padding_h),
                                         ToArg(dilation_w),   ToArg(dilation_h),    ToArg(clip_min),
@@ -173,18 +168,42 @@ class RelayToTIRVisitor : public MixedModeMutator {
     Array<PrimExpr> input_shape = conv2d_call->args[0]->type_as<TensorTypeNode>()->shape;
     Array<PrimExpr> input_dims = CMSISNNDimensions(input_shape);
 
-    // cmsis_nn_dims *filter_dims (OHWI)
+    // cmsis_nn_dims *filter_dims (OHWI for Conv2D and IHWO for depthwise)
     Array<PrimExpr> filter_shape = conv2d_call->args[1]->type_as<TensorTypeNode>()->shape;
     Array<PrimExpr> filter_dims = CMSISNNDimensions(filter_shape);
 
-    // cmsis_nn_dims *bias_dims (1,1,1,output_channels)
-    Array<PrimExpr> bias_shape{1, 1, 1, filter_shape[0]};
+    // cmsis_nn_dims *bias_dims (1,1,1,out_channels)
+    Array<PrimExpr> bias_shape{1, 1, 1, out_channels};
     Array<PrimExpr> bias_dims = CMSISNNDimensions(bias_shape);
 
-    // cmsis_nn_dims *output_dims (NHWC)
+    // cmsis_nn_dims *output_dims (same order as input_dims)
     Array<PrimExpr> output_shape = conv2d_call->type_as<TensorTypeNode>()->shape;
     Array<PrimExpr> output_dims = CMSISNNDimensions(output_shape);
 
+    int32_t depth_multiplier = -1;
+    int kernel_pos_o = kernel_layout.find("O");
+    if (groups == qnn::get_const_int(input_shape[3]) &&
+        groups == qnn::get_const_int(filter_shape[kernel_pos_o])) {
+      int kernel_pos_i = kernel_layout.find("I");
+      depth_multiplier = qnn::get_const_int(filter_shape[kernel_pos_i]);
+    }
+    scalar_args.push_back(ToArg(depth_multiplier));
+
+    // depthwise: original filter layout is HWOI; filter_dims become {1, H, W, out_channels}
+    std::string cmsisnn_api = "arm_convolve_wrapper_s8";
+    if (depth_multiplier != -1) {
+      cmsisnn_api = "arm_depthwise_conv_wrapper_s8";
+      Array<PrimExpr> depthwise_filter_shape{1, filter_shape[0], filter_shape[1], out_channels};
+      filter_dims = CMSISNNDimensions(depthwise_filter_shape);
+    }
+
+    tvm::Array<PrimExpr> call_ext_args = {tir::StringImm(cmsisnn_api), input, filter, multiplier};
+    if (bias_add_call) {
+      call_ext_args.push_back(bias);
+    }
+    call_ext_args.push_back(shift);
+    call_ext_args.push_back(output);
+
     // https://github.com/ARM-software/CMSIS_5/blob/d788fd583984388553391de18afd8b4d2a146868/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c#L367
     std::string context_buffer_name = "NULL";
     size_t context_buffer_size =
diff --git a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
index bf56770257f74..95d0519db67ba 100644
--- a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
+++ b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
@@ -69,7 +69,8 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
     if (cmsis_func_name == "arm_softmax_s8" || cmsis_func_name == "arm_elementwise_mul_s8" ||
         cmsis_func_name == "arm_elementwise_add_s8") {
       CodeGenC::VisitExpr_(op, os);
-    } else if (cmsis_func_name == "arm_convolve_wrapper_s8") {
+    } else if (cmsis_func_name == "arm_convolve_wrapper_s8" ||
+               cmsis_func_name == "arm_depthwise_conv_wrapper_s8") {
       EmitConv2D(op);
     }
     return;
@@ -87,8 +88,12 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
   std::string EmitCMSISNNConvParams(std::ostream& os, int32_t input_offset, int32_t output_offset,
                                     int32_t stride_w, int32_t stride_h, int32_t padding_w,
                                     int32_t padding_h, int32_t dilation_w, int32_t dilation_h,
-                                    int32_t clip_min, int32_t clip_max) {
-    std::string struct_name = "conv_params";
+                                    int32_t clip_min, int32_t clip_max, int32_t depth_multiplier) {
+    std::string struct_name = "cmsis_nn_conv_params";
+    std::string instance_name = "conv_params";
+    if (depth_multiplier != -1) {
+      struct_name = "cmsis_nn_dw_conv_params";
+    }
     PrintIndent();
     os << "cmsis_nn_tile stride = {" << stride_w << "," << stride_h << "};\n";
     PrintIndent();
@@ -98,9 +103,12 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
     PrintIndent();
     os << "cmsis_nn_activation activation = {" << clip_min << "," << clip_max << "};\n";
     PrintIndent();
-    os << "cmsis_nn_conv_params " << struct_name << " = {" << input_offset << ", " << output_offset
-       << ", stride, padding, dilation, activation};\n";
-    return struct_name;
+    os << struct_name << " " << instance_name << " = {" << input_offset << ", " << output_offset;
+    if (depth_multiplier != -1) {
+      os << ", " << depth_multiplier;
+    }
+    os << ", stride, padding, dilation, activation};\n";
+    return instance_name;
   }
 
   /*!  * \brief Emits cmsis_nn_per_channel_quant_params struct */
@@ -125,7 +133,7 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
 
   /*!  * \brief Emits CMSIS-NN APIs for every call_extern */
   void EmitConv2D(const CallNode* op) {
-    static const int max_num_args = 35;
+    static const int max_num_args = 36;
     std::string cmsis_func_name = op->args[0].as<StringImmNode>()->value;
 
     bool bias_enabled = false;
@@ -143,7 +151,7 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
     std::string input_data = get_var_name(op, ++arg_id);
     std::string filter_data = get_var_name(op, ++arg_id);
     std::string multiplier = get_var_name(op, ++arg_id);
-    std::string bias_data("0x0");
+    std::string bias_data("NULL");
     if (bias_enabled) {
       bias_data = get_var_name(op, ++arg_id);
     }
@@ -162,6 +170,7 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
     int dilation_h = get_arg_value(op, ++arg_id);
     int clip_min = get_arg_value(op, ++arg_id);
     int clip_max = get_arg_value(op, ++arg_id);
+    int depth_multiplier = get_arg_value(op, ++arg_id);
     int input_n = get_arg_value(op, ++arg_id);
     int input_h = get_arg_value(op, ++arg_id);
     int input_w = get_arg_value(op, ++arg_id);
@@ -180,9 +189,9 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
     int output_c = get_arg_value(op, ++arg_id);
 
     std::string context = EmitCMSISNNContext(stream, context_buffer_name, context_buffer_size);
-    std::string conv_params =
-        EmitCMSISNNConvParams(stream, input_offset, output_offset, stride_w, stride_h, padding_w,
-                              padding_h, dilation_w, dilation_h, clip_min, clip_max);
+    std::string conv_params = EmitCMSISNNConvParams(
+        stream, input_offset, output_offset, stride_w, stride_h, padding_w, padding_h, dilation_w,
+        dilation_h, clip_min, clip_max, depth_multiplier);
     std::string quant_params = EmitCMSISNNPerChannelQuantParams(stream, multiplier, shift);
     std::string input_dim = EmitCMSISNNDims(stream, "input", input_n, input_h, input_w, input_c);
     std::string filter_dim =
diff --git a/tests/python/contrib/test_cmsisnn/test_conv2d.py b/tests/python/contrib/test_cmsisnn/test_conv2d.py
index c68dfb573839d..df6ae2edabc7a 100644
--- a/tests/python/contrib/test_cmsisnn/test_conv2d.py
+++ b/tests/python/contrib/test_cmsisnn/test_conv2d.py
@@ -71,21 +71,20 @@ def make_model(
     p = (0, 0, 0, 0)
     if padding == "SAME":
         p = get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides)
-        a = relay.nn.pad(
-            a,
+        invar = relay.nn.pad(
+            invar,
             pad_width=[(0, 0), (p[0], p[2]), (p[1], p[3]), (0, 0)],
             pad_value=input_zero_point,
             pad_mode="constant",
         )
         shape = (shape[0], shape[1] + p[0] + p[2], shape[2] + p[1] + p[3], shape[3])
 
-    weight_shape = (kernel_h, kernel_w, shape[3] // groups, out_channels)
     rng = np.random.default_rng(12321)
     w = tvm.nd.array(
         rng.integers(
             np.iinfo(kernel_dtype).min,
             high=np.iinfo(kernel_dtype).max,
-            size=weight_shape,
+            size=kernel_shape,
             dtype=kernel_dtype,
         )
     )
@@ -129,8 +128,11 @@ def make_model(
 @pytest.mark.parametrize("kernel_size", [(3, 3)])
 @pytest.mark.parametrize("padding", ["SAME", "VALID"])
 @pytest.mark.parametrize("strides, dilation", [((2, 2), (1, 1)), ((1, 1), (1, 1))])
-@pytest.mark.parametrize("enable_bias", [True, False])
 @pytest.mark.parametrize("relu_type", ["NONE", "RELU"])
+@pytest.mark.parametrize(
+    "conv_type, depth_multiplier, enable_bias",
+    [("conv2d", 1, True), ("conv2d", 1, False), ("depthwise", 1, True), ("depthwise", 3, True)],
+)
 @pytest.mark.parametrize(
     "input_zero_point, input_scale, kernel_scale, out_channels",
     [(10, 0.0128, [0.11, 0.22], 2), (-64, 1, [1, 0.0256, 1.37], 3)],
@@ -147,27 +149,32 @@ def test_op_int8(
     input_scale,
     kernel_scale,
     out_channels,
+    conv_type,
+    depth_multiplier,
 ):
     interface_api = "c"
     use_unpacked_api = True
     test_runner = AOT_CORSTONE300_RUNNER
 
-    kernel_zero_point = 0
+    dtype = "int8"
     groups = 1
     weight_format = "HWIO"
     kernel_h = kernel_size[0]
     kernel_w = kernel_size[1]
-    dtype = "int8"
+    kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels)
+    kernel_zero_point = 0
     in_min, in_max = get_range_for_dtype_str(dtype)
 
-    weight_shape = None
-    if weight_format == "HWIO":
-        weight_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels)
-    else:
-        weight_shape = (kernel_h, kernel_w, ifm_shape[3], out_channels)
+    if conv_type == "depthwise":
+        groups = ifm_shape[3]
+        weight_format = "HWOI"
+        kernel_shape = (kernel_h, kernel_w, ifm_shape[3], depth_multiplier)
+        out_channels = ifm_shape[3] * depth_multiplier
+        ks_len = len(kernel_scale)
+        kernel_scale = [kernel_scale[i % ks_len] for i in range(out_channels)]
 
     output_scale, output_zero_point = get_conv2d_qnn_params(
-        weight_shape,
+        kernel_shape,
         input_scale,
         input_zero_point,
         kernel_scale,
@@ -175,12 +182,12 @@ def test_op_int8(
         dtype,
         dtype,
         dtype,
-        False,
+        conv_type == "depthwise",
     )
 
     model, params = make_model(
         ifm_shape,
-        weight_shape,
+        kernel_shape,
         input_zero_point,
         input_scale,
         kernel_zero_point,
diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py
index 7af2a0ee99a3d..2ef84d7f1a6f8 100644
--- a/tests/python/driver/tvmc/test_compiler.py
+++ b/tests/python/driver/tvmc/test_compiler.py
@@ -416,7 +416,7 @@ def test_compile_tflite_module_with_external_codegen_cmsisnn(
             for name in mlf_package.getnames()
             if re.match(r"\./codegen/host/src/\D+\d+\.c", name)
         ]
-        assert len(c_source_files) == 5
+        assert len(c_source_files) == 3
 
 
 @pytest.mark.skipif(