diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 4c5af610d709..7c48b09ff00d 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -276,13 +276,15 @@ def conv2d_NCHWc_strategy_arm_cpu(attrs, inputs, out_type, target):
     data, kernel = inputs
     if topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype):
         strategy.add_implementation(
-            wrap_compute_conv2d(topi.arm_cpu.conv2d_NCHWc_int8, True, True),
+            wrap_compute_conv2d(
+                topi.arm_cpu.conv2d_NCHWc_int8, need_data_layout=True, need_out_layout=True
+            ),
             wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NCHWc_int8),
             name="conv2d_NCHWc_int8.arm_cpu",
         )
     else:
         strategy.add_implementation(
-            wrap_compute_conv2d(topi.x86.conv2d_NCHWc, True, True),
+            wrap_compute_conv2d(topi.x86.conv2d_NCHWc, need_data_layout=True, need_out_layout=True),
             wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc),
             name="conv2d_NCHWc.x86",
         )
@@ -294,7 +296,9 @@ def depthwise_conv2d_NCHWc_strategy_arm_cpu(attrs, inputs, out_type, target):
     """depthwise_conv2d_NCHWc adopted from x86"""
     strategy = _op.OpStrategy()
     strategy.add_implementation(
-        wrap_compute_conv2d(topi.x86.depthwise_conv2d_NCHWc, True, True),
+        wrap_compute_conv2d(
+            topi.x86.depthwise_conv2d_NCHWc, need_data_layout=True, need_out_layout=True
+        ),
         wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_NCHWc),
         name="depthwise_conv2d_NCHWc.x86",
     )
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 9c4a896d572d..e3c74e15c2c0 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -316,10 +316,19 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target):
         ):
             assert kernel_layout == "OIHW4o4i"
             strategy.add_implementation(
-                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
+                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, need_data_layout=True),
                 wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                 name="conv2d_NCHWc_int8.cuda",
             )
+        elif is_auto_scheduler_enabled():
+            strategy.add_implementation(
+                wrap_compute_conv2d(
+                    topi.nn.conv, need_data_layout=True, need_kernel_layout=True, has_groups=True
+                ),
+                naive_schedule,
+                name="conv2d.cuda",
+                plevel=15,
+            )
         elif target.kind.name == "cuda" and "cudnn" not in target.libs:
             # No TVM native kernel applicable
             raise RuntimeError("Unsupported conv2d layout {} for CUDA".format(layout))
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index 4ff7490b89ac..6074b0a69cc3 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -223,7 +223,9 @@ def schedule_bitpack(attrs, outs, target):
 # conv2d
 def wrap_compute_conv2d(
     topi_compute,
+    *,
     need_data_layout=False,
+    need_kernel_layout=False,
     need_out_layout=False,
     has_groups=False,
     need_auto_scheduler_layout=False,
@@ -236,6 +238,7 @@ def _compute_conv2d(attrs, inputs, out_type):
         strides = get_const_tuple(attrs.strides)
         dilation = get_const_tuple(attrs.dilation)
         data_layout = attrs.get_str("data_layout")
+        kernel_layout = attrs.get_str("kernel_layout")
         out_layout = attrs.get_str("out_layout")
         out_dtype = attrs.out_dtype
         out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
@@ -244,6 +247,8 @@ def _compute_conv2d(attrs, inputs, out_type):
             args.append(attrs.groups)
         if need_data_layout:
             args.append(data_layout)
+        if need_kernel_layout:
+            args.append(kernel_layout)
         if need_out_layout:
             args.append(out_layout)
         args.append(out_dtype)
@@ -340,13 +345,15 @@ def conv2d_NCHWc_strategy(attrs, inputs, out_type, target):
     strategy = _op.OpStrategy()
     if inputs[0].dtype == "int8" or inputs[0].dtype == "uint8":
         strategy.add_implementation(
-            wrap_compute_conv2d(topi.nn.conv2d_NCHWc_int8, True, True),
+            wrap_compute_conv2d(
+                topi.nn.conv2d_NCHWc_int8, need_data_layout=True, need_out_layout=True
+            ),
             wrap_topi_schedule(topi.generic.schedule_conv2d_NCHWc_int8),
             name="conv2d_NCHWc_int8.generic",
         )
     else:
         strategy.add_implementation(
-            wrap_compute_conv2d(topi.nn.conv2d_NCHWc, True, True),
+            wrap_compute_conv2d(topi.nn.conv2d_NCHWc, need_data_layout=True, need_out_layout=True),
             wrap_topi_schedule(topi.generic.schedule_conv2d_NCHWc),
             name="conv2d_NCHWc.generic",
         )
@@ -360,7 +367,9 @@ def depthwise_conv2d_NCHWc_strategy(attrs, inputs, out_type, target):
     logger.warning("depthwise_conv2d_NCHWc is not optimized for this platform.")
     strategy = _op.OpStrategy()
     strategy.add_implementation(
-        wrap_compute_conv2d(topi.nn.depthwise_conv2d_NCHWc, True, True),
+        wrap_compute_conv2d(
+            topi.nn.depthwise_conv2d_NCHWc, need_data_layout=True, need_out_layout=True
+        ),
         wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_NCHWc),
         name="depthwise_conv2d_NCHWc.generic",
     )
diff --git a/python/tvm/relay/op/strategy/hls.py b/python/tvm/relay/op/strategy/hls.py
index 1eebbd36b847..4a682066ca2e 100644
--- a/python/tvm/relay/op/strategy/hls.py
+++ b/python/tvm/relay/op/strategy/hls.py
@@ -137,7 +137,7 @@ def conv2d_NCHWc_strategy_hls(attrs, inputs, out_type, target):
     """conv2d_NCHWc hls strategy"""
     strategy = _op.OpStrategy()
     strategy.add_implementation(
-        wrap_compute_conv2d(topi.nn.conv2d_NCHWc, True, True),
+        wrap_compute_conv2d(topi.nn.conv2d_NCHWc, need_data_layout=True, need_out_layout=True),
         wrap_topi_schedule(topi.hls.schedule_conv2d_NCHWc),
         name="conv2d_NCHWc.hls",
     )
diff --git a/python/tvm/relay/op/strategy/intel_graphics.py b/python/tvm/relay/op/strategy/intel_graphics.py
index a2de49c5579e..115a71114468 100644
--- a/python/tvm/relay/op/strategy/intel_graphics.py
+++ b/python/tvm/relay/op/strategy/intel_graphics.py
@@ -44,7 +44,9 @@ def conv2d_strategy_intel_graphics(attrs, inputs, out_type, target):
             # conv2d_NCHWc won't work without alter op layout pass
             # TODO(@Laurawly): fix this
             strategy.add_implementation(
-                wrap_compute_conv2d(topi.intel_graphics.conv2d_NCHWc, True, True),
+                wrap_compute_conv2d(
+                    topi.intel_graphics.conv2d_NCHWc, need_data_layout=True, need_out_layout=True
+                ),
                 wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_NCHWc),
                 name="conv2d_NCHWc.intel_graphics",
                 plevel=5,
@@ -71,7 +73,9 @@ def conv2d_NCHWc_strategy_intel_graphics(attrs, inputs, out_type, target):
     """conv2d_NCHWc intel_graphics strategy"""
     strategy = _op.OpStrategy()
     strategy.add_implementation(
-        wrap_compute_conv2d(topi.intel_graphics.conv2d_NCHWc, True, True),
+        wrap_compute_conv2d(
+            topi.intel_graphics.conv2d_NCHWc, need_data_layout=True, need_out_layout=True
+        ),
         wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_NCHWc),
         name="conv2d_NCHWc.intel_graphics",
     )
diff --git a/python/tvm/relay/op/strategy/rocm.py b/python/tvm/relay/op/strategy/rocm.py
index 6e91101826c9..89cac0db4ab9 100644
--- a/python/tvm/relay/op/strategy/rocm.py
+++ b/python/tvm/relay/op/strategy/rocm.py
@@ -44,7 +44,7 @@ def conv2d_strategy_rocm(attrs, inputs, out_type, target):
         and padding[1] == padding[3]
     ):
         strategy.add_implementation(
-            wrap_compute_conv2d(topi.rocm.conv2d_nchw_miopen, True),
+            wrap_compute_conv2d(topi.rocm.conv2d_nchw_miopen, need_data_layout=True),
             wrap_topi_schedule(topi.rocm.schedule_conv2d_nchw_miopen),
             name="conv2d_nchw_miopen.rocm",
             plevel=50,
diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index abbc9d9a4c57..17474020eefe 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -269,13 +269,15 @@ def conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target):
     data, kernel = inputs
     if topi.x86.is_int8_hw_support(data.dtype, kernel.dtype):
         strategy.add_implementation(
-            wrap_compute_conv2d(topi.x86.conv2d_NCHWc_int8, True, True),
+            wrap_compute_conv2d(
+                topi.x86.conv2d_NCHWc_int8, need_data_layout=True, need_out_layout=True
+            ),
             wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc_int8),
             name="conv2d_NCHWc_int8.x86",
         )
     else:
         strategy.add_implementation(
-            wrap_compute_conv2d(topi.x86.conv2d_NCHWc, True, True),
+            wrap_compute_conv2d(topi.x86.conv2d_NCHWc, need_data_layout=True, need_out_layout=True),
             wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc),
             name="conv2d_NCHWc.x86",
         )
@@ -287,7 +289,9 @@ def depthwise_conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target):
     """depthwise_conv2d x86 strategy"""
     strategy = _op.OpStrategy()
     strategy.add_implementation(
-        wrap_compute_conv2d(topi.x86.depthwise_conv2d_NCHWc, True, True),
+        wrap_compute_conv2d(
+            topi.x86.depthwise_conv2d_NCHWc, need_data_layout=True, need_out_layout=True
+        ),
         wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_NCHWc),
         name="depthwise_conv2d_NCHWc.x86",
     )
diff --git a/python/tvm/topi/nn/conv1d.py b/python/tvm/topi/nn/conv1d.py
index 0a1efa35655f..ee388b4297f4 100644
--- a/python/tvm/topi/nn/conv1d.py
+++ b/python/tvm/topi/nn/conv1d.py
@@ -19,18 +19,27 @@
 from .conv2d import conv
 
 
-def conv1d(data, kernel, strides=1, padding="VALID", dilation=1, layout="NCW", out_dtype=None):
+def conv1d(
+    data,
+    kernel,
+    strides=1,
+    padding="VALID",
+    dilation=1,
+    data_layout="NCW",
+    kernel_layout="",
+    out_dtype=None,
+):
     """1D convolution forward operator.
 
     Parameters
     ----------
     data : tvm.te.Tensor
-        3-D input shape [batch, in_channel, in_width] for layout == 'NCW'
-        and [batch, in_width, in_channel] for layout == 'NWC'
+        3-D input shape [batch, in_channel, in_width] for data_layout == 'NCW'
+        and [batch, in_width, in_channel] for data_layout == 'NWC'
 
     kernel : tvm.te.Tensor
-        3-D kernel with shape [num_filter, in_channel, filter_size] for layout == 'NCW'
-        and [filter_size, in_channel, num_filter] for layout == 'NWC'
+        3-D kernel with shape [num_filter, in_channel, filter_size] for kernel_layout == 'OIW'
+        and [filter_size, in_channel, num_filter] for kernel_layout == 'WIO'
 
     strides : int or tuple
         The spatial stride along width
@@ -41,23 +50,27 @@ def conv1d(data, kernel, strides=1, padding="VALID", dilation=1, layout="NCW", o
     dilation : int or tuple
         Dilation rate if convolution should be dilated.
 
-    layout : str
+    data_layout : str
         How input data is laid out, must be one of ['NCW', 'NWC']
 
+    kernel_layout: Optional[str]
+        The layout of the kernel. If unspecified, use default layout. "OIW" if data_layout == "NCW",
+        "WIO" if data_layout == "NWC".
+
     out_dtype : str
         The output data type. If None then output is same type as input.
     """
-    return conv(data, kernel, strides, padding, dilation, 1, layout, out_dtype)
+    return conv(data, kernel, strides, padding, dilation, 1, data_layout, kernel_layout, out_dtype)
 
 
 def conv1d_nwc(data, kernel, strides=1, padding="VALID", dilation=1, out_dtype=None):
     """1D convolution in NWC layout. See :py:func:`conv` for details on parameters"""
-    return conv(data, kernel, strides, padding, dilation, 1, "NWC", out_dtype=out_dtype)
+    return conv(data, kernel, strides, padding, dilation, 1, "NWC", "WIO", out_dtype=out_dtype)
 
 
 def conv1d_ncw(data, kernel, strides=1, padding="VALID", dilation=1, out_dtype=None):
     """1D convolution in NCW layout. See :py:func:`conv` for details on parameters"""
-    return conv(data, kernel, strides, padding, dilation, 1, "NCW", out_dtype=out_dtype)
+    return conv(data, kernel, strides, padding, dilation, 1, "NCW", "OIW", out_dtype=out_dtype)
 
 
 def group_conv1d_nwc(
@@ -89,7 +102,7 @@ def group_conv1d_nwc(
     out_dtype : str
         The output data type. If None then output is same type as input.
     """
-    return conv(data, kernel, strides, padding, dilation, groups, "NWC", out_dtype=out_dtype)
+    return conv(data, kernel, strides, padding, dilation, groups, "NWC", "WIO", out_dtype=out_dtype)
 
 
 def group_conv1d_ncw(
@@ -121,4 +134,4 @@ def group_conv1d_ncw(
     out_dtype : str
         The output data type. If None then output is same type as input.
     """
-    return conv(data, kernel, strides, padding, dilation, groups, "NCW", out_dtype=out_dtype)
+    return conv(data, kernel, strides, padding, dilation, groups, "NCW", "OIW", out_dtype=out_dtype)
diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py
index d23b8d857e4e..5070c84c7e51 100644
--- a/python/tvm/topi/nn/conv2d.py
+++ b/python/tvm/topi/nn/conv2d.py
@@ -57,16 +57,18 @@
 )
 
 
-def conv2d(input, filter, strides, padding, dilation, layout="NCHW", out_dtype=None):
+def conv2d(
+    input, filter, strides, padding, dilation, data_layout="NCHW", kernel_layout="", out_dtype=None
+):
     """Conv2D operator.
 
     Parameters
     ----------
     input : tvm.te.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width]
+        4-D with shape [batch, in_channel, in_height, in_width] in data_layout
 
     filter : tvm.te.Tensor
-        4-D with shape [num_filter, in_channel, filter_height, filter_width]
+        4-D with shape [num_filter, in_channel, filter_height, filter_width] in kernel_layout
 
     strides : int or a list/tuple of two ints
         stride size, or [stride_height, stride_width]
@@ -79,9 +81,13 @@ def conv2d(input, filter, strides, padding, dilation, layout="NCHW", out_dtype=N
     dilation: int or a list/tuple of two ints
         dilation size, or [dilation_height, dilation_width]
 
-    layout : str
+    data_layout : str
         layout of data
 
+    kernel_layout : Optional[str]
+        layout of kernel. If unspecified, use default layout inferred from data_layout. "OIHW" if
+        data_layout == "NCHW", "HWIO" if data_layout == "NHWC".
+
     Returns
     -------
     output : tvm.te.Tensor
@@ -89,7 +95,7 @@ def conv2d(input, filter, strides, padding, dilation, layout="NCHW", out_dtype=N
     """
     # search platform specific declaration first
     # default declaration
-    return conv(input, filter, strides, padding, dilation, 1, layout, out_dtype)
+    return conv(input, filter, strides, padding, dilation, 1, data_layout, kernel_layout, out_dtype)
 
 
 @tvm.target.generic_func
@@ -239,7 +245,7 @@ def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None):
     Output : tvm.te.Tensor
         4-D with shape [batch, out_channel, out_height, out_width]
     """
-    return conv(Input, Filter, stride, padding, dilation, 1, "NCHW", out_dtype=out_dtype)
+    return conv(Input, Filter, stride, padding, dilation, 1, "NCHW", "OIHW", out_dtype=out_dtype)
 
 
 def conv2d_hwcn(Input, Filter, stride, padding, dilation, out_dtype=None):
@@ -269,7 +275,7 @@ def conv2d_hwcn(Input, Filter, stride, padding, dilation, out_dtype=None):
     output : tvm.te.Tensor
         4-D with shape [out_height, out_width, out_channel, batch]
     """
-    return conv(Input, Filter, stride, padding, dilation, 1, "HWCN", out_dtype=out_dtype)
+    return conv(Input, Filter, stride, padding, dilation, 1, "HWCN", "HWIO", out_dtype=out_dtype)
 
 
 def conv2d_nhwc(
@@ -325,6 +331,7 @@ def conv2d_nhwc(
         dilation,
         1,
         "NHWC",
+        "HWIO",
         out_dtype,
         auto_scheduler_rewritten_layout,
         meta_schedule_original_shape,
@@ -708,7 +715,9 @@ def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtyp
     Output : tvm.te.Tensor
         4-D with shape [batch, out_channel, out_height, out_width]
     """
-    return conv(Input, Filter, stride, padding, dilation, groups, "NCHW", out_dtype=out_dtype)
+    return conv(
+        Input, Filter, stride, padding, dilation, groups, "NCHW", "OIHW", out_dtype=out_dtype
+    )
 
 
 def conv(
@@ -718,7 +727,8 @@
     padding: Union[int, Sequence[int]],
     dilation: Union[int, Sequence[int]],
     groups: int,
-    order: str,
+    data_layout: str,
+    kernel_layout: str = "",
     out_dtype: Union[str, None] = None,
     auto_scheduler_rewritten_layout: Optional[str] = None,
     meta_schedule_original_shape=None,
@@ -731,11 +741,11 @@ def conv(
     Parameters
    ----------
     inp : tvm.te.Tensor
-        N-D with shape [batch, in_channel, in_height, in_width, ...] ordered by `order`
+        N-D with shape [batch, in_channel, in_height, in_width, ...] in `data_layout`
 
     filt : tvm.te.Tensor
-        N-D with shape [num_filter, in_channel // groups, filter_height, filter_width, ...]
-        for NCHW or [filter_height, filter_width, ..., in_channel // groups, num_filter] for NHWC
+        N-D with shape [num_filter, in_channel // groups, filter_height, filter_width, ...] in
+        `kernel_layout`
 
     stride : int or a list/tuple of dim ints
         (where dim=2 for NCHW, dim=1 for NCH, etc.)
@@ -753,10 +763,16 @@
     groups : int
         number of groups
 
-    order : str
-        Ordering of dimensions. N indicates batch dimension, C indicates
+    data_layout : str
+        Layout of the input. N indicates batch dimension, C indicates
         channels, any other character indicates HW (or H or HWD for 1D and 3D).
 
+    kernel_layout: Optional[str]
+        Layout of the filter. I indicates input channels, O indicates output channels,
+        any other character indicates HW dimension of the filter (or H or HWD for 1D and 3D).
+        If kernel_layout is empty, use data_layout to infer the default kernel_layout. Default
+        kernel_layout is OIHW for NCHW data layout, HWIO for NHWC data layout.
+
     out_dtype : str
         Elements are converted to this type before elementwise multiplication
         and summation.
@@ -775,7 +791,7 @@ def conv(
     Returns
     -------
     Output : tvm.te.Tensor
-        N-D with shape [batch, out_channel, out_height, out_width, ...] ordered by `order`.
+        N-D with shape [batch, out_channel, out_height, out_width, ...] in `data_layout`
     """
     dim = len(inp.shape) - 2
     if out_dtype is None:
@@ -792,30 +808,41 @@ def conv(
     else:
         dilations = list(dilation)
 
-    # transform from order to NCHW
-    permutation_to = [order.find("N"), order.find("C")] + [
-        x.span()[0] for x in re.finditer("[^NC]", order)
+    # transform from data_layout to NCHW
+    data_permutation_to = [data_layout.find("N"), data_layout.find("C")] + [
+        x.span()[0] for x in re.finditer("[^NC]", data_layout)
     ]
-    # transform from NCHW to order
-    permutation_from = np.argsort(permutation_to)
-    # transform from CHW to order
-    permutation_from_reductions = permutation_from[1:].copy()
-    permutation_from_reductions[permutation_from_reductions > permutation_from[0]] -= 1
-
-    # kernel permutation, if C appears before HW then num_filter is first, otherwise it is last
-    # tkonolige: I don't really understand kernel ordering for NHWC, it seems
-    # like num_filters should match the N dimension
-    if order.find("C") < re.search("[^NC]", order).span()[0]:
-        permutation_to_kernel = [0, 1] + list(range(2, dim + 2))
+    # transform from NCHW to data_layout
+    data_permutation_from = np.argsort(data_permutation_to)
+    # transform from CHW to data_layout
+    data_permutation_from_reductions = data_permutation_from[1:].copy()
+    data_permutation_from_reductions[
+        data_permutation_from_reductions > data_permutation_from[0]
+    ] -= 1
+
+    if kernel_layout == "":
+        # kernel permutation, if C appears before HW then num_filter is first, otherwise it is last
+        # tkonolige: I don't really understand kernel ordering for NHWC, it seems
+        # like num_filters should match the N dimension
+        if data_layout.find("C") < re.search("[^NC]", data_layout).span()[0]:
+            kernel_permutation_to = [0, 1] + list(range(2, dim + 2))
+        else:
+            kernel_permutation_to = [dim + 1, dim] + list(range(dim))
     else:
-        permutation_to_kernel = [dim + 1, dim] + list(range(dim))
-    permutation_from_kernel = np.argsort(permutation_to_kernel)
+        # transform from kernel_layout to OIHW
+        kernel_permutation_to = [kernel_layout.find("O"), kernel_layout.find("I")] + [
+            x.span()[0] for x in re.finditer("[^OI]", kernel_layout)
+        ]
+    # transform from OIHW to kernel_layout
+    kernel_permutation_from = np.argsort(kernel_permutation_to)
 
     if meta_schedule_original_shape:
         auto_scheduler.rewrite_tensor_shape(filt, meta_schedule_original_shape)
-    batch, in_channel, *dimensions = np.array(get_const_tuple(inp.shape))[permutation_to].tolist()
+    batch, in_channel, *dimensions = np.array(get_const_tuple(inp.shape))[
+        data_permutation_to
+    ].tolist()
     num_filter, _, *kernel_dimensions = np.array(get_const_tuple(filt.shape))[
-        permutation_to_kernel
+        kernel_permutation_to
     ].tolist()
 
     # Autoscheduler may have messed with the input layout, so we extract the
@@ -841,14 +868,14 @@ def conv(
         )
     ]
     # compute graph
-    pad_before = list(np.array([0, 0] + pad_begin)[permutation_from])
-    pad_after = list(np.array([0, 0] + pad_end)[permutation_from])
+    pad_before = list(np.array([0, 0] + pad_begin)[data_permutation_from])
+    pad_after = list(np.array([0, 0] + pad_end)[data_permutation_from])
     temp = pad(inp, pad_before, pad_after, name="pad_temp")
     rc = te.reduce_axis((0, in_channel // groups), name="rc")
     rs = [te.reduce_axis((0, k), name=f"r{i}") for i, k in zip(["y", "x", "z"], kernel_dimensions)]
 
     def compute(*args):
-        nn, ff, *dim_indices = list(np.array(args)[permutation_to])
+        nn, ff, *dim_indices = list(np.array(args)[data_permutation_to])
 
         if groups == 1:
             simplified_channel_index = rc
@@ -864,25 +891,25 @@ def compute(*args):
                             di * stride + r * dil
                             for di, stride, r, dil in zip(dim_indices, strides, rs, dilations)
                         ]
-                    )[permutation_from]
+                    )[data_permutation_from]
                 )
             ).astype(out_dtype)
-            * filt.__getitem__(tuple(np.array([ff, rc] + rs)[permutation_from_kernel])).astype(
+            * filt.__getitem__(tuple(np.array([ff, rc] + rs)[kernel_permutation_from])).astype(
                 out_dtype
             ),
             # Schedules depend on reduction axes being in the same order as the
             # layout, so we reorder here.
-            axis=np.array([rc, *rs])[permutation_from_reductions].tolist(),
+            axis=np.array([rc, *rs])[data_permutation_from_reductions].tolist(),
         )
 
     out = te.compute(
-        list(np.array([batch, out_channel] + out_dimensions)[permutation_from]),
+        list(np.array([batch, out_channel] + out_dimensions)[data_permutation_from]),
         compute,
         # tag is expected to be lowercase
-        tag=f"{'group_' if groups > 1 else ''}conv{dim}d_{order.lower()}",
-        name=f"{'group_' if groups > 1 else ''}conv{dim}d_{order.lower()}",
+        tag=f"{'group_' if groups > 1 else ''}conv{dim}d_{data_layout.lower()}",
+        name=f"{'group_' if groups > 1 else ''}conv{dim}d_{data_layout.lower()}",
         attrs={"layout_free_placeholders": [filt]} if auto_scheduler_should_rewrite_layout else {},
-        varargs_names=list(np.array(["nn", "ff", "yy", "xx", "zz"])[permutation_from]),
+        varargs_names=list(np.array(["nn", "ff", "yy", "xx", "zz"])[data_permutation_from]),
     )
     # if we used autoscheduler's changed layout we need to rewrite the ordering
     # of the output dimensions
@@ -924,7 +951,9 @@ def group_conv2d_nhwc(Input, Filter, stride, padding, dilation, groups, out_dtyp
     Output : tvm.te.Tensor
         4-D with shape [batch, out_height, out_width, out_channel]
     """
-    return conv(Input, Filter, stride, padding, dilation, groups, "NHWC", out_dtype=out_dtype)
+    return conv(
+        Input, Filter, stride, padding, dilation, groups, "NHWC", "HWIO", out_dtype=out_dtype
+    )
 
 
 def unpack_NCHWc_to_nchw(packed_out, out_dtype):
diff --git a/python/tvm/topi/nn/conv3d.py b/python/tvm/topi/nn/conv3d.py
index 591c643a95c2..1897484dc8cd 100644
--- a/python/tvm/topi/nn/conv3d.py
+++ b/python/tvm/topi/nn/conv3d.py
@@ -53,7 +53,7 @@ def conv3d_ncdhw(Input, Filter, stride, padding, dilation, groups, out_dtype=Non
     Output : tvm.te.Tensor
         5-D with shape [batch, out_channel, out_depth, out_height, out_width]
     """
-    return conv(Input, Filter, stride, padding, dilation, groups, "NCDHW", out_dtype)
+    return conv(Input, Filter, stride, padding, dilation, groups, "NCDHW", "OIDHW", out_dtype)
 
 
 def conv3d_ndhwc(
@@ -111,6 +111,7 @@ def conv3d_ndhwc(
         dilation,
         groups,
         "NDHWC",
+        "DHWIO",
         out_dtype,
         auto_scheduler_rewritten_layout,
         meta_schedule_origin_shape,
diff --git a/tests/python/integration/test_winograd_nnpack.py b/tests/python/integration/test_winograd_nnpack.py
index b088b350c9f0..9d9f4e10e646 100644
--- a/tests/python/integration/test_winograd_nnpack.py
+++ b/tests/python/integration/test_winograd_nnpack.py
@@ -86,7 +86,7 @@ def check_device(device):
                 stride,
                 padding,
                 dilation,
-                layout="NCHW",
+                data_layout="NCHW",
                 out_dtype=dtype,
             )
             if add_bias:
diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc.py b/tests/python/topi/python/test_topi_conv2d_nhwc.py
index 362de3a76909..e60cf12aa83e 100644
--- a/tests/python/topi/python/test_topi_conv2d_nhwc.py
+++ b/tests/python/topi/python/test_topi_conv2d_nhwc.py
@@ -77,7 +77,7 @@ def ref_data(dtype, batch, in_channel, in_size, num_filter, kernel, stride, padd
     return a_np, w_np, b_np
 
 
-def test_conv2d_nhwc(target, dev, ref_data, dtype, stride, padding, dilation):
+def test_conv2d_nhwc_hwio(target, dev, ref_data, dtype, stride, padding, dilation):
     a_np, w_np, b_np = ref_data
 
     A = te.placeholder(a_np.shape, name="A", dtype=dtype)
@@ -95,5 +95,34 @@ def test_conv2d_nhwc(target, dev, ref_data, dtype, stride, padding, dilation):
     tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
 
 
+def test_conv2d_nhwc_ohwi(ref_data, dtype, stride, padding, dilation):
+    # only test on CPU target because topi doesn't have schedules for this layout
+    target = "llvm"
+    dev = tvm.device(target, 0)
+    a_np, w_np_hwio, b_np = ref_data
+    w_np_ohwi = w_np_hwio.transpose(3, 0, 1, 2)  # HWIO -> OHWI
+
+    A = te.placeholder(a_np.shape, name="A", dtype=dtype)
+    W = te.placeholder(w_np_ohwi.shape, name="W", dtype=dtype)
+
+    B = topi.nn.conv2d(
+        A,
+        W,
+        stride,
+        padding,
+        dilation,
+        data_layout="NHWC",
+        kernel_layout="OHWI",
+        out_dtype="float32",
+    )
+    s = tvm.te.create_schedule(B.op)
+    a = tvm.nd.array(a_np, dev)
+    w = tvm.nd.array(w_np_ohwi, dev)
+    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
+    func = tvm.build(s, [A, W, B], target)
+    func(a, w, b)
+    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py
index 6b06d88096bf..4fa5b6ff8438 100644
--- a/vta/python/vta/top/op.py
+++ b/vta/python/vta/top/op.py
@@ -214,7 +214,7 @@ def conv2d_strategy_vta(attrs, inputs, out_type, target):
             assert kernel.dtype == "int8"
 
             strategy.add_implementation(
-                _strategy.wrap_compute_conv2d(conv2d_packed, True),
+                _strategy.wrap_compute_conv2d(conv2d_packed, need_data_layout=True),
                 _strategy.wrap_topi_schedule(schedule_conv2d_packed),
                 name="conv2d_packed.vta",
             )
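
Note (not part of the patch): the new kernel_layout handling in topi.nn.conv reduces to turning a layout string into a pair of index permutations. The standalone sketch below illustrates that mapping with plain NumPy; the helper name kernel_permutations is hypothetical and only mirrors the kernel_permutation_to / kernel_permutation_from variables introduced above.

import re

import numpy as np


def kernel_permutations(kernel_layout):
    # Forward permutation: positions of "O", "I", then the spatial characters,
    # i.e. the index order that rearranges kernel_layout into canonical OIHW.
    to_oihw = [kernel_layout.find("O"), kernel_layout.find("I")] + [
        m.span()[0] for m in re.finditer("[^OI]", kernel_layout)
    ]
    # Inverse permutation: maps OIHW-ordered indices back to kernel_layout order.
    from_oihw = np.argsort(to_oihw)
    return to_oihw, from_oihw


# Example: an OHWI kernel with 8 filters, a 3x3 window and 4 input channels.
to_oihw, from_oihw = kernel_permutations("OHWI")
ohwi_shape = (8, 3, 3, 4)
print(np.array(ohwi_shape)[to_oihw])              # [8 4 3 3] -> shape in OIHW order
print(np.array(["O", "I", "H", "W"])[from_oihw])  # ['O' 'H' 'W' 'I'] -> back to OHWI

The inverse permutation is what the compute body applies to the OIHW-ordered index tuple [ff, rc] + rs, so filt is indexed in whatever kernel layout the caller supplied.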