diff --git a/backends/arm/operators/op_avg_pool2d.py b/backends/arm/operators/op_avg_pool2d.py
index d84fe40d99..e6d07610c8 100644
--- a/backends/arm/operators/op_avg_pool2d.py
+++ b/backends/arm/operators/op_avg_pool2d.py
@@ -10,8 +10,8 @@
     NodeVisitor,
     register_node_visitor,
 )
-from executorch.backends.arm.operators.op_common import build_avg_pool_2d_common
 from executorch.backends.arm.tosa_mapping import TosaArg
+from executorch.backends.arm.tosa_utils import build_avg_pool_2d_common
 
 
 @register_node_visitor
diff --git a/backends/arm/operators/op_common.py b/backends/arm/operators/op_common.py
deleted file mode 100644
index eadf00c294..0000000000
--- a/backends/arm/operators/op_common.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-import serializer.tosa_serializer as ts
-import torch
-from executorch.backends.arm.tosa_mapping import TosaArg
-from executorch.backends.arm.tosa_quant_utils import get_quant_node_args
-from serializer.tosa_serializer import TosaOp
-
-
-def build_avg_pool_2d_common(
-    node: torch.fx.Node,
-    tosa_graph: ts.TosaSerializer,
-    input_tensor: TosaArg,
-    kernel_size: list,
-    stride: list,
-    padding: list,
-    is_quant_node: bool,
-    output: TosaArg,
-):
-    accumulator_type = input_tensor.dtype
-
-    if is_quant_node:
-        # Accumulator type always is int32 when input tensor is an integer type.
-        accumulator_type = ts.DType.INT32
-
-    # Initilize zero point to zero.
-    input_zp = 0
-    output_zp = 0
-
-    if is_quant_node:
-        input_zp = get_quant_node_args(node.args[0]).zp
-        output_zp = get_quant_node_args(list(node.users)[0]).zp
-
-    attr = ts.TosaSerializerAttribute()
-    attr.PoolAttribute(
-        kernel=kernel_size,
-        stride=stride,
-        pad=padding,
-        input_zp=input_zp,
-        output_zp=output_zp,
-        accum_dtype=accumulator_type,
-    )
-
-    tosa_graph.addOperator(
-        TosaOp.Op().AVG_POOL2D,
-        [input_tensor.name],
-        [output.name],
-        attr,
-    )
diff --git a/backends/arm/operators/op_mean_dim.py b/backends/arm/operators/op_mean_dim.py
index 5e8e3d74c0..20e1b2b8d7 100644
--- a/backends/arm/operators/op_mean_dim.py
+++ b/backends/arm/operators/op_mean_dim.py
@@ -10,8 +10,8 @@
     NodeVisitor,
     register_node_visitor,
 )
-from executorch.backends.arm.operators.op_common import build_avg_pool_2d_common
 from executorch.backends.arm.tosa_mapping import TosaArg
+from executorch.backends.arm.tosa_utils import build_avg_pool_2d_common
 
 
 @register_node_visitor
diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py
index 68d090653a..a692b3a270 100644
--- a/backends/arm/tosa_utils.py
+++ b/backends/arm/tosa_utils.py
@@ -6,11 +6,12 @@
 import logging
 import os
 
-import executorch.backends.arm.tosa_quant_utils as tosa_quant_utils
-
 import numpy as np
 import serializer.tosa_serializer as ts
+import torch
 from executorch.backends.arm.tosa_mapping import TosaArg
+
+from executorch.backends.arm.tosa_quant_utils import get_quant_node_args, q_op
 from executorch.exir.dialects._ops import ops as exir_ops
 from serializer.tosa_serializer import TosaOp
 
@@ -158,7 +159,7 @@ def is_bias_node_for_addmm(node):
     # consumer node is addmm
     is_rank2_linear_bias = (
         consumer_node.target == exir_ops.edge.aten.addmm.default
-        and list(consumer_node.users)[0].target == tosa_quant_utils.q_op
+        and list(consumer_node.users)[0].target == q_op
     )
 
     # rank>2 linear layers
@@ -170,7 +171,7 @@
     ):
         consumer_consumer_node = list(consumer_node.users)[0]
         is_rank_greater_than_2_linear_bias = (
-            list(consumer_consumer_node.users)[0].target == tosa_quant_utils.q_op
+            list(consumer_consumer_node.users)[0].target == q_op
         )
 
     return is_rank2_linear_bias or is_rank_greater_than_2_linear_bias
@@ -189,3 +190,45 @@ def is_consumer_node_depthwise_conv2d(node):
             return True
 
     return False
+
+
+def build_avg_pool_2d_common(
+    node: torch.fx.Node,
+    tosa_graph: ts.TosaSerializer,
+    input_tensor: TosaArg,
+    kernel_size: list,
+    stride: list,
+    padding: list,
+    is_quant_node: bool,
+    output: TosaArg,
+):
+    accumulator_type = input_tensor.dtype
+
+    if is_quant_node:
+        # Accumulator type always is int32 when input tensor is an integer type.
+        accumulator_type = ts.DType.INT32
+
+    # Initialize zero point to zero.
+    input_zp = 0
+    output_zp = 0
+
+    if is_quant_node:
+        input_zp = get_quant_node_args(node.args[0]).zp
+        output_zp = get_quant_node_args(list(node.users)[0]).zp
+
+    attr = ts.TosaSerializerAttribute()
+    attr.PoolAttribute(
+        kernel=kernel_size,
+        stride=stride,
+        pad=padding,
+        input_zp=input_zp,
+        output_zp=output_zp,
+        accum_dtype=accumulator_type,
+    )
+
+    tosa_graph.addOperator(
+        TosaOp.Op().AVG_POOL2D,
+        [input_tensor.name],
+        [output.name],
+        attr,
+    )
diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py
index 90d858f15c..78984f34e5 100644
--- a/backends/vulkan/partitioner/vulkan_partitioner.py
+++ b/backends/vulkan/partitioner/vulkan_partitioner.py
@@ -4,14 +4,19 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-strict
+
 import logging
-from typing import Any, Dict, final, List, Optional
+from typing import Any, Dict, final, List, Mapping, Optional
 
 import executorch.backends.vulkan.serialization.vulkan_graph_schema as vk_graph_schema
 
 import torch
 
-from executorch.backends.vulkan.partitioner.supported_ops import enumerate_supported_ops
+from executorch.backends.vulkan.partitioner.supported_ops import (
+    enumerate_supported_ops,
+    OpList,
+)
 from executorch.backends.vulkan.vulkan_preprocess import VulkanBackend
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from executorch.exir.backend.partitioner import (
@@ -30,12 +35,13 @@
 
 
 class VulkanSupportedOperators(OperatorSupportBase):
-    _ops = enumerate_supported_ops()
+    _ops: OpList = enumerate_supported_ops()
 
-    def __init__(self, require_dynamic_shape: bool = False):
+    def __init__(self, require_dynamic_shape: bool = False) -> None:
         super().__init__()
         self.require_dynamic_shapes = require_dynamic_shape
 
+    # pyre-ignore
     def node_val_is_compatible(self, node_val: Any) -> bool:
         # Skip nodes that don't have a value
         if node_val is None:
@@ -94,7 +100,17 @@ def is_linear_permute(self, node: torch.fx.Node) -> bool:
 
         return False
 
-    def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+    def is_node_supported(
+        self, submodules: Mapping[str, torch.nn.Module], node: torch.fx.Node
+    ) -> bool:
+        r = self._is_node_supported(submodules, node)
+        if not r and node.op == "call_function":
+            logging.info(f"Skipping node in Vulkan partitioning: {node.format_node()}")
+        return r
+
+    def _is_node_supported(
+        self, submodules: Mapping[str, torch.nn.Module], node: torch.fx.Node
+    ) -> bool:
         if self.is_linear_permute(node):
             return True
 
diff --git a/backends/vulkan/runtime/api/QueryPool.cpp b/backends/vulkan/runtime/api/QueryPool.cpp
index e11c44c357..ec6e15404c 100644
--- a/backends/vulkan/runtime/api/QueryPool.cpp
+++ b/backends/vulkan/runtime/api/QueryPool.cpp
@@ -199,19 +199,22 @@ std::string QueryPool::generate_string_report() {
   std::stringstream ss;
 
   int kernel_name_w = 40;
-  int global_size_w = 15;
+  int global_size_w = 25;
+  int local_size_w = 25;
   int duration_w = 25;
 
   ss << std::left;
   ss << std::setw(kernel_name_w) << "Kernel Name";
-  ss << std::setw(global_size_w) << "Workgroup Size";
+  ss << std::setw(global_size_w) << "Global Workgroup Size";
+  ss << std::setw(local_size_w) << "Local Workgroup Size";
   ss << std::right << std::setw(duration_w) << "Duration (ns)";
   ss << std::endl;
 
   ss << std::left;
   ss << std::setw(kernel_name_w) << "===========";
-  ss << std::setw(global_size_w) << "==============";
-  ss << std::right << std::setw(duration_w) << "===========";
+  ss << std::setw(global_size_w) << "=====================";
+  ss << std::setw(local_size_w) << "====================";
+  ss << std::right << std::setw(duration_w) << "=============";
   ss << std::endl;
 
   for (ShaderDuration& entry : shader_durations_) {
@@ -221,6 +224,7 @@ std::string QueryPool::generate_string_report() {
     ss << std::left;
     ss << std::setw(kernel_name_w) << entry.kernel_name;
     ss << std::setw(global_size_w) << stringize(entry.global_workgroup_size);
+    ss << std::setw(local_size_w) << stringize(entry.local_workgroup_size);
     ss << std::right << std::setw(duration_w) << exec_duration_ns.count();
     ss << std::endl;
   }
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index 471fd2c007..21493833e9 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -99,14 +99,11 @@ ValueRef prepack_biases(
 
   api::ShaderInfo shader = get_nchw_to_tensor_shader(*t);
 
-  api::utils::uvec3 global_size = t->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   graph.prepack_nodes().emplace_back(new PrepackNode(
       graph,
       shader,
-      global_size,
-      local_size,
+      graph.create_global_wg_size(v),
+      graph.create_local_wg_size(v),
       vref,
       v,
       {t->sizes_ubo()},
@@ -203,17 +200,14 @@ ValueRef prepack_weights(
       final_sizes, graph.dtype_of(vref), api::kTexture2D, api::kChannelsPacked);
   vTensorPtr t = graph.get_tensor(v);
 
-  api::utils::uvec3 global_size = t->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   api::ShaderInfo shader =
       get_conv2d_shader(graph, *t, /*prepack_weights = */ true, method, vref);
 
   graph.prepack_nodes().emplace_back(new PrepackNode(
       graph,
       shader,
-      global_size,
-      local_size,
+      graph.create_global_wg_size(v),
+      graph.create_local_wg_size(v),
       vref,
       v,
       {t->sizes_ubo(),
@@ -343,9 +337,6 @@ void add_conv2d_node(
   }
   check_conv_args(*t_in, *t_out);
 
-  api::utils::uvec3 global_size = t_out->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   Kernel2dParams kernel_params = create_kernel2d_params(
       graph,
       weight,
@@ -366,8 +357,8 @@ void add_conv2d_node(
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       shader,
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE},
        {{arg_in, arg_weight, arg_bias}, api::MemoryAccessType::READ}},
diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
index 412dbe9df6..d42be482a6 100644
--- a/backends/vulkan/test/test_vulkan_delegate.py
+++ b/backends/vulkan/test/test_vulkan_delegate.py
@@ -4,6 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-unsafe
+
 import ctypes
 import unittest
 from typing import Tuple
@@ -117,7 +119,9 @@ def run_test(memory_layout):
             program: ExportedProgram = export(
                 model, sample_inputs, dynamic_shapes=dynamic_shapes
             )
-            edge_program: EdgeProgramManager = to_edge(program)
+            edge_program: EdgeProgramManager = to_edge(
+                program, compile_config=self._edge_compile_config
+            )
 
             edge_program = edge_program.transform([I64toI32(), MeanToSumDiv()])
 
diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml
index 91174c08f7..80abd46409 100644
--- a/build/cmake_deps.toml
+++ b/build/cmake_deps.toml
@@ -262,14 +262,6 @@ deps = [
   "executorch_no_prim_ops",
 ]
 
-[targets.xnnpack_dynamic_quant_utils]
-buck_targets = [
-  "//backends/xnnpack:dynamic_quant_utils",
-]
-filters = [
-  ".cpp$",
-]
-
 [targets.xnnpack_schema]
 buck_targets = [
   "//backends/xnnpack/serialization:xnnpack_flatbuffer_header",
diff --git a/exir/capture/_config.py b/exir/capture/_config.py
index dd0ed94094..d959f10403 100644
--- a/exir/capture/_config.py
+++ b/exir/capture/_config.py
@@ -4,6 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-unsafe
+
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
 
@@ -38,7 +40,8 @@ class EdgeCompileConfig:
     _use_edge_ops: bool = True
     _skip_type_promotion: bool = False
     # TODO(gasoonjia): remove this
-    _skip_dim_order: bool = False
+    # TODO(T192537614): reenable dim order as default
+    _skip_dim_order: bool = True
 
 
 @compatibility(is_backward_compatible=False)
diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py
index 06e9e589e6..ca8ffde813 100644
--- a/exir/emit/test/test_emit.py
+++ b/exir/emit/test/test_emit.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# pye-strict
+# pyre-unsafe
 
 import typing
 import unittest
@@ -866,7 +866,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         # Success if you use dim_order
         to_edge(
             export(model, inputs),
-            compile_config=exir.EdgeCompileConfig(_check_ir_validity=False),
+            compile_config=exir.EdgeCompileConfig(
+                _check_ir_validity=False, _skip_dim_order=False
+            ),
         ).to_executorch()
 
     def test_emit_multiple_entry_points(self) -> None:
diff --git a/exir/sym_util.py b/exir/sym_util.py
index 2a55d51a81..64f4b64a32 100644
--- a/exir/sym_util.py
+++ b/exir/sym_util.py
@@ -4,7 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import List, Optional, Set, Union
+# pyre-strict
+
+from typing import Iterable, List, Optional, Set, Union
 
 import sympy
 
@@ -39,12 +41,14 @@ def eval_upper_bound(maybe_symint: Union[int, torch.SymInt]) -> int:
     node = maybe_symint.node
     shape_env = node.shape_env
     expr = node.expr
-    var_range: ValueRanges = bound_sympy(expr, shape_env.var_to_range)
+    var_range: ValueRanges = bound_sympy(  # pyre-ignore[24]
+        expr, shape_env.var_to_range
+    )
     upper_bound = var_range.upper
 
     # This import is needed temporarily until we update the pinned torch version.
     try:
-        from torch.utils._sympy.numbers import int_oo  # @manual # pyre-ignore
+        from torch.utils._sympy.numbers import int_oo  # @manual
     except ImportError:
         int_oo = None
 
@@ -54,15 +58,15 @@
             concrete_upper, int
         ), f"Expect upper bound to be a concrete int but got {concrete_upper}"
         return concrete_upper
-    elif int_oo is not None and upper_bound is int_oo:  # pyre-ignore
-        return int_oo  # pyre-ignore
+    elif int_oo is not None and upper_bound is int_oo:
+        return int_oo
     else:
         raise RuntimeError(
             f"Expect upper bound to be sympy.Integer or int_oo. but got {upper_bound}"
         )
 
 
-def eval_shape(shape):
+def eval_shape(shape: Iterable[Union[int, torch.SymInt]]):  # pyre-ignore[3]
     """
     Shape maybe immutable so we return a new shape. Return None for dimensions that are unbacked
     e.g. first dimension of nonzero's output.
@@ -73,14 +77,16 @@ def eval_shape(shape):
     return new_shape
 
 
-def eval_shape_upper_bound(shape) -> List[int]:
+def eval_shape_upper_bound(shape: Iterable[Union[int, torch.SymInt]]) -> List[int]:
     new_shape = []
     for _, s in enumerate(shape):
         new_shape.append(eval_upper_bound(s))
     return new_shape
 
 
-def collect_free_symbols(shape) -> Set[sympy.Symbol]:
+def collect_free_symbols(
+    shape: Iterable[Union[int, torch.SymInt]]
+) -> Set[sympy.Symbol]:
     symset = set()
     for sz in shape:
         if not isinstance(sz, torch.SymInt):
diff --git a/exir/tests/test_memory_format_ops_pass_utils.py b/exir/tests/test_memory_format_ops_pass_utils.py
index 6a97fb96ea..93d790d491 100644
--- a/exir/tests/test_memory_format_ops_pass_utils.py
+++ b/exir/tests/test_memory_format_ops_pass_utils.py
@@ -4,12 +4,15 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-unsafe
+
 import unittest
 from dataclasses import dataclass
 from typing import Any, Tuple
 
 import torch
 from executorch.exir import to_edge
+from executorch.exir.capture._config import EdgeCompileConfig
 
 from executorch.exir.dim_order_utils import (
     is_channel_last_dim_order,
@@ -70,7 +73,7 @@ def memory_format_test_runner(
             edge_op_str
         ).run(before.graph_module.code)
 
-        epm = to_edge(before)
+        epm = to_edge(before, compile_config=EdgeCompileConfig(_skip_dim_order=False))
 
         # check op strings
         FileCheck().check_not(aten_op_str).check_count(
diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py
index 292cdfd471..61d3af8afb 100644
--- a/exir/tests/test_passes.py
+++ b/exir/tests/test_passes.py
@@ -1134,7 +1134,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
         add = Add()
 
-        edge = to_edge(export(add, (torch.ones(1),)))
+        edge = to_edge(
+            export(add, (torch.ones(1),)),
+            compile_config=EdgeCompileConfig(_skip_dim_order=False),
+        )
         edge = edge.transform([ScalarToTensorPass(), RemoveMixedTypeOperators()])
         exported_program = lift_constant_tensor_pass(edge.exported_program())
 
diff --git a/exir/verification/test/test_verifier.py b/exir/verification/test/test_verifier.py
index eaf05c6b63..b2e31dbc59 100644
--- a/exir/verification/test/test_verifier.py
+++ b/exir/verification/test/test_verifier.py
@@ -4,8 +4,11 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-unsafe
+
 import unittest
 from contextlib import contextmanager
+from typing import Any
 
 import torch
 from executorch.exir import EdgeCompileConfig, to_edge
@@ -20,7 +23,7 @@
 
 class TestEdgeDialectVerifier(unittest.TestCase):
     @contextmanager
-    def assertNotRaises(self, exc_type):
+    def assertNotRaises(self, exc_type: Any) -> Any:
         try:
             yield None
         except exc_type:
@@ -81,8 +84,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
         export_model = export(m, example_input)
 
-        # In default we use dim order.
-        compile_config_without_edge_op = EdgeCompileConfig(_use_edge_ops=False)
+        compile_config_without_edge_op = EdgeCompileConfig(
+            _use_edge_ops=False, _skip_dim_order=False
+        )
 
         edge_manager = to_edge(
             export_model, compile_config=compile_config_without_edge_op
@@ -128,8 +132,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
         export_model = export(m, example_input)
 
-        # In default we use dim order.
-        compile_config_with_dim_order = EdgeCompileConfig()
+        compile_config_with_dim_order = EdgeCompileConfig(_skip_dim_order=False)
         compile_config_with_stride = EdgeCompileConfig(_skip_dim_order=True)
 
         dim_order_edge_model = to_edge(
diff --git a/kernels/quantized/cpu/op_dequantize.cpp b/kernels/quantized/cpu/op_dequantize.cpp
index 5310feaaef..0722a733cb 100644
--- a/kernels/quantized/cpu/op_dequantize.cpp
+++ b/kernels/quantized/cpu/op_dequantize.cpp
@@ -196,8 +196,8 @@ Tensor& dequantize_per_channel_out(
       "Failed to resize out Tensor in dequantize_per_channel_out");
 
   ET_CHECK_MSG(
-      scale.scalar_type() == ScalarType::Double,
-      "scale.scalar_type() %" PRId8 " is not double type",
+      scale.scalar_type() == ScalarType::Float,
+      "scale.scalar_type() %" PRId8 " is not float type",
       static_cast<int8_t>(scale.scalar_type()));
 
   ET_CHECK_MSG(
@@ -224,15 +224,15 @@ Tensor& dequantize_per_channel_out(
       input, quant_min, quant_max, dtype, out_dtype, out);
 
   // a list contains all dimensions except axis
-  int64_t dims[input.dim() - 1];
+  int64_t dims[kTensorDimensionLimit];
   for (int64_t i = 0; i < input.dim() - 1; i++) {
     if (i < axis) {
       dims[i] = i;
     } else {
-      dims[i] = i - 1;
+      dims[i] = i + 1;
     }
   }
-  const double* scale_data = scale.const_data_ptr<double>();
+  const float* scale_data = scale.const_data_ptr<float>();
   const int64_t* zero_point_data;
   if (opt_zero_points.has_value()) {
     zero_point_data = opt_zero_points.value().const_data_ptr<int64_t>();
@@ -253,8 +253,34 @@
   // in other words you are dequantizing in_data[in_ix]
 #define DEQUANTIZE_IMPL(CTYPE_IN, CTYPE_OUT, out_dtype) \
   case ScalarType::out_dtype: \
+    if (input.dim() == 1) { \
+      auto* out_data_ptr = out.mutable_data_ptr<CTYPE_OUT>(); \
+      const auto* input_data_ptr = input.const_data_ptr<CTYPE_IN>(); \
+      ET_CHECK_MSG( \
+          axis == 0, "Axis must be 0 for a single dimensional tensor"); \
+      const optional<int64_t> dim; \
+      apply_over_dim( \
+          [input_data_ptr, out_data_ptr, scale_data, zero_point_data]( \
+              size_t numel, size_t stride, size_t base_ix) { \
+            for (size_t i = 0; i < numel; i++) { \
+              size_t current_ix = base_ix * stride + i; \
+              float _scale = scale_data[current_ix]; \
+              int64_t zero_point = 0; \
+              if (zero_point_data != nullptr) { \
+                zero_point = zero_point_data[current_ix]; \
+              } \
+              out_data_ptr[current_ix] = \
+                  static_cast<CTYPE_OUT>( \
+                      input_data_ptr[current_ix] - zero_point) * \
+                  _scale; \
+            } \
+          }, \
+          input, \
+          dim); \
+      break; \
+    } \
   for (size_t channel_ix = 0; channel_ix < input.size(axis); ++channel_ix) { \
-    double _scale = scale_data[channel_ix]; \
+    float _scale = scale_data[channel_ix]; \
    int64_t _zero_point = 0; \
    if (zero_point_data != nullptr) { \
       _zero_point = zero_point_data[channel_ix]; \
diff --git a/kernels/quantized/test/op_dequantize_test.cpp b/kernels/quantized/test/op_dequantize_test.cpp
index 1004126d04..4cdebb662b 100644
--- a/kernels/quantized/test/op_dequantize_test.cpp
+++ b/kernels/quantized/test/op_dequantize_test.cpp
@@ -116,11 +116,11 @@ TEST(OpDequantizeOutTest, TensorArgOverload) {
 
 TEST(OpDequantizeOutTest, DequantizePerChannel) {
   TensorFactory<ScalarType::Byte> tf_byte;
-  TensorFactory<ScalarType::Double> tf_double;
+  TensorFactory<ScalarType::Float> tf_float;
   TensorFactory<ScalarType::Long> tf_long;
 
   Tensor input = tf_byte.full({3, 2}, 100);
-  Tensor scale = tf_double.make({2}, {0.5, 1});
+  Tensor scale = tf_float.make({2}, {0.5, 1});
   Tensor zero_point = tf_long.make({2}, {30, 60});
   int64_t quant_min = 0;
   int64_t quant_max = 255;
@@ -145,7 +145,7 @@ TEST(OpDequantizeOutTest, DequantizePerChannel) {
 
   // Test with a different axis
   out = tfo.zeros({3, 2});
-  scale = tf_double.make({3}, {0.5, 0.75, 1});
+  scale = tf_float.make({3}, {0.5, 0.75, 1});
   zero_point = tf_long.make({3}, {30, 50, 60});
   // (100 - 30) * 0.5
   // (100 - 50) * 0.75
@@ -163,4 +163,25 @@ TEST(OpDequantizeOutTest, DequantizePerChannel) {
       out);
 
   EXPECT_TENSOR_EQ(out, expected);
+
+  // Test with a 1-dimensional tensor
+  out = tfo.zeros({3});
+  input = tf_byte.make({3}, {100, 100, 100});
+  scale = tf_float.make({3}, {0.5, 0.75, 1});
+  zero_point = tf_long.make({3}, {30, 50, 60});
+  // (100 - 30) * 0.5
+  // (100 - 50) * 0.75
+  // (100 - 60) * 1
+  expected = tfo.make({3}, {35, 37.5, 40});
+  dequantize_per_channel_out(
+      input,
+      scale,
+      zero_point,
+      /*axis=*/0,
+      quant_min,
+      quant_max,
+      ScalarType::Byte,
+      optional<ScalarType>(),
+      out);
+  EXPECT_TENSOR_EQ(out, expected);
 }
diff --git a/shim/xplat/executorch/build/env_interface.bzl b/shim/xplat/executorch/build/env_interface.bzl
index de82ad3ef9..9a97be98e8 100644
--- a/shim/xplat/executorch/build/env_interface.bzl
+++ b/shim/xplat/executorch/build/env_interface.bzl
@@ -129,6 +129,7 @@ def _remove_unsupported_kwargs(kwargs):
     kwargs.pop("tags", None)  # tags = ["long_running"] doesn't work in oss
     kwargs.pop("types", None)  # will have to find a different way to handle .pyi files in oss
     kwargs.pop("resources", None)  # doesn't support resources in python_library/python_binary yet
+    kwargs.pop("feature", None)  # internal-only, used for Product-Feature Hierarchy (PFH)
     return kwargs
 
 def _patch_headers(kwargs):
diff --git a/shim/xplat/executorch/codegen/codegen.bzl b/shim/xplat/executorch/codegen/codegen.bzl
index 3fadd20f0d..34a8f81e87 100644
--- a/shim/xplat/executorch/codegen/codegen.bzl
+++ b/shim/xplat/executorch/codegen/codegen.bzl
@@ -350,7 +350,7 @@ def copy_portable_header_files(name):
         default_outs = ["."],
     )
 
-def build_portable_lib(name, oplist_header_name):
+def build_portable_lib(name, oplist_header_name, feature = None):
     """Build portable lib from source. We build from source so that the generated header file,
     selected_op_variants.h, can be used to selectively build the lib for different dtypes.
     """
@@ -400,6 +400,7 @@ def build_portable_lib(name, oplist_header_name):
         # via static initializers that run at program startup.
         # @lint-ignore BUCKLINT link_whole
         link_whole = True,
+        feature = feature,
     )
 
 def executorch_generated_lib(
@@ -421,7 +422,8 @@ def executorch_generated_lib(
         platforms = get_default_executorch_platforms(),
        compiler_flags = [],
        kernel_deps = [],
-        dtype_selective_build = False):
+        dtype_selective_build = False,
+        feature = None):
     """Emits 0-3 C++ library targets (in fbcode or xplat) containing code to dispatch
     the operators specified in the provided yaml files.
 
@@ -469,6 +471,7 @@ def executorch_generated_lib(
         fbcode_deps: Additional fbcode deps, can be used to provide custom operator library.
         compiler_flags: compiler_flags args to runtime.cxx_library
         dtype_selective_build: In additional to operator selection, dtype selective build further selects the dtypes for each operator. Can be used with model or dict selective build APIs, where dtypes can be specified. Note: this is only available in xplat.
+        feature: Product-Feature Hierarchy (PFH). For internal use only, required for FoA in production. See: https://fburl.com/wiki/2wzjpyqy
     """
     if functions_yaml_target and aten_mode:
         fail("{} is providing functions_yaml_target in ATen mode, it will be ignored. `native_functions.yaml` will be the source of truth.".format(name))
@@ -560,7 +563,7 @@ def executorch_generated_lib(
 
     # Build portable lib.
     portable_lib_name = name + "_portable_lib"
-    build_portable_lib(portable_lib_name, oplist_header_name)
+    build_portable_lib(portable_lib_name, oplist_header_name, feature)
     portable_lib = [":{}".format(portable_lib_name)]
 
     # Exports headers that declare the function signatures of the C++ functions
@@ -583,6 +586,7 @@ def executorch_generated_lib(
             "//executorch/codegen:macros",
             "//executorch/runtime/kernel:kernel_runtime_context" + aten_suffix,
         ],
+        feature = feature,
     )
 
     if name in libs:
@@ -624,6 +628,7 @@ def executorch_generated_lib(
             # of //executorch.
             _is_external_target = True,
             platforms = platforms,
+            feature = feature,
         )
 
     if custom_ops_yaml_target and custom_ops_requires_aot_registration:
diff --git a/version.txt b/version.txt
index c181bf5996..f28aaa5cd4 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.3.0a0
+0.4.0a0