From a04dd040b571bee67dd6bb97330db5fbddd841a5 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Sat, 16 Mar 2024 11:37:08 +0000 Subject: [PATCH] 2024-03-16 nightly release (39c93aaa0577eb92f28129b5cac064d197093247) --- .github/workflows/apple.yml | 51 +- .../glsl/{all_shaders.yaml => binary_op.yaml} | 32 -- .../runtime/graph/ops/glsl/image_to_nchw.yaml | 21 + .../runtime/graph/ops/glsl/nchw_to_image.yaml | 21 + .../xnnpack/partition/graphs/bilinear_2d.py | 6 +- backends/xnnpack/partition/graphs/sdpa.py | 4 +- build/install_flatc.sh | 20 +- examples/models/llama2/custom_ops/op_sdpa.cpp | 4 - .../models/llama2/install_requirements.sh | 7 +- examples/models/llama2/ops/quantized_ops.py | 94 ++-- examples/models/llama2/quantize.py | 511 +----------------- exir/passes/TARGETS | 1 + .../_quant_patterns_and_replacements.py | 134 ++++- extension/android/CMakeLists.txt | 2 +- pyproject.toml | 1 + 15 files changed, 295 insertions(+), 614 deletions(-) rename backends/vulkan/runtime/graph/ops/glsl/{all_shaders.yaml => binary_op.yaml} (52%) create mode 100644 backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml create mode 100644 backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 0a4a06aa70..0febef6e1f 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -8,7 +8,7 @@ on: pull_request: paths: - .ci/docker/** - - .github/workflows/app-build.yml + - .github/workflows/apple.yml - install_requirements.sh - backends/apple/** - build/build_apple_frameworks.sh @@ -58,7 +58,7 @@ jobs: python-version: '3.11' submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - upload-artifact: executorch.zip + upload-artifact: executorch-frameworks-ios timeout: 90 script: | WORKSPACE=$(pwd) @@ -90,3 +90,50 @@ jobs: zip -r "${RUNNER_TEMP}/artifacts/${OUTPUT}.zip" "${OUTPUT}" popd + + upload-frameworks-ios: + runs-on: ubuntu-22.04 + needs: build-frameworks-ios + timeout-minutes: 30 + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: pip + - name: configure aws credentials + uses: aws-actions/configure-aws-credentials@v1.7.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_executorch_upload-frameworks-ios + aws-region: us-east-1 + - name: Download the artifact + uses: actions/download-artifact@v3 + with: + # NB: The name here needs to match the upload-artifact name from build-frameworks-ios job + name: executorch-frameworks-ios + path: ${{ runner.temp }}/frameworks-ios/ + - name: Only push to S3 from main branch + if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} + shell: bash + run: | + set -eux + echo "UPLOAD_ON_MAIN=1" >> "${GITHUB_ENV}" + - name: Upload the artifact to ossci-ios S3 bucket + shell: bash + run: | + set -eux + + pip install awscli==1.32.18 + + AWS_CMD="aws s3 cp --dryrun" + if [[ "${UPLOAD_ON_MAIN:-0}" == "1" ]]; then + AWS_CMD="aws s3 cp" + fi + + for FILENAME in "${RUNNER_TEMP}"/frameworks-ios/*.zip; do + [ -e "${FILENAME}" ] || continue + ${AWS_CMD} "${FILENAME}" s3://ossci-ios/executorch/ --acl public-read + done diff --git a/backends/vulkan/runtime/graph/ops/glsl/all_shaders.yaml b/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml similarity index 52% rename from backends/vulkan/runtime/graph/ops/glsl/all_shaders.yaml rename to backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml 
index a1abc6a745..01dde35aba 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/all_shaders.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml @@ -28,35 +28,3 @@ binary_op: OPERATOR: pow(X, Y) - NAME: binary_floor_divide OPERATOR: floor(X / Y) - -image_to_nchw: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - PACKING: CHANNELS_PACKED - generate_variant_forall: - DTYPE: - - VALUE: "half" - SUFFIX: "half" - - VALUE: "float" - SUFFIX: "float" - shader_variants: - - NAME: image3d_to_nchw_C_packed - - NAME: image2d_to_nchw_C_packed - NDIM: 2 - -nchw_to_image: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - PACKING: CHANNELS_PACKED - generate_variant_forall: - DTYPE: - - VALUE: "half" - SUFFIX: "half" - - VALUE: "float" - SUFFIX: "float" - shader_variants: - - NAME: nchw_to_image3d_C_packed - - NAME: nchw_to_image2d_C_packed - NDIM: 2 diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml new file mode 100644 index 0000000000..d648d842ae --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +image_to_nchw: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + PACKING: CHANNELS_PACKED + generate_variant_forall: + DTYPE: + - VALUE: "half" + SUFFIX: "half" + - VALUE: "float" + SUFFIX: "float" + shader_variants: + - NAME: image3d_to_nchw_C_packed + - NAME: image2d_to_nchw_C_packed + NDIM: 2 diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml new file mode 100644 index 0000000000..1b43326b34 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
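# (Editorial note, not part of the patch: in both of the split-out transfer-shader
# configs, `generate_variant_forall` presumably crosses each `shader_variants` NAME
# with each DTYPE SUFFIX, so the codegen would be expected to emit variants along
# the lines of nchw_to_image3d_C_packed_half / nchw_to_image3d_C_packed_float below,
# and likewise image3d_to_nchw_C_packed_* for the image_to_nchw config above. The
# exact naming scheme is an assumption about the shader generator, not stated here.)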
+ +nchw_to_image: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + PACKING: CHANNELS_PACKED + generate_variant_forall: + DTYPE: + - VALUE: "half" + SUFFIX: "half" + - VALUE: "float" + SUFFIX: "float" + shader_variants: + - NAME: nchw_to_image3d_C_packed + - NAME: nchw_to_image2d_C_packed + NDIM: 2 diff --git a/backends/xnnpack/partition/graphs/bilinear_2d.py b/backends/xnnpack/partition/graphs/bilinear_2d.py index b956c11df9..a971cb9244 100644 --- a/backends/xnnpack/partition/graphs/bilinear_2d.py +++ b/backends/xnnpack/partition/graphs/bilinear_2d.py @@ -10,6 +10,8 @@ import executorch.exir as exir import torch +from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config + @lru_cache(maxsize=None) def _get_bilinear_2d_graphs(): @@ -37,7 +39,9 @@ def forward(self, x): for config in capture_configs: edge = exir.capture( bilinear2d(align_corners), sample_inputs, config - ).to_edge() + ).to_edge( + config=get_xnnpack_edge_compile_config(), + ) _bilinear2d_graphs[edge.exported_program.graph_module] = align_corners return _bilinear2d_graphs diff --git a/backends/xnnpack/partition/graphs/sdpa.py b/backends/xnnpack/partition/graphs/sdpa.py index 94abfda33b..4f4afa92e2 100644 --- a/backends/xnnpack/partition/graphs/sdpa.py +++ b/backends/xnnpack/partition/graphs/sdpa.py @@ -8,6 +8,7 @@ from typing import List, Optional import torch +from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config from executorch.exir import to_edge from torch import Tensor from torch.export import export @@ -75,7 +76,8 @@ def forward( v, mask, ), - ) + ), + compile_config=get_xnnpack_edge_compile_config(), ) gm = edge.exported_program().graph_module graphs.append(gm) diff --git a/build/install_flatc.sh b/build/install_flatc.sh index 9dfed2b7c8..75b4e41883 100755 --- a/build/install_flatc.sh +++ b/build/install_flatc.sh @@ -26,9 +26,23 @@ readonly NC="\033[0m" # No Color # Prints the flatbuffers version of the git submodule. print_flatbuffers_version(){ - pushd "${FLATBUFFERS_PATH}" > /dev/null - git describe --tags "$(git rev-list --tags --max-count=1)" | sed 's/^v//' - popd > /dev/null + local version_file="${FLATBUFFERS_PATH}/package.json" + local version + # Extract the version from the first line like `"version": "23.5.26",` + # First remove the final double quote, then remove everything + # before the now-final double quote. 
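    # (Editorial sketch, not part of the patch: the pipeline below can be
    # sanity-checked standalone; the sample line simply echoes the format
    # assumed in the comment above.)
    #     printf '%s\n' '  "version": "23.5.26",' | sed -e 's/"[^"]*$//' -e 's/.*"//'
    #     # prints: 23.5.26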
+ version="$( + grep '"version"\s*:' "${version_file}" \ + | head -1 \ + | sed -e 's/"[^"]*$//' \ + | sed -e 's/.*"//' + )" + if [[ ${version} =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "${version}" + else + echo "ERROR: Bad version '${version}'; could not find version in ${version_file}" >&2 + exit 1 + fi } main() { diff --git a/examples/models/llama2/custom_ops/op_sdpa.cpp b/examples/models/llama2/custom_ops/op_sdpa.cpp index 16732b7153..6638852f7d 100644 --- a/examples/models/llama2/custom_ops/op_sdpa.cpp +++ b/examples/models/llama2/custom_ops/op_sdpa.cpp @@ -177,10 +177,6 @@ inline void fill_stub(scalar_t* data, scalar_t val, int64_t size) { for (; d < size - (size % Vec::size()); d += Vec::size()) { data_vec.store(data + d); } -#if !defined(_MSC_VER) && !defined(COMPILING_FOR_MIN_SIZE) && \ - !defined(__ANDROID__) -#pragma unroll -#endif for (; d < size; d++) { data[d] = val; } diff --git a/examples/models/llama2/install_requirements.sh b/examples/models/llama2/install_requirements.sh index c7a7f41ffb..3a86b9aacd 100644 --- a/examples/models/llama2/install_requirements.sh +++ b/examples/models/llama2/install_requirements.sh @@ -10,13 +10,8 @@ pip install snakeviz sentencepiece pip install torchao-nightly -# Install datasets for HuggingFace dataloader -# v2.14.0 is intentional to force lm-eval v0.3.0 compatibility -pip install datasets==2.14.0 - # Install lm-eval for Model Evaluation with lm-evalution-harness -# v0.3.0 is intentional -pip install lm-eval==0.3. +pip install lm-eval # Call the install helper for further setup python examples/models/llama2/install_requirement_helper.py diff --git a/examples/models/llama2/ops/quantized_ops.py b/examples/models/llama2/ops/quantized_ops.py index 2ab8df3080..5d13856442 100644 --- a/examples/models/llama2/ops/quantized_ops.py +++ b/examples/models/llama2/ops/quantized_ops.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Optional + import torch from torch.library import impl, impl_abstract @@ -62,43 +64,45 @@ def embedding_byte_weight_checks(weight, weight_scales, weight_zero_points): assert weight_zero_points is None or weight_zero_points.size(0) == weight.size( 0 ), f"Expecting weight_zero_points tensor to be None or have same number of rows as weights, but found {weight.size()} and {weight_zero_points.size()}" - if not weight_zero_points: - weight_zero_points = torch.zeros(weight.size(0)) @impl(quantized_lib, "embedding_byte", "CompositeExplicitAutograd") -def embedding_byte_meta( - weight, - weight_scales, - weight_zero_points, - weight_quant_min, - weight_quant_max, - indices, -): +def embedding_byte( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: Optional[torch.Tensor], + weight_quant_min: int, + weight_quant_max: int, + indices: torch.Tensor, +) -> torch.Tensor: embedding_byte_weight_checks(weight, weight_scales, weight_zero_points) - weight = torch.ops.quantized_decomposed.dequantize_per_channel.default( + group_size = weight.size(1) // ( + weight_scales.size(1) if weight_scales.dim() == 2 else 1 + ) + weight = torch.ops.quantized_decomposed.dequantize_per_channel_group.default( weight, weight_scales, weight_zero_points, - 0, weight_quant_min, weight_quant_max, weight.dtype, + group_size, + weight_scales.dtype, ) return torch.ops.aten.embedding.default(weight, indices) @impl_abstract("llama_quantized::embedding_byte.out") def embedding_byte_out_meta( - weight, - weight_scales, - weight_zero_points, - weight_quant_min, - weight_quant_max, - indices, - out, -): - return embedding_byte_meta( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: Optional[torch.Tensor], + weight_quant_min: int, + weight_quant_max: int, + indices: torch.Tensor, + out: torch.Tensor, +) -> torch.Tensor: + return embedding_byte( weight, weight_scales, weight_zero_points, @@ -109,42 +113,46 @@ def embedding_byte_out_meta( @impl(quantized_lib, "embedding_byte.dtype", "CompositeExplicitAutograd") -def embedding_byte_dtype_meta( - weight, - weight_scales, - weight_zero_points, - weight_quant_min, - weight_quant_max, - indices, +def embedding_byte_dtype( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: Optional[torch.Tensor], + weight_quant_min: int, + weight_quant_max: int, + indices: torch.Tensor, *, - dtype, -): + dtype: Optional[torch.dtype] = None, +) -> torch.Tensor: embedding_byte_weight_checks(weight, weight_scales, weight_zero_points) - weight = torch.ops.quantized_decomposed.dequantize_per_channel.default( + group_size = weight.size(1) // ( + weight_scales.size(1) if weight_scales.dim() == 2 else 1 + ) + weight = torch.ops.quantized_decomposed.dequantize_per_channel_group.default( weight, weight_scales, weight_zero_points, - 0, weight_quant_min, weight_quant_max, weight.dtype, + group_size, + dtype, ) - return torch.ops.aten.embedding.default(weight, indices).to(dtype) + return torch.ops.aten.embedding.default(weight, indices) @impl_abstract("llama_quantized::embedding_byte.dtype_out") def embedding_byte_dtype_out_meta( - weight, - weight_scales, - weight_zero_points, - weight_quant_min, - weight_quant_max, - indices, + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: Optional[torch.Tensor], + weight_quant_min: int, + weight_quant_max: int, + indices: torch.Tensor, *, - dtype, - out, -): - return embedding_byte_dtype_meta( + dtype: Optional[torch.dtype] = None, + out: torch.Tensor, +) -> 
torch.Tensor: + return embedding_byte_dtype( weight, weight_scales, weight_zero_points, diff --git a/examples/models/llama2/quantize.py b/examples/models/llama2/quantize.py index 8c02700e4c..71d862d764 100644 --- a/examples/models/llama2/quantize.py +++ b/examples/models/llama2/quantize.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import math from functools import reduce from math import gcd from typing import Dict, Optional, Tuple @@ -14,11 +13,12 @@ import torch.nn.functional as F from .ops.quantized_ops import * # noqa -from torch.ao.quantization.fx._decomposed import ( - _quant_min_max_bounds_check, - quantized_decomposed_lib, +from torchao.quantization.quant_primitives import ( + get_group_qparams_symmetric, + group_quantize_tensor_symmetric, + pack_scales_and_zeros, + per_token_dynamic_quant, ) -from torch.library import impl try: @@ -131,507 +131,6 @@ def dynamically_quantize_per_channel( return quant, scales, zero_points -# TODO: move this to https://github.com/pytorch/pytorch/blob/main/torch/ao/quantization/fx/_decomposed.py -quantized_decomposed_lib.define( - "choose_qparams_per_token(Tensor input, ScalarType dtype) -> (Tensor, Tensor)" -) - - -@impl( - quantized_decomposed_lib, - "choose_qparams_per_token", - "CompositeExplicitAutograd", -) -def choose_qparams_per_token( - input: torch.Tensor, - dtype: torch.dtype, -) -> Tuple[torch.Tensor, torch.Tensor]: - """Choose quantization parameters for per token quantization. This means for a N dimension Tensor - (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize - every N elements with the same quantization parameter. The dimension for scales/zero_points - will be (M1 * M2 ... * Mn) - - Args: - input (torch.Tensor): original float32/float16 Tensor - dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor - - Returns: - scales and zero_points, both float32 Tensors - """ - - scales = input.abs().amax(dim=-1, keepdim=True) - if scales.dtype == torch.float16: - scales = ( - scales.float() - ) # want float scales to avoid overflows for fp16, (bf16 has wide enough range) - if dtype == torch.int8: - n_bits = 8 - quant_max = 2 ** (n_bits - 1) - 1 - else: - raise Exception(f"unsupported dtype in choose_qparams_per_token: {dtype}") - - scales = scales.clamp(min=1e-5).div(quant_max) - zero_points = torch.zeros_like(scales) - return scales, zero_points - - -@impl( - quantized_decomposed_lib, - "choose_qparams_per_token", - "Meta", -) -def choose_qparams_per_token_meta( - input: torch.Tensor, - dtype: torch.dtype, -) -> Tuple[torch.Tensor, torch.Tensor]: - size = (1, input.size(-1)) - return torch.empty(size, dtype=torch.double, device=input.device), torch.empty( - size, dtype=torch.int64, device=input.device - ) - - -# TODO: move this to https://github.com/pytorch/pytorch/blob/main/torch/ao/quantization/fx/_decomposed.py -quantized_decomposed_lib.define( - "choose_qparams_per_token_asymmetric(Tensor input, ScalarType dtype) -> (Tensor, Tensor)" -) - - -@impl( - quantized_decomposed_lib, - "choose_qparams_per_token_asymmetric", - "CompositeExplicitAutograd", -) -def choose_qparams_per_token_asymmetric( - input: torch.Tensor, - dtype: torch.dtype, -) -> Tuple[torch.Tensor, torch.Tensor]: - """Choose quantization parameters for per token quantization. 
This means for a N dimension Tensor - (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize - every N elements with the same quantization parameter. The dimension for scales/zero_points - will be (M1 * M2 ... * Mn) - - Args: - input (torch.Tensor): original float32/float16 Tensor - dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor - - Returns: - scales and zero_points, both float32 Tensors - """ - # Based on https://github.com/google/XNNPACK/blob/df156f0cf3db5a4576cc711123eeb54915f82ffc/src/xnnpack/quantization.h#L18 - qmin, qmax = -128, 127 - min_val, max_val = torch.aminmax(input, dim=-1, keepdim=True) - min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) - max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) - eps = torch.finfo(torch.float32).eps # use xnnpack eps? - - # scale - scale = (max_val_pos - min_val_neg) / float(qmax - qmin) - scale = scale.clamp(min=eps) - - # zero point - descaled_min = min_val_neg / scale - descaled_max = max_val_pos / scale - zero_point_from_min_error = qmin + descaled_min - zero_point_from_max_error = qmax + descaled_max - zero_point = torch.where( - zero_point_from_min_error + zero_point_from_max_error > 0, - qmin - descaled_min, - qmax - descaled_max, - ) - zero_point = torch.clamp(zero_point, qmin, qmax).round() - - return scale.to(torch.float32), zero_point.to(torch.float32) - - -@impl( - quantized_decomposed_lib, - "choose_qparams_per_token_asymmetric", - "Meta", -) -def choose_qparams_per_token_asymmetric_meta( - input: torch.Tensor, - dtype: torch.dtype, -) -> Tuple[torch.Tensor, torch.Tensor]: - size = (1, input.size(-1)) - return torch.empty(size, dtype=torch.double, device=input.device), torch.empty( - size, dtype=torch.int64, device=input.device - ) - - -def _per_token_quant_qparam_dim_check(input, scales, zero_points): - num_tokens = math.prod(list(input.size())[:-1]) - assert ( - num_tokens == scales.numel() - ), f"num_tokens: {num_tokens} scales: {scales.size()}" - assert ( - num_tokens == zero_points.numel() - ), f"num_tokens: {num_tokens} zero_points: {zero_points.size()}" - - -quantized_decomposed_lib.define( - "quantize_per_token(Tensor input, Tensor scales, Tensor zero_points, " - "int quant_min, int quant_max, ScalarType dtype) -> Tensor" -) - - -@impl(quantized_decomposed_lib, "quantize_per_token", "CompositeExplicitAutograd") -def quantize_per_token( - input: torch.Tensor, - scales: torch.Tensor, - zero_points: torch.Tensor, - quant_min: int, - quant_max: int, - dtype: torch.dtype, -): - """Per token quantization for the Tensor using the quantization parameters to map - from floating point to quantized values. This means for a N dimension Tensor - (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize - every N elements with the same quantization parameter. The dimension for scales/zero_points - will be (M1 * M2 ... * Mn) - - Args: - input (torch.Tensor): original float32 or bfloat16 Tensor - scales (float32 torch.Tensor): quantization parameter for per token affine quantization - zero_points (int32 torch.Tensor): quantization parameter for per token affine quantization - quant_min (int): minimum quantized value for output Tensor - quant_max (int): maximum quantized value for output Tensor - dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor - - Returns: - Tensor with requested dtype (e.g. 
torch.uint8), note the quantization parameters - are not stored in the Tensor, we are storing them in function arguments instead - """ - _quant_min_max_bounds_check(quant_min, quant_max, dtype) - _per_token_quant_qparam_dim_check(input, scales, zero_points) - input = ( - torch.round(input / scales + zero_points).clamp(quant_min, quant_max).to(dtype) - ) - return input - - -@impl(quantized_decomposed_lib, "quantize_per_token", "Meta") -def quantize_per_token_meta( - input: torch.Tensor, - scales: torch.Tensor, - zero_points: torch.Tensor, - quant_min: int, - quant_max: int, - dtype: torch.dtype, -): - _quant_min_max_bounds_check(quant_min, quant_max, dtype) - return torch.empty_like(input, dtype=dtype) - - -quantized_decomposed_lib.define( - "dequantize_per_token(Tensor input, Tensor scales, Tensor zero_points, " - "int quant_min, int quant_max, ScalarType dtype, ScalarType output_dtype) -> Tensor" -) - - -@impl(quantized_decomposed_lib, "dequantize_per_token", "CompositeExplicitAutograd") -def dequantize_per_token( - input: torch.Tensor, - scales: torch.Tensor, - zero_points: torch.Tensor, - quant_min: int, - quant_max: int, - dtype: torch.dtype, - output_dtype: torch.dtype = torch.float32, -): - """Per token dequantization for the Tensor using the quantization parameters to map - from floating point to quantized values. This means for a N dimension Tensor - (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize - every N elements with the same quantization parameter. The dimension for scales/zero_points - will be (M1 * M2 ... * Mn) - - Args: - input (torch.Tensor): quantized Tensor (uint8, int8 etc.) - scales (float32 torch.Tensor): quantization parameter for per token affine quantization - zero_points (int32 torch.Tensor): quantization parameter for per token affine quantization - quant_min (int): minimum quantized value for input Tensor - quant_max (int): maximum quantized value for input Tensor - dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor - output_dtype (torch.dtype): dtype (e.g. torch.float32) for output Tensor - - Returns: - dequantized Tensor with dtype `output_dtype` - """ - input = input - zero_points - input = input.to(output_dtype) * scales - return input - - -@impl(quantized_decomposed_lib, "dequantize_per_token", "Meta") -def dequantize_per_token_meta( - input: torch.Tensor, - scales: torch.Tensor, - zero_points: torch.Tensor, - quant_min: int, - quant_max: int, - dtype: torch.dtype, - output_dtype: torch.dtype = torch.float32, -): - _quant_min_max_bounds_check(quant_min, quant_max, dtype) - # TODO: support fp16 - return torch.empty_like(input, dtype=output_dtype) - - -def get_group_qparams_symmetric(w, n_bit=4, groupsize=128, precision=torch.float32): - # needed for GPTQ with padding - if groupsize > w.shape[-1]: - groupsize = w.shape[-1] - assert groupsize > 1 - assert w.shape[-1] % groupsize == 0 - assert w.dim() == 2 - - to_quant = w.reshape(-1, groupsize) - assert torch.isnan(to_quant).sum() == 0 - - max_val = to_quant.amax(dim=1, keepdim=True) - min_val = to_quant.amin(dim=1, keepdim=True) - min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) - max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) - - max_val_abs = torch.max(-min_val_neg, max_val_pos) - max_int = 2 ** (n_bit - 1) - 1 - min_int = -(2 ** (n_bit - 1)) - - scales = max_val_abs / (float(max_int - min_int) / 2) - scales = torch.max(scales, torch.full_like(scales, torch.finfo(torch.float32).eps)) - # TODO: make sure abs(scales) is not too small? 
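    # (Editorial worked example, not part of the patch: with the default n_bit=4 the
    # integer range is [min_int, max_int] = [-8, 7], so scales = max_val_abs / 7.5;
    # a group whose largest-magnitude weight is 1.5 gets scale 0.2, and the symmetric
    # scheme below keeps every zero point at 0.)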
- zeros = torch.full_like(scales, 0) - return scales.to(precision).reshape(w.shape[0], -1), zeros.to(precision).reshape( - w.shape[0], -1 - ) - - -def pack_scales_and_zeros(scales, zeros, precision=torch.float16): - assert scales.shape == zeros.shape - assert scales.dtype == precision - assert zeros.dtype == precision - return ( - torch.cat( - [ - scales.reshape(scales.size(0), scales.size(1), 1), - zeros.reshape(zeros.size(0), zeros.size(1), 1), - ], - 2, - ) - .transpose(0, 1) - .contiguous() - ) - - -quantized_decomposed_lib.define( - "quantize_per_channel_group(Tensor input, Tensor scales, Tensor zero_points, int quant_min, " - "int quant_max, ScalarType dtype, int group_size) -> Tensor" -) - - -# TODO: dtype is ignored for now -@impl( - quantized_decomposed_lib, "quantize_per_channel_group", "CompositeExplicitAutograd" -) -def quantize_per_channel_group( - input: torch.Tensor, - scales: torch.Tensor, - zero_points: torch.Tensor, - quant_min: int, - quant_max: int, - dtype: torch.dtype, - group_size=128, -): - assert group_size > 1 - # needed for GPTQ single column quantize - if group_size > input.shape[-1] and scales.shape[-1] == 1: - group_size = input.shape[-1] - - assert input.shape[-1] % group_size == 0 - assert input.dim() == 2 - - # TODO: check for dtype, currently we can't express torch.int4 so it's omitted - to_quant = input.reshape(-1, group_size) - assert torch.isnan(to_quant).sum() == 0 - - scales = scales.reshape(-1, 1) - zero_points = zero_points.reshape(-1, 1) - - input_int8 = ( - to_quant.div(scales) - .add(zero_points) - .round() - .clamp_(quant_min, quant_max) - .to(dtype) - .reshape_as(input) - ) - - return input_int8 - - -@impl(quantized_decomposed_lib, "quantize_per_channel_group", "Meta") -def quantize_per_channel_group_meta( - input: torch.Tensor, - scales: torch.Tensor, - zero_points: torch.Tensor, - quant_min: int, - quant_max: int, - dtype: torch.dtype, - group_size=128, -): - """Groupwise quantization within each channel for an 2-d Tensor using the quantization parameters - to map from floating point to quantized values. This means for each row of a 2-d Tensor - (M, N), we calculate scales/zero_points for each `group_size` elements - and quantize every `group_size` elements with the same quantization parameter. - The dimension for scales/zero_points will be (M * ceil(N, group_size),) - - Args: - input (torch.Tensor): original float32 or bfloat16 Tensor - scales (float32 torch.Tensor): quantization parameter for per channel group affine quantization - zero_points (int32 torch.Tensor): quantization parameter for per channel group affine quantization - quant_min (int): minimum quantized value for output Tensor - quant_max (int): maximum quantized value for output Tensor - dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor - - Returns: - Tensor with requested dtype (e.g. 
torch.uint8), note the quantization parameters - are not stored in the Tensor, we are storing them in function arguments instead - """ - assert group_size > 1 - # needed for GPTQ single column quantize - if group_size > input.shape[-1] and scales.shape[-1] == 1: - group_size = input.shape[-1] - - assert input.shape[-1] % group_size == 0 - assert input.dim() == 2 - return torch.empty_like(input, dtype=dtype) - - -def group_quantize_tensor_symmetric( - w, n_bit=4, group_size=128, precision=torch.float32 -): - scales, zeros = get_group_qparams_symmetric(w, n_bit, group_size, precision) - n_bit = 4 - max_int = 2 ** (n_bit - 1) - 1 - min_int = -(2 ** (n_bit - 1)) - # TODO: currently we don't know how to express torch.int4, we'll - # add torch.int4 to core later - w_int8 = torch.ops.quantized_decomposed.quantize_per_channel_group( - w, scales, zeros, min_int, max_int, torch.int8, group_size - ) - - return w_int8, scales, zeros - - -quantized_decomposed_lib.define( - "dequantize_per_channel_group(Tensor input, Tensor scales, Tensor zero_points, int quant_min, " - "int quant_max, ScalarType dtype, int group_size, ScalarType output_dtype) -> Tensor" -) - - -@impl( - quantized_decomposed_lib, - "dequantize_per_channel_group", - "CompositeExplicitAutograd", -) -def dequantize_per_channel_group( - w_int8: torch.Tensor, - scales: torch.Tensor, - zero_points: torch.Tensor, - quant_min: int, - quant_max: int, - dtype: torch.dtype, - group_size: int = 128, - output_dtype: torch.dtype = torch.float32, -): - """Groupwise dequantization within each channel for an 2-d Tensor using the quantization parameters - to map from floating point to quantized values. This means for each row of a 2-d Tensor - (M, N), we calculate scales/zero_points for each `group_size` elements - and quantize every `group_size` elements with the same quantization parameter. - The dimension for scales/zero_points will be (M * ceil(N, group_size),) - - Args: - input (torch.Tensor): quantized Tensor (uint8/int8 etc.) - scales (float32 torch.Tensor): quantization parameter for per channel group affine quantization - zero_points (int32 torch.Tensor): quantization parameter for per channel group affine quantization - quant_min (int): minimum quantized value for input Tensor - quant_max (int): maximum quantized value for input Tensor - dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor - output_dtype (torch.dtype): dtype (e.g. 
torch.float32) for output Tensor - - Returns: - dequantized Tensor with dtype `output_dtype` - """ - - assert group_size > 1 - # needed for GPTQ single column dequantize - if group_size > w_int8.shape[-1] and scales.shape[-1] == 1: - group_size = w_int8.shape[-1] - assert w_int8.shape[-1] % group_size == 0 - assert w_int8.dim() == 2 - - w_int8_grouped = w_int8.reshape(-1, group_size) - scales = scales.reshape(-1, 1) - zero_points = zero_points.reshape(-1, 1) - w_dq = ( - w_int8_grouped.sub(zero_points).mul(scales).reshape_as(w_int8).to(output_dtype) - ) - return w_dq - - -def down_size(size): - assert size[-1] % 2 == 0, f"{size} last dim not divisible by two" - return (*size[:-1], size[-1] // 2) - - -def up_size(size): - return (*size[:-1], size[-1] * 2) - - -quantized_decomposed_lib.define("pack_int4_from_int8(Tensor int8_data) -> Tensor") - - -@impl(quantized_decomposed_lib, "pack_int4_from_int8", "CompositeExplicitAutograd") -def pack_int4_from_int8(int8_data: torch.Tensor) -> torch.Tensor: - # converting to uint8 for operations - shape = int8_data.shape - assert shape[-1] % 2 == 0 - int8_data = int8_data.contiguous().view(-1) - return (int8_data[::2] << 4 | int8_data[1::2]).view(down_size(shape)) - - -quantized_decomposed_lib.define("unpack_int4_to_int8(Tensor int8_data) -> Tensor") - - -@impl(quantized_decomposed_lib, "unpack_int4_to_int8", "CompositeExplicitAutograd") -def unpack_int4_to_int8(int8_data: torch.Tensor) -> torch.Tensor: - """Get the original weight from the normalized float weight format""" - # since we are using int8 we will decode 2 entries per byte - # Shift elements down 4 and select out the bottom 4 bits - shape = int8_data.shape - first_elements = (int8_data >> 4).to(torch.int8) - second_elements = (int8_data & 0b1111).to(torch.int8) - return torch.stack([first_elements, second_elements], dim=-1).view(up_size(shape)) - - -def per_token_dynamic_quant(input: torch.Tensor) -> torch.Tensor: - orig_dtype = input.dtype - ( - scales, - zero_points, - ) = torch.ops.quantized_decomposed.choose_qparams_per_token(input, torch.int8) - - # TODO: get these from torch.int8 - quant_min = -128 - quant_max = 127 - input = torch.ops.quantized_decomposed.quantize_per_token( - input, scales, zero_points, quant_min, quant_max, torch.int8 - ) - input = torch.ops.quantized_decomposed.dequantize_per_token( - input, scales, zero_points, quant_min, quant_max, torch.int8, orig_dtype - ) - return input - - class QuantHandler: def __init__(self, mod): self.mod = mod diff --git a/exir/passes/TARGETS b/exir/passes/TARGETS index 04c519a8ab..56dee3eb91 100644 --- a/exir/passes/TARGETS +++ b/exir/passes/TARGETS @@ -127,6 +127,7 @@ python_library( "//caffe2:torch", "//executorch/exir:pass_base", "//executorch/exir/dialects:lib", + "//pytorch/ao:torchao", ], ) diff --git a/exir/passes/_quant_patterns_and_replacements.py b/exir/passes/_quant_patterns_and_replacements.py index 0cb9707774..334e739893 100644 --- a/exir/passes/_quant_patterns_and_replacements.py +++ b/exir/passes/_quant_patterns_and_replacements.py @@ -9,13 +9,12 @@ import torch from executorch.exir.dialects._ops import bind_pattern_to_op, ops as exir_ops - from executorch.exir.passes.replace_aten_with_edge_pass import ( aten_to_edge, should_lower_to_edge, ) from torch import fx -from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib +from torchao.quantization.quant_primitives import quantized_decomposed_lib __all__ = [ @@ -487,6 +486,50 @@ def replacement( ) return out + @bind_pattern_to_op(quantized_decomposed_lib, 
"embedding_byte") + def pattern_groupwise( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + group_size, + ): + weight = ( + torch.ops.quantized_decomposed.dequantize_per_channel_group.default( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + weight.dtype, + group_size, + weight_scales.dtype, + ) + ) + out = torch.ops.aten.embedding.default(weight, indices) + return out + + def replacement_groupwise( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + group_size, + ): + out = torch.ops.quantized_decomposed.embedding_byte.default( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + ) + return out + @bind_pattern_to_op(quantized_decomposed_lib, "embedding_byte") def pattern_with_padding_idx( weight, @@ -528,35 +571,86 @@ def replacement_with_padding_idx( ) return out - @bind_pattern_to_op(quantized_decomposed_lib, "embedding_byte.dtype") - def pattern_with_dtype( + @bind_pattern_to_op(quantized_decomposed_lib, "embedding_byte") + def pattern_with_padding_idx_groupwise( weight, weight_scales, weight_zero_points, weight_quant_min, weight_quant_max, - indicies, - dtype, + indices, + group_size, + padding_idx, ): - weight = torch.ops.quantized_decomposed.dequantize_per_channel.default( + weight = ( + torch.ops.quantized_decomposed.dequantize_per_channel_group.default( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + weight.dtype, + group_size, + weight_scales.dtype, + ) + ) + out = torch.ops.aten.embedding.default(weight, indices, padding_idx) + return out + + def replacement_with_padding_idx_groupwise( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + group_size, + _, # padding_idx only matters for training and not when running op for inference + ): + out = torch.ops.quantized_decomposed.embedding_byte.default( weight, weight_scales, weight_zero_points, - 0, weight_quant_min, weight_quant_max, - torch.uint8, + indices, ) - out = torch.ops.aten.embedding.default(weight, indicies).to(dtype) return out - def replacement_with_dtype( + @bind_pattern_to_op(quantized_decomposed_lib, "embedding_byte.dtype") + def pattern_with_dtype_groupwise( weight, weight_scales, weight_zero_points, weight_quant_min, weight_quant_max, - indicies, + indices, + group_size, + dtype, + ): + weight = ( + torch.ops.quantized_decomposed.dequantize_per_channel_group.default( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + weight.dtype, + group_size, + dtype, + ) + ) + out = torch.ops.aten.embedding.default(weight, indices) + return out + + def replacement_with_dtype_groupwise( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + group_size, dtype, ): out = torch.ops.quantized_decomposed.embedding_byte.dtype( @@ -565,7 +659,7 @@ def replacement_with_dtype( weight_zero_points, weight_quant_min, weight_quant_max, - indicies, + indices, dtype=dtype, ) return out @@ -576,14 +670,24 @@ def replacement_with_dtype( _trace_and_lower_to_edge_ops(replacement), [], ), + ( + _trace_and_lower_to_edge_ops(pattern_groupwise), + _trace_and_lower_to_edge_ops(replacement_groupwise), + [], + ), ( _trace_and_lower_to_edge_ops(pattern_with_padding_idx), _trace_and_lower_to_edge_ops(replacement_with_padding_idx), [], ), ( - _trace_and_lower_to_edge_ops(pattern_with_dtype), - 
_trace_and_lower_to_edge_ops(replacement_with_dtype), + _trace_and_lower_to_edge_ops(pattern_with_padding_idx_groupwise), + _trace_and_lower_to_edge_ops(replacement_with_padding_idx_groupwise), + [], + ), + ( + _trace_and_lower_to_edge_ops(pattern_with_dtype_groupwise), + _trace_and_lower_to_edge_ops(replacement_with_dtype_groupwise), [], ), ] diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 76069eca92..0d5b69c52e 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -29,7 +29,7 @@ if(CMAKE_TOOLCHAIN_ANDROID) add_library(executorch_llama_jni SHARED jni/jni_layer_llama.cpp) target_link_libraries(executorch_llama_jni fbjni llama_runner - portable_ops_lib) + xnn_executor_runner_lib) target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) endif() diff --git a/pyproject.toml b/pyproject.toml index dc61ca2e7b..b302adc801 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dependencies=[ "sympy", "tabulate", "tomli", + "torchao-nightly", "zstd", ]
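
Editorial appendix (not part of the patch): several hunks above move the embedding_byte
ops and their graph patterns from per-channel to per-channel-group dequantization,
inferring the group size from the shape of the scales tensor. The sketch below is a
minimal, self-contained Python illustration of that math; it mirrors the
dequantize_per_channel_group helper removed from examples/models/llama2/quantize.py,
and the function name groupwise_dequant_reference is hypothetical rather than an actual
torchao or quantized_decomposed API.

import torch

def groupwise_dequant_reference(
    w_int8: torch.Tensor,        # (rows, cols) quantized weights, e.g. int8
    scales: torch.Tensor,        # (rows, cols // group_size)
    zero_points: torch.Tensor,   # same shape as scales; all zeros for symmetric quant
    output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    # Infer group_size the same way the new embedding_byte wrapper does:
    # each column of scales covers one group of weight columns.
    group_size = w_int8.size(1) // (scales.size(1) if scales.dim() == 2 else 1)
    # View the weights as (num_groups, group_size) so every group shares one
    # scale / zero point, then apply (w - zero_point) * scale.
    w_grouped = w_int8.reshape(-1, group_size).to(output_dtype)
    s = scales.reshape(-1, 1).to(output_dtype)
    z = zero_points.reshape(-1, 1).to(output_dtype)
    return ((w_grouped - z) * s).reshape_as(w_int8)

# Example: 2 rows x 8 columns with group_size 4, so scales has shape (2, 2).
w = torch.randint(-8, 8, (2, 8), dtype=torch.int8)
scales = torch.full((2, 2), 0.2)
zeros = torch.zeros_like(scales)
w_dq = groupwise_dequant_reference(w, scales, zeros)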