From a04dd040b571bee67dd6bb97330db5fbddd841a5 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Sat, 16 Mar 2024 11:37:08 +0000 Subject: [PATCH] 2024-03-16 nightly release (39c93aaa0577eb92f28129b5cac064d197093247) --- .github/workflows/apple.yml | 51 +- .../glsl/{all_shaders.yaml => binary_op.yaml} | 32 -- .../runtime/graph/ops/glsl/image_to_nchw.yaml | 21 + .../runtime/graph/ops/glsl/nchw_to_image.yaml | 21 + .../xnnpack/partition/graphs/bilinear_2d.py | 6 +- backends/xnnpack/partition/graphs/sdpa.py | 4 +- build/install_flatc.sh | 20 +- examples/models/llama2/custom_ops/op_sdpa.cpp | 4 - .../models/llama2/install_requirements.sh | 7 +- examples/models/llama2/ops/quantized_ops.py | 94 ++-- examples/models/llama2/quantize.py | 511 +----------------- exir/passes/TARGETS | 1 + .../_quant_patterns_and_replacements.py | 134 ++++- extension/android/CMakeLists.txt | 2 +- pyproject.toml | 1 + 15 files changed, 295 insertions(+), 614 deletions(-) rename backends/vulkan/runtime/graph/ops/glsl/{all_shaders.yaml => binary_op.yaml} (52%) create mode 100644 backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml create mode 100644 backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 0a4a06aa70..0febef6e1f 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -8,7 +8,7 @@ on: pull_request: paths: - .ci/docker/** - - .github/workflows/app-build.yml + - .github/workflows/apple.yml - install_requirements.sh - backends/apple/** - build/build_apple_frameworks.sh @@ -58,7 +58,7 @@ jobs: python-version: '3.11' submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - upload-artifact: executorch.zip + upload-artifact: executorch-frameworks-ios timeout: 90 script: | WORKSPACE=$(pwd) @@ -90,3 +90,50 @@ jobs: zip -r "${RUNNER_TEMP}/artifacts/${OUTPUT}.zip" "${OUTPUT}" popd + + upload-frameworks-ios: + runs-on: ubuntu-22.04 + needs: build-frameworks-ios + timeout-minutes: 30 + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: pip + - name: configure aws credentials + uses: aws-actions/configure-aws-credentials@v1.7.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_executorch_upload-frameworks-ios + aws-region: us-east-1 + - name: Download the artifact + uses: actions/download-artifact@v3 + with: + # NB: The name here needs to match the upload-artifact name from build-frameworks-ios job + name: executorch-frameworks-ios + path: ${{ runner.temp }}/frameworks-ios/ + - name: Only push to S3 from main branch + if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} + shell: bash + run: | + set -eux + echo "UPLOAD_ON_MAIN=1" >> "${GITHUB_ENV}" + - name: Upload the artifact to ossci-ios S3 bucket + shell: bash + run: | + set -eux + + pip install awscli==1.32.18 + + AWS_CMD="aws s3 cp --dryrun" + if [[ "${UPLOAD_ON_MAIN:-0}" == "1" ]]; then + AWS_CMD="aws s3 cp" + fi + + for FILENAME in "${RUNNER_TEMP}"/frameworks-ios/*.zip; do + [ -e "${FILENAME}" ] || continue + ${AWS_CMD} "${FILENAME}" s3://ossci-ios/executorch/ --acl public-read + done diff --git a/backends/vulkan/runtime/graph/ops/glsl/all_shaders.yaml b/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml similarity index 52% rename from backends/vulkan/runtime/graph/ops/glsl/all_shaders.yaml rename to backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml 
index a1abc6a745..01dde35aba 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/all_shaders.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml @@ -28,35 +28,3 @@ binary_op: OPERATOR: pow(X, Y) - NAME: binary_floor_divide OPERATOR: floor(X / Y) - -image_to_nchw: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - PACKING: CHANNELS_PACKED - generate_variant_forall: - DTYPE: - - VALUE: "half" - SUFFIX: "half" - - VALUE: "float" - SUFFIX: "float" - shader_variants: - - NAME: image3d_to_nchw_C_packed - - NAME: image2d_to_nchw_C_packed - NDIM: 2 - -nchw_to_image: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - PACKING: CHANNELS_PACKED - generate_variant_forall: - DTYPE: - - VALUE: "half" - SUFFIX: "half" - - VALUE: "float" - SUFFIX: "float" - shader_variants: - - NAME: nchw_to_image3d_C_packed - - NAME: nchw_to_image2d_C_packed - NDIM: 2 diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml new file mode 100644 index 0000000000..d648d842ae --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +image_to_nchw: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + PACKING: CHANNELS_PACKED + generate_variant_forall: + DTYPE: + - VALUE: "half" + SUFFIX: "half" + - VALUE: "float" + SUFFIX: "float" + shader_variants: + - NAME: image3d_to_nchw_C_packed + - NAME: image2d_to_nchw_C_packed + NDIM: 2 diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml new file mode 100644 index 0000000000..1b43326b34 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
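# (Editorial note, not part of the patch: in both of the split-out transfer-shader
# configs, `generate_variant_forall` presumably crosses each `shader_variants` NAME
# with each DTYPE SUFFIX, so the codegen would be expected to emit variants along
# the lines of nchw_to_image3d_C_packed_half / nchw_to_image3d_C_packed_float below,
# and likewise image3d_to_nchw_C_packed_* for the image_to_nchw config above. The
# exact naming scheme is an assumption about the shader generator, not stated here.)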
+ +nchw_to_image: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + PACKING: CHANNELS_PACKED + generate_variant_forall: + DTYPE: + - VALUE: "half" + SUFFIX: "half" + - VALUE: "float" + SUFFIX: "float" + shader_variants: + - NAME: nchw_to_image3d_C_packed + - NAME: nchw_to_image2d_C_packed + NDIM: 2 diff --git a/backends/xnnpack/partition/graphs/bilinear_2d.py b/backends/xnnpack/partition/graphs/bilinear_2d.py index b956c11df9..a971cb9244 100644 --- a/backends/xnnpack/partition/graphs/bilinear_2d.py +++ b/backends/xnnpack/partition/graphs/bilinear_2d.py @@ -10,6 +10,8 @@ import executorch.exir as exir import torch +from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config + @lru_cache(maxsize=None) def _get_bilinear_2d_graphs(): @@ -37,7 +39,9 @@ def forward(self, x): for config in capture_configs: edge = exir.capture( bilinear2d(align_corners), sample_inputs, config - ).to_edge() + ).to_edge( + config=get_xnnpack_edge_compile_config(), + ) _bilinear2d_graphs[edge.exported_program.graph_module] = align_corners return _bilinear2d_graphs diff --git a/backends/xnnpack/partition/graphs/sdpa.py b/backends/xnnpack/partition/graphs/sdpa.py index 94abfda33b..4f4afa92e2 100644 --- a/backends/xnnpack/partition/graphs/sdpa.py +++ b/backends/xnnpack/partition/graphs/sdpa.py @@ -8,6 +8,7 @@ from typing import List, Optional import torch +from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config from executorch.exir import to_edge from torch import Tensor from torch.export import export @@ -75,7 +76,8 @@ def forward( v, mask, ), - ) + ), + compile_config=get_xnnpack_edge_compile_config(), ) gm = edge.exported_program().graph_module graphs.append(gm) diff --git a/build/install_flatc.sh b/build/install_flatc.sh index 9dfed2b7c8..75b4e41883 100755 --- a/build/install_flatc.sh +++ b/build/install_flatc.sh @@ -26,9 +26,23 @@ readonly NC="\033[0m" # No Color # Prints the flatbuffers version of the git submodule. print_flatbuffers_version(){ - pushd "${FLATBUFFERS_PATH}" > /dev/null - git describe --tags "$(git rev-list --tags --max-count=1)" | sed 's/^v//' - popd > /dev/null + local version_file="${FLATBUFFERS_PATH}/package.json" + local version + # Extract the version from the first line like `"version": "23.5.26",` + # First remove the final double quote, then remove everything + # before the now-final double quote. 
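    # (Editorial sketch, not part of the patch: the pipeline below can be
    # sanity-checked standalone; the sample line simply echoes the format
    # assumed in the comment above.)
    #     printf '%s\n' '  "version": "23.5.26",' | sed -e 's/"[^"]*$//' -e 's/.*"//'
    #     # prints: 23.5.26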
+ version="$( + grep '"version"\s*:' "${version_file}" \ + | head -1 \ + | sed -e 's/"[^"]*$//' \ + | sed -e 's/.*"//' + )" + if [[ ${version} =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "${version}" + else + echo "ERROR: Bad version '${version}'; could not find version in ${version_file}" >&2 + exit 1 + fi } main() { diff --git a/examples/models/llama2/custom_ops/op_sdpa.cpp b/examples/models/llama2/custom_ops/op_sdpa.cpp index 16732b7153..6638852f7d 100644 --- a/examples/models/llama2/custom_ops/op_sdpa.cpp +++ b/examples/models/llama2/custom_ops/op_sdpa.cpp @@ -177,10 +177,6 @@ inline void fill_stub(scalar_t* data, scalar_t val, int64_t size) { for (; d < size - (size % Vec::size()); d += Vec::size()) { data_vec.store(data + d); } -#if !defined(_MSC_VER) && !defined(COMPILING_FOR_MIN_SIZE) && \ - !defined(__ANDROID__) -#pragma unroll -#endif for (; d < size; d++) { data[d] = val; } diff --git a/examples/models/llama2/install_requirements.sh b/examples/models/llama2/install_requirements.sh index c7a7f41ffb..3a86b9aacd 100644 --- a/examples/models/llama2/install_requirements.sh +++ b/examples/models/llama2/install_requirements.sh @@ -10,13 +10,8 @@ pip install snakeviz sentencepiece pip install torchao-nightly -# Install datasets for HuggingFace dataloader -# v2.14.0 is intentional to force lm-eval v0.3.0 compatibility -pip install datasets==2.14.0 - # Install lm-eval for Model Evaluation with lm-evalution-harness -# v0.3.0 is intentional -pip install lm-eval==0.3. +pip install lm-eval # Call the install helper for further setup python examples/models/llama2/install_requirement_helper.py diff --git a/examples/models/llama2/ops/quantized_ops.py b/examples/models/llama2/ops/quantized_ops.py index 2ab8df3080..5d13856442 100644 --- a/examples/models/llama2/ops/quantized_ops.py +++ b/examples/models/llama2/ops/quantized_ops.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Optional + import torch from torch.library import impl, impl_abstract @@ -62,43 +64,45 @@ def embedding_byte_weight_checks(weight, weight_scales, weight_zero_points): assert weight_zero_points is None or weight_zero_points.size(0) == weight.size( 0 ), f"Expecting weight_zero_points tensor to be None or have same number of rows as weights, but found {weight.size()} and {weight_zero_points.size()}" - if not weight_zero_points: - weight_zero_points = torch.zeros(weight.size(0)) @impl(quantized_lib, "embedding_byte", "CompositeExplicitAutograd") -def embedding_byte_meta( - weight, - weight_scales, - weight_zero_points, - weight_quant_min, - weight_quant_max, - indices, -): +def embedding_byte( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: Optional[torch.Tensor], + weight_quant_min: int, + weight_quant_max: int, + indices: torch.Tensor, +) -> torch.Tensor: embedding_byte_weight_checks(weight, weight_scales, weight_zero_points) - weight = torch.ops.quantized_decomposed.dequantize_per_channel.default( + group_size = weight.size(1) // ( + weight_scales.size(1) if weight_scales.dim() == 2 else 1 + ) + weight = torch.ops.quantized_decomposed.dequantize_per_channel_group.default( weight, weight_scales, weight_zero_points, - 0, weight_quant_min, weight_quant_max, weight.dtype, + group_size, + weight_scales.dtype, ) return torch.ops.aten.embedding.default(weight, indices) @impl_abstract("llama_quantized::embedding_byte.out") def embedding_byte_out_meta( - weight, - weight_scales, - weight_zero_points, - weight_quant_min, - weight_quant_max, - indices, - out, -): - return embedding_byte_meta( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: Optional[torch.Tensor], + weight_quant_min: int, + weight_quant_max: int, + indices: torch.Tensor, + out: torch.Tensor, +) -> torch.Tensor: + return embedding_byte( weight, weight_scales, weight_zero_points, @@ -109,42 +113,46 @@ def embedding_byte_out_meta( @impl(quantized_lib, "embedding_byte.dtype", "CompositeExplicitAutograd") -def embedding_byte_dtype_meta( - weight, - weight_scales, - weight_zero_points, - weight_quant_min, - weight_quant_max, - indices, +def embedding_byte_dtype( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: Optional[torch.Tensor], + weight_quant_min: int, + weight_quant_max: int, + indices: torch.Tensor, *, - dtype, -): + dtype: Optional[torch.dtype] = None, +) -> torch.Tensor: embedding_byte_weight_checks(weight, weight_scales, weight_zero_points) - weight = torch.ops.quantized_decomposed.dequantize_per_channel.default( + group_size = weight.size(1) // ( + weight_scales.size(1) if weight_scales.dim() == 2 else 1 + ) + weight = torch.ops.quantized_decomposed.dequantize_per_channel_group.default( weight, weight_scales, weight_zero_points, - 0, weight_quant_min, weight_quant_max, weight.dtype, + group_size, + dtype, ) - return torch.ops.aten.embedding.default(weight, indices).to(dtype) + return torch.ops.aten.embedding.default(weight, indices) @impl_abstract("llama_quantized::embedding_byte.dtype_out") def embedding_byte_dtype_out_meta( - weight, - weight_scales, - weight_zero_points, - weight_quant_min, - weight_quant_max, - indices, + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: Optional[torch.Tensor], + weight_quant_min: int, + weight_quant_max: int, + indices: torch.Tensor, *, - dtype, - out, -): - return embedding_byte_dtype_meta( + dtype: Optional[torch.dtype] = None, + out: torch.Tensor, +) -> 
torch.Tensor: + return embedding_byte_dtype( weight, weight_scales, weight_zero_points, diff --git a/examples/models/llama2/quantize.py b/examples/models/llama2/quantize.py index 8c02700e4c..71d862d764 100644 --- a/examples/models/llama2/quantize.py +++ b/examples/models/llama2/quantize.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import math from functools import reduce from math import gcd from typing import Dict, Optional, Tuple @@ -14,11 +13,12 @@ import torch.nn.functional as F from .ops.quantized_ops import * # noqa -from torch.ao.quantization.fx._decomposed import ( - _quant_min_max_bounds_check, - quantized_decomposed_lib, +from torchao.quantization.quant_primitives import ( + get_group_qparams_symmetric, + group_quantize_tensor_symmetric, + pack_scales_and_zeros, + per_token_dynamic_quant, ) -from torch.library import impl try: @@ -131,507 +131,6 @@ def dynamically_quantize_per_channel( return quant, scales, zero_points -# TODO: move this to https://github.com/pytorch/pytorch/blob/main/torch/ao/quantization/fx/_decomposed.py -quantized_decomposed_lib.define( - "choose_qparams_per_token(Tensor input, ScalarType dtype) -> (Tensor, Tensor)" -) - - -@impl( - quantized_decomposed_lib, - "choose_qparams_per_token", - "CompositeExplicitAutograd", -) -def choose_qparams_per_token( - input: torch.Tensor, - dtype: torch.dtype, -) -> Tuple[torch.Tensor, torch.Tensor]: - """Choose quantization parameters for per token quantization. This means for a N dimension Tensor - (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize - every N elements with the same quantization parameter. The dimension for scales/zero_points - will be (M1 * M2 ... * Mn) - - Args: - input (torch.Tensor): original float32/float16 Tensor - dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor - - Returns: - scales and zero_points, both float32 Tensors - """ - - scales = input.abs().amax(dim=-1, keepdim=True) - if scales.dtype == torch.float16: - scales = ( - scales.float() - ) # want float scales to avoid overflows for fp16, (bf16 has wide enough range) - if dtype == torch.int8: - n_bits = 8 - quant_max = 2 ** (n_bits - 1) - 1 - else: - raise Exception(f"unsupported dtype in choose_qparams_per_token: {dtype}") - - scales = scales.clamp(min=1e-5).div(quant_max) - zero_points = torch.zeros_like(scales) - return scales, zero_points - - -@impl( - quantized_decomposed_lib, - "choose_qparams_per_token", - "Meta", -) -def choose_qparams_per_token_meta( - input: torch.Tensor, - dtype: torch.dtype, -) -> Tuple[torch.Tensor, torch.Tensor]: - size = (1, input.size(-1)) - return torch.empty(size, dtype=torch.double, device=input.device), torch.empty( - size, dtype=torch.int64, device=input.device - ) - - -# TODO: move this to https://github.com/pytorch/pytorch/blob/main/torch/ao/quantization/fx/_decomposed.py -quantized_decomposed_lib.define( - "choose_qparams_per_token_asymmetric(Tensor input, ScalarType dtype) -> (Tensor, Tensor)" -) - - -@impl( - quantized_decomposed_lib, - "choose_qparams_per_token_asymmetric", - "CompositeExplicitAutograd", -) -def choose_qparams_per_token_asymmetric( - input: torch.Tensor, - dtype: torch.dtype, -) -> Tuple[torch.Tensor, torch.Tensor]: - """Choose quantization parameters for per token quantization. 
This means for a N dimension Tensor - (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize - every N elements with the same quantization parameter. The dimension for scales/zero_points - will be (M1 * M2 ... * Mn) - - Args: - input (torch.Tensor): original float32/float16 Tensor - dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor - - Returns: - scales and zero_points, both float32 Tensors - """ - # Based on https://github.com/google/XNNPACK/blob/df156f0cf3db5a4576cc711123eeb54915f82ffc/src/xnnpack/quantization.h#L18 - qmin, qmax = -128, 127 - min_val, max_val = torch.aminmax(input, dim=-1, keepdim=True) - min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) - max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) - eps = torch.finfo(torch.float32).eps # use xnnpack eps? - - # scale - scale = (max_val_pos - min_val_neg) / float(qmax - qmin) - scale = scale.clamp(min=eps) - - # zero point - descaled_min = min_val_neg / scale - descaled_max = max_val_pos / scale - zero_point_from_min_error = qmin + descaled_min - zero_point_from_max_error = qmax + descaled_max - zero_point = torch.where( - zero_point_from_min_error + zero_point_from_max_error > 0, - qmin - descaled_min, - qmax - descaled_max, - ) - zero_point = torch.clamp(zero_point, qmin, qmax).round() - - return scale.to(torch.float32), zero_point.to(torch.float32) - - -@impl( - quantized_decomposed_lib, - "choose_qparams_per_token_asymmetric", - "Meta", -) -def choose_qparams_per_token_asymmetric_meta( - input: torch.Tensor, - dtype: torch.dtype, -) -> Tuple[torch.Tensor, torch.Tensor]: - size = (1, input.size(-1)) - return torch.empty(size, dtype=torch.double, device=input.device), torch.empty( - size, dtype=torch.int64, device=input.device - ) - - -def _per_token_quant_qparam_dim_check(input, scales, zero_points): - num_tokens = math.prod(list(input.size())[:-1]) - assert ( - num_tokens == scales.numel() - ), f"num_tokens: {num_tokens} scales: {scales.size()}" - assert ( - num_tokens == zero_points.numel() - ), f"num_tokens: {num_tokens} zero_points: {zero_points.size()}" - - -quantized_decomposed_lib.define( - "quantize_per_token(Tensor input, Tensor scales, Tensor zero_points, " - "int quant_min, int quant_max, ScalarType dtype) -> Tensor" -) - - -@impl(quantized_decomposed_lib, "quantize_per_token", "CompositeExplicitAutograd") -def quantize_per_token( - input: torch.Tensor, - scales: torch.Tensor, - zero_points: torch.Tensor, - quant_min: int, - quant_max: int, - dtype: torch.dtype, -): - """Per token quantization for the Tensor using the quantization parameters to map - from floating point to quantized values. This means for a N dimension Tensor - (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize - every N elements with the same quantization parameter. The dimension for scales/zero_points - will be (M1 * M2 ... * Mn) - - Args: - input (torch.Tensor): original float32 or bfloat16 Tensor - scales (float32 torch.Tensor): quantization parameter for per token affine quantization - zero_points (int32 torch.Tensor): quantization parameter for per token affine quantization - quant_min (int): minimum quantized value for output Tensor - quant_max (int): maximum quantized value for output Tensor - dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor - - Returns: - Tensor with requested dtype (e.g. 
torch.uint8), note the quantization parameters - are not stored in the Tensor, we are storing them in function arguments instead - """ - _quant_min_max_bounds_check(quant_min, quant_max, dtype) - _per_token_quant_qparam_dim_check(input, scales, zero_points) - input = ( - torch.round(input / scales + zero_points).clamp(quant_min, quant_max).to(dtype) - ) - return input - - -@impl(quantized_decomposed_lib, "quantize_per_token", "Meta") -def quantize_per_token_meta( - input: torch.Tensor, - scales: torch.Tensor, - zero_points: torch.Tensor, - quant_min: int, - quant_max: int, - dtype: torch.dtype, -): - _quant_min_max_bounds_check(quant_min, quant_max, dtype) - return torch.empty_like(input, dtype=dtype) - - -quantized_decomposed_lib.define( - "dequantize_per_token(Tensor input, Tensor scales, Tensor zero_points, " - "int quant_min, int quant_max, ScalarType dtype, ScalarType output_dtype) -> Tensor" -) - - -@impl(quantized_decomposed_lib, "dequantize_per_token", "CompositeExplicitAutograd") -def dequantize_per_token( - input: torch.Tensor, - scales: torch.Tensor, - zero_points: torch.Tensor, - quant_min: int, - quant_max: int, - dtype: torch.dtype, - output_dtype: torch.dtype = torch.float32, -): - """Per token dequantization for the Tensor using the quantization parameters to map - from floating point to quantized values. This means for a N dimension Tensor - (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize - every N elements with the same quantization parameter. The dimension for scales/zero_points - will be (M1 * M2 ... * Mn) - - Args: - input (torch.Tensor): quantized Tensor (uint8, int8 etc.) - scales (float32 torch.Tensor): quantization parameter for per token affine quantization - zero_points (int32 torch.Tensor): quantization parameter for per token affine quantization - quant_min (int): minimum quantized value for input Tensor - quant_max (int): maximum quantized value for input Tensor - dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor - output_dtype (torch.dtype): dtype (e.g. torch.float32) for output Tensor - - Returns: - dequantized Tensor with dtype `output_dtype` - """ - input = input - zero_points - input = input.to(output_dtype) * scales - return input - - -@impl(quantized_decomposed_lib, "dequantize_per_token", "Meta") -def dequantize_per_token_meta( - input: torch.Tensor, - scales: torch.Tensor, - zero_points: torch.Tensor, - quant_min: int, - quant_max: int, - dtype: torch.dtype, - output_dtype: torch.dtype = torch.float32, -): - _quant_min_max_bounds_check(quant_min, quant_max, dtype) - # TODO: support fp16 - return torch.empty_like(input, dtype=output_dtype) - - -def get_group_qparams_symmetric(w, n_bit=4, groupsize=128, precision=torch.float32): - # needed for GPTQ with padding - if groupsize > w.shape[-1]: - groupsize = w.shape[-1] - assert groupsize > 1 - assert w.shape[-1] % groupsize == 0 - assert w.dim() == 2 - - to_quant = w.reshape(-1, groupsize) - assert torch.isnan(to_quant).sum() == 0 - - max_val = to_quant.amax(dim=1, keepdim=True) - min_val = to_quant.amin(dim=1, keepdim=True) - min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) - max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) - - max_val_abs = torch.max(-min_val_neg, max_val_pos) - max_int = 2 ** (n_bit - 1) - 1 - min_int = -(2 ** (n_bit - 1)) - - scales = max_val_abs / (float(max_int - min_int) / 2) - scales = torch.max(scales, torch.full_like(scales, torch.finfo(torch.float32).eps)) - # TODO: make sure abs(scales) is not too small? 
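    # (Editorial worked example, not part of the patch: with the default n_bit=4 the
    # integer range is [min_int, max_int] = [-8, 7], so scales = max_val_abs / 7.5;
    # a group whose largest-magnitude weight is 1.5 gets scale 0.2, and the symmetric
    # scheme below keeps every zero point at 0.)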
- zeros = torch.full_like(scales, 0) - return scales.to(precision).reshape(w.shape[0], -1), zeros.to(precision).reshape( - w.shape[0], -1 - ) - - -def pack_scales_and_zeros(scales, zeros, precision=torch.float16): - assert scales.shape == zeros.shape - assert scales.dtype == precision - assert zeros.dtype == precision - return ( - torch.cat( - [ - scales.reshape(scales.size(0), scales.size(1), 1), - zeros.reshape(zeros.size(0), zeros.size(1), 1), - ], - 2, - ) - .transpose(0, 1) - .contiguous() - ) - - -quantized_decomposed_lib.define( - "quantize_per_channel_group(Tensor input, Tensor scales, Tensor zero_points, int quant_min, " - "int quant_max, ScalarType dtype, int group_size) -> Tensor" -) - - -# TODO: dtype is ignored for now -@impl( - quantized_decomposed_lib, "quantize_per_channel_group", "CompositeExplicitAutograd" -) -def quantize_per_channel_group( - input: torch.Tensor, - scales: torch.Tensor, - zero_points: torch.Tensor, - quant_min: int, - quant_max: int, - dtype: torch.dtype, - group_size=128, -): - assert group_size > 1 - # needed for GPTQ single column quantize - if group_size > input.shape[-1] and scales.shape[-1] == 1: - group_size = input.shape[-1] - - assert input.shape[-1] % group_size == 0 - assert input.dim() == 2 - - # TODO: check for dtype, currently we can't express torch.int4 so it's omitted - to_quant = input.reshape(-1, group_size) - assert torch.isnan(to_quant).sum() == 0 - - scales = scales.reshape(-1, 1) - zero_points = zero_points.reshape(-1, 1) - - input_int8 = ( - to_quant.div(scales) - .add(zero_points) - .round() - .clamp_(quant_min, quant_max) - .to(dtype) - .reshape_as(input) - ) - - return input_int8 - - -@impl(quantized_decomposed_lib, "quantize_per_channel_group", "Meta") -def quantize_per_channel_group_meta( - input: torch.Tensor, - scales: torch.Tensor, - zero_points: torch.Tensor, - quant_min: int, - quant_max: int, - dtype: torch.dtype, - group_size=128, -): - """Groupwise quantization within each channel for an 2-d Tensor using the quantization parameters - to map from floating point to quantized values. This means for each row of a 2-d Tensor - (M, N), we calculate scales/zero_points for each `group_size` elements - and quantize every `group_size` elements with the same quantization parameter. - The dimension for scales/zero_points will be (M * ceil(N, group_size),) - - Args: - input (torch.Tensor): original float32 or bfloat16 Tensor - scales (float32 torch.Tensor): quantization parameter for per channel group affine quantization - zero_points (int32 torch.Tensor): quantization parameter for per channel group affine quantization - quant_min (int): minimum quantized value for output Tensor - quant_max (int): maximum quantized value for output Tensor - dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor - - Returns: - Tensor with requested dtype (e.g. 
torch.uint8), note the quantization parameters - are not stored in the Tensor, we are storing them in function arguments instead - """ - assert group_size > 1 - # needed for GPTQ single column quantize - if group_size > input.shape[-1] and scales.shape[-1] == 1: - group_size = input.shape[-1] - - assert input.shape[-1] % group_size == 0 - assert input.dim() == 2 - return torch.empty_like(input, dtype=dtype) - - -def group_quantize_tensor_symmetric( - w, n_bit=4, group_size=128, precision=torch.float32 -): - scales, zeros = get_group_qparams_symmetric(w, n_bit, group_size, precision) - n_bit = 4 - max_int = 2 ** (n_bit - 1) - 1 - min_int = -(2 ** (n_bit - 1)) - # TODO: currently we don't know how to express torch.int4, we'll - # add torch.int4 to core later - w_int8 = torch.ops.quantized_decomposed.quantize_per_channel_group( - w, scales, zeros, min_int, max_int, torch.int8, group_size - ) - - return w_int8, scales, zeros - - -quantized_decomposed_lib.define( - "dequantize_per_channel_group(Tensor input, Tensor scales, Tensor zero_points, int quant_min, " - "int quant_max, ScalarType dtype, int group_size, ScalarType output_dtype) -> Tensor" -) - - -@impl( - quantized_decomposed_lib, - "dequantize_per_channel_group", - "CompositeExplicitAutograd", -) -def dequantize_per_channel_group( - w_int8: torch.Tensor, - scales: torch.Tensor, - zero_points: torch.Tensor, - quant_min: int, - quant_max: int, - dtype: torch.dtype, - group_size: int = 128, - output_dtype: torch.dtype = torch.float32, -): - """Groupwise dequantization within each channel for an 2-d Tensor using the quantization parameters - to map from floating point to quantized values. This means for each row of a 2-d Tensor - (M, N), we calculate scales/zero_points for each `group_size` elements - and quantize every `group_size` elements with the same quantization parameter. - The dimension for scales/zero_points will be (M * ceil(N, group_size),) - - Args: - input (torch.Tensor): quantized Tensor (uint8/int8 etc.) - scales (float32 torch.Tensor): quantization parameter for per channel group affine quantization - zero_points (int32 torch.Tensor): quantization parameter for per channel group affine quantization - quant_min (int): minimum quantized value for input Tensor - quant_max (int): maximum quantized value for input Tensor - dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor - output_dtype (torch.dtype): dtype (e.g. 
torch.float32) for output Tensor - - Returns: - dequantized Tensor with dtype `output_dtype` - """ - - assert group_size > 1 - # needed for GPTQ single column dequantize - if group_size > w_int8.shape[-1] and scales.shape[-1] == 1: - group_size = w_int8.shape[-1] - assert w_int8.shape[-1] % group_size == 0 - assert w_int8.dim() == 2 - - w_int8_grouped = w_int8.reshape(-1, group_size) - scales = scales.reshape(-1, 1) - zero_points = zero_points.reshape(-1, 1) - w_dq = ( - w_int8_grouped.sub(zero_points).mul(scales).reshape_as(w_int8).to(output_dtype) - ) - return w_dq - - -def down_size(size): - assert size[-1] % 2 == 0, f"{size} last dim not divisible by two" - return (*size[:-1], size[-1] // 2) - - -def up_size(size): - return (*size[:-1], size[-1] * 2) - - -quantized_decomposed_lib.define("pack_int4_from_int8(Tensor int8_data) -> Tensor") - - -@impl(quantized_decomposed_lib, "pack_int4_from_int8", "CompositeExplicitAutograd") -def pack_int4_from_int8(int8_data: torch.Tensor) -> torch.Tensor: - # converting to uint8 for operations - shape = int8_data.shape - assert shape[-1] % 2 == 0 - int8_data = int8_data.contiguous().view(-1) - return (int8_data[::2] << 4 | int8_data[1::2]).view(down_size(shape)) - - -quantized_decomposed_lib.define("unpack_int4_to_int8(Tensor int8_data) -> Tensor") - - -@impl(quantized_decomposed_lib, "unpack_int4_to_int8", "CompositeExplicitAutograd") -def unpack_int4_to_int8(int8_data: torch.Tensor) -> torch.Tensor: - """Get the original weight from the normalized float weight format""" - # since we are using int8 we will decode 2 entries per byte - # Shift elements down 4 and select out the bottom 4 bits - shape = int8_data.shape - first_elements = (int8_data >> 4).to(torch.int8) - second_elements = (int8_data & 0b1111).to(torch.int8) - return torch.stack([first_elements, second_elements], dim=-1).view(up_size(shape)) - - -def per_token_dynamic_quant(input: torch.Tensor) -> torch.Tensor: - orig_dtype = input.dtype - ( - scales, - zero_points, - ) = torch.ops.quantized_decomposed.choose_qparams_per_token(input, torch.int8) - - # TODO: get these from torch.int8 - quant_min = -128 - quant_max = 127 - input = torch.ops.quantized_decomposed.quantize_per_token( - input, scales, zero_points, quant_min, quant_max, torch.int8 - ) - input = torch.ops.quantized_decomposed.dequantize_per_token( - input, scales, zero_points, quant_min, quant_max, torch.int8, orig_dtype - ) - return input - - class QuantHandler: def __init__(self, mod): self.mod = mod diff --git a/exir/passes/TARGETS b/exir/passes/TARGETS index 04c519a8ab..56dee3eb91 100644 --- a/exir/passes/TARGETS +++ b/exir/passes/TARGETS @@ -127,6 +127,7 @@ python_library( "//caffe2:torch", "//executorch/exir:pass_base", "//executorch/exir/dialects:lib", + "//pytorch/ao:torchao", ], ) diff --git a/exir/passes/_quant_patterns_and_replacements.py b/exir/passes/_quant_patterns_and_replacements.py index 0cb9707774..334e739893 100644 --- a/exir/passes/_quant_patterns_and_replacements.py +++ b/exir/passes/_quant_patterns_and_replacements.py @@ -9,13 +9,12 @@ import torch from executorch.exir.dialects._ops import bind_pattern_to_op, ops as exir_ops - from executorch.exir.passes.replace_aten_with_edge_pass import ( aten_to_edge, should_lower_to_edge, ) from torch import fx -from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib +from torchao.quantization.quant_primitives import quantized_decomposed_lib __all__ = [ @@ -487,6 +486,50 @@ def replacement( ) return out + @bind_pattern_to_op(quantized_decomposed_lib, 
"embedding_byte") + def pattern_groupwise( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + group_size, + ): + weight = ( + torch.ops.quantized_decomposed.dequantize_per_channel_group.default( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + weight.dtype, + group_size, + weight_scales.dtype, + ) + ) + out = torch.ops.aten.embedding.default(weight, indices) + return out + + def replacement_groupwise( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + group_size, + ): + out = torch.ops.quantized_decomposed.embedding_byte.default( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + ) + return out + @bind_pattern_to_op(quantized_decomposed_lib, "embedding_byte") def pattern_with_padding_idx( weight, @@ -528,35 +571,86 @@ def replacement_with_padding_idx( ) return out - @bind_pattern_to_op(quantized_decomposed_lib, "embedding_byte.dtype") - def pattern_with_dtype( + @bind_pattern_to_op(quantized_decomposed_lib, "embedding_byte") + def pattern_with_padding_idx_groupwise( weight, weight_scales, weight_zero_points, weight_quant_min, weight_quant_max, - indicies, - dtype, + indices, + group_size, + padding_idx, ): - weight = torch.ops.quantized_decomposed.dequantize_per_channel.default( + weight = ( + torch.ops.quantized_decomposed.dequantize_per_channel_group.default( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + weight.dtype, + group_size, + weight_scales.dtype, + ) + ) + out = torch.ops.aten.embedding.default(weight, indices, padding_idx) + return out + + def replacement_with_padding_idx_groupwise( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + group_size, + _, # padding_idx only matters for training and not when running op for inference + ): + out = torch.ops.quantized_decomposed.embedding_byte.default( weight, weight_scales, weight_zero_points, - 0, weight_quant_min, weight_quant_max, - torch.uint8, + indices, ) - out = torch.ops.aten.embedding.default(weight, indicies).to(dtype) return out - def replacement_with_dtype( + @bind_pattern_to_op(quantized_decomposed_lib, "embedding_byte.dtype") + def pattern_with_dtype_groupwise( weight, weight_scales, weight_zero_points, weight_quant_min, weight_quant_max, - indicies, + indices, + group_size, + dtype, + ): + weight = ( + torch.ops.quantized_decomposed.dequantize_per_channel_group.default( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + weight.dtype, + group_size, + dtype, + ) + ) + out = torch.ops.aten.embedding.default(weight, indices) + return out + + def replacement_with_dtype_groupwise( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + group_size, dtype, ): out = torch.ops.quantized_decomposed.embedding_byte.dtype( @@ -565,7 +659,7 @@ def replacement_with_dtype( weight_zero_points, weight_quant_min, weight_quant_max, - indicies, + indices, dtype=dtype, ) return out @@ -576,14 +670,24 @@ def replacement_with_dtype( _trace_and_lower_to_edge_ops(replacement), [], ), + ( + _trace_and_lower_to_edge_ops(pattern_groupwise), + _trace_and_lower_to_edge_ops(replacement_groupwise), + [], + ), ( _trace_and_lower_to_edge_ops(pattern_with_padding_idx), _trace_and_lower_to_edge_ops(replacement_with_padding_idx), [], ), ( - _trace_and_lower_to_edge_ops(pattern_with_dtype), - 
_trace_and_lower_to_edge_ops(replacement_with_dtype), + _trace_and_lower_to_edge_ops(pattern_with_padding_idx_groupwise), + _trace_and_lower_to_edge_ops(replacement_with_padding_idx_groupwise), + [], + ), + ( + _trace_and_lower_to_edge_ops(pattern_with_dtype_groupwise), + _trace_and_lower_to_edge_ops(replacement_with_dtype_groupwise), [], ), ] diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 76069eca92..0d5b69c52e 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -29,7 +29,7 @@ if(CMAKE_TOOLCHAIN_ANDROID) add_library(executorch_llama_jni SHARED jni/jni_layer_llama.cpp) target_link_libraries(executorch_llama_jni fbjni llama_runner - portable_ops_lib) + xnn_executor_runner_lib) target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) endif() diff --git a/pyproject.toml b/pyproject.toml index dc61ca2e7b..b302adc801 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dependencies=[ "sympy", "tabulate", "tomli", + "torchao-nightly", "zstd", ]
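
Editorial appendix (not part of the patch): several hunks above move the embedding_byte
ops and their graph patterns from per-channel to per-channel-group dequantization,
inferring the group size from the shape of the scales tensor. The sketch below is a
minimal, self-contained Python illustration of that math; it mirrors the
dequantize_per_channel_group helper removed from examples/models/llama2/quantize.py,
and the function name groupwise_dequant_reference is hypothetical rather than an actual
torchao or quantized_decomposed API.

import torch

def groupwise_dequant_reference(
    w_int8: torch.Tensor,        # (rows, cols) quantized weights, e.g. int8
    scales: torch.Tensor,        # (rows, cols // group_size)
    zero_points: torch.Tensor,   # same shape as scales; all zeros for symmetric quant
    output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    # Infer group_size the same way the new embedding_byte wrapper does:
    # each column of scales covers one group of weight columns.
    group_size = w_int8.size(1) // (scales.size(1) if scales.dim() == 2 else 1)
    # View the weights as (num_groups, group_size) so every group shares one
    # scale / zero point, then apply (w - zero_point) * scale.
    w_grouped = w_int8.reshape(-1, group_size).to(output_dtype)
    s = scales.reshape(-1, 1).to(output_dtype)
    z = zero_points.reshape(-1, 1).to(output_dtype)
    return ((w_grouped - z) * s).reshape_as(w_int8)

# Example: 2 rows x 8 columns with group_size 4, so scales has shape (2, 2).
w = torch.randint(-8, 8, (2, 8), dtype=torch.int8)
scales = torch.full((2, 2), 0.2)
zeros = torch.zeros_like(scales)
w_dq = groupwise_dequant_reference(w, scales, zeros)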