Qualcomm AI Engine Direct - add program validation (#4297)
Summary:
- update the graph signature so get_fake_program works properly
- make sure the program is valid after capture_program
- retire capture_pre_autograd_graph in favor of torch.export.export (see the sketch below)
- fix a release build error and make the cross-compiled flatcc build deterministic
- assorted minor fixes
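
A minimal sketch of the export-API migration applied across the examples and tests in this change (model and example_inputs stand in for whatever module and sample inputs a caller already has):

    import torch

    # before: graph capture through the now-retired private API
    # m = torch._export.capture_pre_autograd_graph(model, example_inputs)

    # after: capture with the public torch.export entry point, then unwrap the GraphModule
    m = torch.export.export(model, example_inputs).module()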

Pull Request resolved: #4297

Reviewed By: kirklandsign

Differential Revision: D59984308

Pulled By: cccclai

fbshipit-source-id: 9b2a3eaabbc175db6209b12af95639e96fe95b7b
haowhsu-quic authored and facebook-github-bot committed Jul 22, 2024
1 parent f0364e8 commit 0e2b205
Showing 14 changed files with 39 additions and 35 deletions.
3 changes: 0 additions & 3 deletions backends/qualcomm/CMakeLists.txt
@@ -63,9 +63,6 @@ endif()
if(CMAKE_BUILD_TYPE STREQUAL "Release")
# strip symbols
add_link_options("-s")
# hide dynamic symbols
set(CMAKE_C_VISIBILITY_PRESET hidden)
set(CMAKE_CXX_VISIBILITY_PRESET hidden)

# --gc-sections is added by torch.
add_compile_options(
4 changes: 4 additions & 0 deletions backends/qualcomm/builders/op_prelu.py
@@ -43,6 +43,10 @@ def define_node(
coeff_node = node.args[1]
coeff_tensor = torch.zeros(input_node.meta["val"].shape)
coeff = get_parameter(coeff_node, self.edge_program)
# param nodes will be FakeTensor when doing partition
# fill in random numeric for validation
if isinstance(coeff, torch._subclasses.fake_tensor.FakeTensor):
coeff = torch.ones(coeff.shape)
# per-channel activation
if coeff_node.meta["val"].shape[0] > 1:
for i in range(input_node.meta["val"].shape[1]):
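
During partitioning the coefficient parameter can arrive as a FakeTensor, which carries shape and dtype but no storage, so the hunk above substitutes a concrete tensor of ones before validation. A hedged sketch of that pattern as a standalone helper (the helper name is hypothetical):

    import torch
    from torch._subclasses.fake_tensor import FakeTensor

    def materialize_for_validation(param: torch.Tensor) -> torch.Tensor:
        # FakeTensors carry shape/dtype but no storage, so swap in a concrete
        # placeholder of the same shape before any value-based checks.
        if isinstance(param, FakeTensor):
            return torch.ones(param.shape)
        return param
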
16 changes: 11 additions & 5 deletions backends/qualcomm/builders/utils.py
@@ -7,7 +7,14 @@
from typing import Dict, Optional

import torch
from torch._export.utils import get_buffer, get_param, is_buffer, is_param
from torch._export.utils import (
get_buffer,
get_lifted_tensor_constant,
get_param,
is_buffer,
is_lifted_tensor_constant,
is_param,
)


def is_parameter(
@@ -16,7 +23,7 @@ def is_parameter(
return (
is_param(edge_program, node)
or is_buffer(edge_program, node)
or node.name in edge_program.graph_signature.inputs_to_lifted_tensor_constants
or is_lifted_tensor_constant(edge_program, node)
)


@@ -28,9 +35,8 @@ def get_parameter(
param = get_param(edge_program, node)
if is_buffer(edge_program, node):
param = get_buffer(edge_program, node)
if node.name in edge_program.graph_signature.inputs_to_lifted_tensor_constants:
name = edge_program.graph_signature.inputs_to_lifted_tensor_constants[node.name]
param = edge_program.constants[name]
if is_lifted_tensor_constant(edge_program, node):
param = get_lifted_tensor_constant(edge_program, node)
if param is not None:
# update node.meta["val"] to qualified QNN datatype (e.g. i64 to i32)
assert isinstance(param, torch.Tensor), "Expect parameter to be tensor"
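
With the new torch._export.utils helpers, lifted tensor constants are handled the same way as params and buffers. A minimal sketch of how a builder might use these wrappers when resolving a placeholder node's tensor (the import path and argument order are assumptions based on this file's location and usage):

    from executorch.backends.qualcomm.builders.utils import get_parameter, is_parameter

    # node: a placeholder torch.fx.Node; edge_program: the ExportedProgram being lowered
    if is_parameter(node, edge_program):
        tensor = get_parameter(node, edge_program)  # param, buffer, or lifted constant
    else:
        tensor = None  # runtime activation input; nothing to fetch at build time
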
2 changes: 1 addition & 1 deletion backends/qualcomm/passes/recompose_pixel_unshuffle.py
@@ -24,7 +24,7 @@ def __init__(self, quantization_capture=False):

self.quantization_capture = quantization_capture
if quantization_capture:
self.reshape_target = torch.ops.aten.reshape.default
self.reshape_target = torch.ops.aten._unsafe_view.default
self.permute_target = torch.ops.aten.permute.default
self.view_target = torch.ops.aten.view.default
self.op = torch.ops.aten.pixel_unshuffle.default
7 changes: 5 additions & 2 deletions backends/qualcomm/scripts/build.sh
@@ -29,6 +29,7 @@ CMAKE_X86_64="build_x86_64"
BUILD_AARCH64="true"
CMAKE_AARCH64="build_android"
CLEAN="true"
BUILD_TYPE="Debug"

if [ -z PYTHON_EXECUTABLE ]; then
PYTHON_EXECUTABLE="python3"
@@ -38,7 +39,7 @@ if [ -z BUCK2 ]; then
BUCK2="buck2"
fi

long_options=skip_x86_64,skip_aarch64,no_clean
long_options=skip_x86_64,skip_aarch64,no_clean,release

parsed_args=$(getopt -a --options '' --longoptions $long_options --name "$0" -- "$@")
eval set -- "$parsed_args"
@@ -49,6 +50,7 @@ while true ; do
--skip_x86_64) BUILD_X86_64="false"; shift;;
--skip_aarch64) BUILD_AARCH64="false"; shift;;
--no_clean) CLEAN="false"; shift;;
--release) BUILD_TYPE="Release"; shift;;
--) shift; break;;
esac
done
@@ -66,9 +68,9 @@ if [ "$BUILD_AARCH64" = true ]; then
fi

cd $BUILD_ROOT
# If we build debug type, we need to change flatcc to flatcc_d
cmake .. \
-DCMAKE_INSTALL_PREFIX=$BUILD_ROOT \
-DCMAKE_BUILD_TYPE=$BUILD_TYPE \
-DEXECUTORCH_BUILD_QNN=ON \
-DEXECUTORCH_BUILD_SDK=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
@@ -87,6 +89,7 @@

cmake $PRJ_ROOT/$EXAMPLE_ROOT \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
-DCMAKE_BUILD_TYPE=$BUILD_TYPE \
-DANDROID_ABI='arm64-v8a' \
-DANDROID_NATIVE_API_LEVEL=23 \
-DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
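
With the new flag, a release build of both the x86-64 and Android targets can presumably be requested as backends/qualcomm/scripts/build.sh --release; omitting the flag keeps the previous Debug behavior.
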
2 changes: 1 addition & 1 deletion backends/qualcomm/tests/utils.py
@@ -302,7 +302,7 @@ def get_qdq_module(
custom_quant_annotations: Tuple[Callable] = (),
quant_dtype: QuantDtype = QuantDtype.use_8a8w,
) -> torch.fx.GraphModule:
m = torch._export.capture_pre_autograd_graph(module, inputs)
m = torch.export.export(module, inputs).module()

quantizer = QnnQuantizer()
quantizer.add_custom_quant_annotations(custom_quant_annotations)
8 changes: 7 additions & 1 deletion backends/qualcomm/utils/utils.py
@@ -58,6 +58,7 @@
from executorch.exir import ExirExportedProgram
from executorch.exir.backend.compile_spec_schema import CompileSpec
from executorch.exir.lowered_backend_module import LoweredBackendModule
from executorch.exir.program._program import _get_updated_graph_signature
from torch._decomp import core_aten_decompositions as torch_core_aten_decompositions
from torch.export.exported_program import ExportedProgram
from torch.fx import passes
@@ -223,7 +224,12 @@ def capture_program(
core_ep.transform(ConvertBinaryOpsWithScalar())
edge_ep = core_ep.to_edge(qnn_edge_config())
_transform(edge_ep.exported_program)

# Since QDQ nodes are stripped, update graph signature again to validate program
edge_ep.exported_program._graph_signature = _get_updated_graph_signature(
edge_ep.exported_program.graph_signature,
edge_ep.exported_program.graph_module,
)
edge_ep.exported_program._validate()
return edge_ep


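
A minimal sketch of how the validated program might be consumed after this change (the module and sample inputs are placeholders, and the positional signature of capture_program is an assumption):

    from executorch.backends.qualcomm.utils.utils import capture_program

    # capture_program now refreshes the graph signature after QDQ nodes are
    # stripped and calls _validate(), so an inconsistent program fails here
    # rather than surfacing later during lowering.
    edge_ep = capture_program(model, example_inputs)
    exported_program = edge_ep.exported_program  # already re-signed and validated
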
4 changes: 3 additions & 1 deletion examples/qualcomm/CMakeLists.txt
@@ -106,7 +106,9 @@ target_link_libraries(
qnn_executor_runner qnn_executorch_backend full_portable_ops_lib etdump
${FLATCCRT_LIB} gflags
)
target_link_options(qnn_executor_runner PUBLIC -fsanitize=undefined)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
target_link_options(qnn_executor_runner PUBLIC -fsanitize=undefined)
endif()

# build llama runner
add_executable(qnn_llama_runner ${_qnn_llama_runner__srcs})
18 changes: 2 additions & 16 deletions examples/qualcomm/llama2/llama.py
@@ -8,8 +8,6 @@
import getpass
import json
import os
import shutil
import stat
import time
from multiprocessing.connection import Client

@@ -62,7 +60,6 @@ def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None:
"""
This function is specific for matmul op 16a8w.
"""
from typing import Sequence

from executorch.backends.qualcomm.quantizer.quantizer import (
get_16a8w_qnn_ptq_config,
@@ -294,9 +291,9 @@ def quantize(self, quant_dtype, custom_annotations=()):
fx_graph_module = None

with torch.no_grad():
fx_graph_module = torch._export.capture_pre_autograd_graph(
fx_graph_module = torch.export.export(
self.llama_model, self.inputs
)
).module()
fx_graph_module = prepare_pt2e(fx_graph_module, quantizer)
print("Quantizing the model...")
calibrate(
@@ -343,16 +340,6 @@ def lowering_modules(
constant_methods=self.llama_meta,
compile_config=EdgeCompileConfig(_check_ir_validity=False),
)

setattr(
edge_prog_mgr.exported_program(),
"_graph_signature",
_get_updated_graph_signature(
edge_prog_mgr.exported_program().graph_signature,
edge_prog_mgr.exported_program().graph_module,
),
)

edge_prog_mgr = edge_prog_mgr.to_backend(partitioner)
exec_prog_mgr = edge_prog_mgr.to_executorch(config=executorch_config)
with open(f"{work_space}/{pte_filename}.pte", "wb") as file:
@@ -520,7 +507,6 @@ def post_process():
"-P",
"--ptq",
help="If specified, will do PTQ quantization. default is 16bits activation and 4bits weight. Support 8a8w and 16a4w.",
required=True,
default="16a4w",
)

2 changes: 1 addition & 1 deletion examples/qualcomm/llama2/qaihub_runner/runner.cpp
@@ -9,9 +9,9 @@
// A simple llama2 runner that includes preprocessing and post processing logic.
// The module takes in a string as input and emits a string as output.

#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/examples/qualcomm/llama2/qaihub_runner/runner.h>
#include <executorch/extension/evalue_util/print_evalue.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/runner_util/managed_tensor.h>

#include <ctime>
2 changes: 1 addition & 1 deletion examples/qualcomm/llama2/qaihub_runner/runner.h
@@ -18,8 +18,8 @@
#include <unordered_map>

#include <executorch/examples/models/llama2/sampler/sampler.h>
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/examples/qualcomm/llama2/qaihub_runner/io_memory.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/runner_util/managed_tensor.h>

2 changes: 1 addition & 1 deletion examples/qualcomm/llama2/runner/runner.h
@@ -19,7 +19,7 @@

#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/examples/models/llama2/sampler/sampler.h>
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/runner_util/managed_tensor.h>

2 changes: 1 addition & 1 deletion examples/qualcomm/scripts/export_example.py
@@ -66,7 +66,7 @@
quantizer.set_bit8_op_quant_config(quant_config)

# Typical pytorch 2.0 quantization flow
m = torch._export.capture_pre_autograd_graph(model.eval(), example_inputs)
m = torch.export.export(model.eval(), example_inputs).module()
m = prepare_pt2e(m, quantizer)
# Calibration
m(*example_inputs)
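
For context, the "typical pytorch 2.0 quantization flow" referenced above now starts from torch.export rather than the private capture API; a hedged end-to-end sketch (quantizer configuration and calibration inputs are placeholders):

    import torch
    from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
    from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer

    quantizer = QnnQuantizer()
    m = torch.export.export(model.eval(), example_inputs).module()
    m = prepare_pt2e(m, quantizer)  # insert observers
    m(*example_inputs)              # calibration pass
    m = convert_pt2e(m)             # produce the quantized (QDQ) module
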
2 changes: 1 addition & 1 deletion examples/qualcomm/scripts/utils.py
@@ -204,7 +204,7 @@ def build_executorch_binary(
else:
raise AssertionError(f"No support for QuantDtype {quant_dtype}.")

captured_model = torch._export.capture_pre_autograd_graph(model, inputs)
captured_model = torch.export.export(model, inputs).module()
annotated_model = prepare_pt2e(captured_model, quantizer)
print("Quantizing the model...")
# calibration
