Qualcomm AI Engine Direct - add program validation #4297

Closed · wants to merge 3 commits
Changes from 2 commits
3 changes: 0 additions & 3 deletions backends/qualcomm/CMakeLists.txt
@@ -63,9 +63,6 @@ endif()
if(CMAKE_BUILD_TYPE STREQUAL "Release")
# strip symbols
add_link_options("-s")
# hide dynamic symbols
set(CMAKE_C_VISIBILITY_PRESET hidden)
set(CMAKE_CXX_VISIBILITY_PRESET hidden)

# --gc-sections is added by torch.
add_compile_options(
4 changes: 4 additions & 0 deletions backends/qualcomm/builders/op_prelu.py
@@ -43,6 +43,10 @@ def define_node(
coeff_node = node.args[1]
coeff_tensor = torch.zeros(input_node.meta["val"].shape)
coeff = get_parameter(coeff_node, self.edge_program)
# param nodes will be FakeTensor when doing partition
# fill in random numeric for validation
if isinstance(coeff, torch._subclasses.fake_tensor.FakeTensor):
coeff = torch.ones(coeff.shape)
Contributor:
Is coeff inserted into the graph, or is it just an intermediate tensor? If it's inserted, we may need to lift it to the I/O.

Collaborator Author:
It's inserted, but we make it static inside the graph so it can be identified when building the operator. This can reduce extra memory copies, I think.

# per-channel activation
if coeff_node.meta["val"].shape[0] > 1:
for i in range(input_node.meta["val"].shape[1]):
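To make the partition-time behavior discussed above concrete, here is a minimal hedged sketch (the helper name is illustrative, not part of this diff): during partitioning the parameter resolves to a FakeTensor, which carries shape and dtype but no storage, so a placeholder tensor is substituted purely so validation can proceed.

import torch
from torch._subclasses.fake_tensor import FakeTensor

def materialize_for_validation(param: torch.Tensor) -> torch.Tensor:
    # A FakeTensor has shape/dtype metadata but no real values, so the
    # builder cannot read the actual coefficients during partitioning.
    # Filling in ones keeps shapes and dtypes valid for program validation.
    if isinstance(param, FakeTensor):
        return torch.ones(param.shape, dtype=param.dtype)
    return param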
16 changes: 11 additions & 5 deletions backends/qualcomm/builders/utils.py
@@ -7,7 +7,14 @@
from typing import Dict, Optional

import torch
from torch._export.utils import get_buffer, get_param, is_buffer, is_param
from torch._export.utils import (
get_buffer,
get_lifted_tensor_constant,
get_param,
is_buffer,
is_lifted_tensor_constant,
is_param,
)


def is_parameter(
@@ -16,7 +23,7 @@ return (
return (
is_param(edge_program, node)
or is_buffer(edge_program, node)
or node.name in edge_program.graph_signature.inputs_to_lifted_tensor_constants
or is_lifted_tensor_constant(edge_program, node)
)


@@ -28,9 +35,8 @@ def get_parameter(
param = get_param(edge_program, node)
if is_buffer(edge_program, node):
param = get_buffer(edge_program, node)
if node.name in edge_program.graph_signature.inputs_to_lifted_tensor_constants:
name = edge_program.graph_signature.inputs_to_lifted_tensor_constants[node.name]
param = edge_program.constants[name]
if is_lifted_tensor_constant(edge_program, node):
param = get_lifted_tensor_constant(edge_program, node)
if param is not None:
# update node.meta["val"] to qualified QNN datatype (e.g. i64 to i32)
assert isinstance(param, torch.Tensor), "Expect parameter to be tensor"
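For reference, a hedged sketch of how the new torch._export.utils helpers replace the manual graph-signature lookup (the wrapper function is illustrative): they report whether a placeholder node is backed by a lifted tensor constant and, if so, return the backing tensor.

from torch._export.utils import (
    get_lifted_tensor_constant,
    is_lifted_tensor_constant,
)

def lookup_lifted_constant(edge_program, node):
    # Equivalent in spirit to indexing
    # edge_program.graph_signature.inputs_to_lifted_tensor_constants and then
    # edge_program.constants, but delegated to the upstream helpers.
    if is_lifted_tensor_constant(edge_program, node):
        return get_lifted_tensor_constant(edge_program, node)
    return None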
2 changes: 1 addition & 1 deletion backends/qualcomm/passes/recompose_pixel_unshuffle.py
@@ -24,7 +24,7 @@ def __init__(self, quantization_capture=False):

self.quantization_capture = quantization_capture
if quantization_capture:
self.reshape_target = torch.ops.aten.reshape.default
self.reshape_target = torch.ops.aten._unsafe_view.default
self.permute_target = torch.ops.aten.permute.default
self.view_target = torch.ops.aten.view.default
self.op = torch.ops.aten.pixel_unshuffle.default
7 changes: 5 additions & 2 deletions backends/qualcomm/scripts/build.sh
@@ -29,6 +29,7 @@ CMAKE_X86_64="build_x86_64"
BUILD_AARCH64="true"
CMAKE_AARCH64="build_android"
CLEAN="true"
BUILD_TYPE="Debug"

if [ -z PYTHON_EXECUTABLE ]; then
PYTHON_EXECUTABLE="python3"
@@ -38,7 +39,7 @@ if [ -z BUCK2 ]; then
BUCK2="buck2"
fi

long_options=skip_x86_64,skip_aarch64,no_clean
long_options=skip_x86_64,skip_aarch64,no_clean,release

parsed_args=$(getopt -a --options '' --longoptions $long_options --name "$0" -- "$@")
eval set -- "$parsed_args"
@@ -49,6 +50,7 @@ while true ; do
--skip_x86_64) BUILD_X86_64="false"; shift;;
--skip_aarch64) BUILD_AARCH64="false"; shift;;
--no_clean) CLEAN="false"; shift;;
--release) BUILD_TYPE="Release"; shift;;
--) shift; break;;
esac
done
@@ -66,9 +68,9 @@ if [ "$BUILD_AARCH64" = true ]; then
fi

cd $BUILD_ROOT
# If we build debug type, we need to change flatcc to flatcc_d
cmake .. \
-DCMAKE_INSTALL_PREFIX=$BUILD_ROOT \
-DCMAKE_BUILD_TYPE=$BUILD_TYPE \
-DEXECUTORCH_BUILD_QNN=ON \
-DEXECUTORCH_BUILD_SDK=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
@@ -87,6 +89,7 @@ if [ "$BUILD_AARCH64" = true ]; then

cmake $PRJ_ROOT/$EXAMPLE_ROOT \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
-DCMAKE_BUILD_TYPE=$BUILD_TYPE \
-DANDROID_ABI='arm64-v8a' \
-DANDROID_NATIVE_API_LEVEL=23 \
-DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
2 changes: 1 addition & 1 deletion backends/qualcomm/tests/utils.py
@@ -302,7 +302,7 @@ def get_qdq_module(
custom_quant_annotations: Tuple[Callable] = (),
quant_dtype: QuantDtype = QuantDtype.use_8a8w,
) -> torch.fx.GraphModule:
m = torch._export.capture_pre_autograd_graph(module, inputs)
m = torch.export.export(module, inputs).module()

quantizer = QnnQuantizer()
quantizer.add_custom_quant_annotations(custom_quant_annotations)
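Several files in this PR replace torch._export.capture_pre_autograd_graph with torch.export.export(...).module(). A hedged sketch of the resulting PT2E quantization flow, assuming a QnnQuantizer configured elsewhere and representative example inputs:

import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer

def quantize_module(module: torch.nn.Module, inputs: tuple) -> torch.fx.GraphModule:
    quantizer = QnnQuantizer()  # assumed configured with the desired quant config
    # Capture via torch.export instead of capture_pre_autograd_graph.
    m = torch.export.export(module.eval(), inputs).module()
    m = prepare_pt2e(m, quantizer)  # insert observers
    m(*inputs)                      # calibration pass
    return convert_pt2e(m)          # produce the Q/DQ-annotated module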
8 changes: 7 additions & 1 deletion backends/qualcomm/utils/utils.py
@@ -58,6 +58,7 @@
from executorch.exir import ExirExportedProgram
from executorch.exir.backend.compile_spec_schema import CompileSpec
from executorch.exir.lowered_backend_module import LoweredBackendModule
from executorch.exir.program._program import _get_updated_graph_signature
from torch._decomp import core_aten_decompositions as torch_core_aten_decompositions
from torch.export.exported_program import ExportedProgram
from torch.fx import passes
@@ -223,7 +224,12 @@ def capture_program(
core_ep.transform(ConvertBinaryOpsWithScalar())
edge_ep = core_ep.to_edge(qnn_edge_config())
_transform(edge_ep.exported_program)

# Since QDQ nodes are stripped, update graph signature again to validate program
Contributor:
Not related to this diff: is it possible to run a mix of fp ops and quantized ops, and is that well supported? I'm asking because we remove all q/dq ops and then insert them back at the I/O, which may limit our ability to do mixed dtypes.

Collaborator Author:
Currently, mixed precision is only supported among quantized ops, since the compiler spec for HTP precision (quantized or fp16) is at graph-level granularity.
We'll land a multi-graph change in the near future; hopefully mechanisms like weight sharing and fp mixed precision can be well supported at that time.

Collaborator Author (@haowhsu-quic, Jul 22, 2024):
To do so, it would be great if the framework interface could provide a runtime option (e.g., an argument to method::execute()) for the backend to react to, such as changing the performance config or selecting which graph in the context to execute.

Contributor:
Yeah, we've been discussing how to pass the runtime option through the interface, ideally via the backend context. Question: do you need it at method::init time or method::execute time?

Collaborator Author:
method::execute looks more flexible for QNN. Looking forward to the change!

edge_ep.exported_program._graph_signature = _get_updated_graph_signature(
edge_ep.exported_program.graph_signature,
edge_ep.exported_program.graph_module,
)
edge_ep.exported_program._validate()
return edge_ep


4 changes: 3 additions & 1 deletion examples/qualcomm/CMakeLists.txt
@@ -106,7 +106,9 @@ target_link_libraries(
qnn_executor_runner qnn_executorch_backend full_portable_ops_lib etdump
${FLATCCRT_LIB} gflags
)
target_link_options(qnn_executor_runner PUBLIC -fsanitize=undefined)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
target_link_options(qnn_executor_runner PUBLIC -fsanitize=undefined)
endif()

# build llama runner
add_executable(qnn_llama_runner ${_qnn_llama_runner__srcs})
18 changes: 2 additions & 16 deletions examples/qualcomm/llama2/llama.py
@@ -8,8 +8,6 @@
import getpass
import json
import os
import shutil
import stat
import time
from multiprocessing.connection import Client

@@ -62,7 +60,6 @@ def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None:
"""
This function is specific for matmul op 16a8w.
"""
from typing import Sequence

from executorch.backends.qualcomm.quantizer.quantizer import (
get_16a8w_qnn_ptq_config,
@@ -294,9 +291,9 @@ def quantize(self, quant_dtype, custom_annotations=()):
fx_graph_module = None

with torch.no_grad():
fx_graph_module = torch._export.capture_pre_autograd_graph(
fx_graph_module = torch.export.export(
self.llama_model, self.inputs
)
).module()
fx_graph_module = prepare_pt2e(fx_graph_module, quantizer)
print("Quantizing the model...")
calibrate(
@@ -343,16 +340,6 @@ def lowering_modules(
constant_methods=self.llama_meta,
compile_config=EdgeCompileConfig(_check_ir_validity=False),
)

setattr(
edge_prog_mgr.exported_program(),
"_graph_signature",
_get_updated_graph_signature(
edge_prog_mgr.exported_program().graph_signature,
edge_prog_mgr.exported_program().graph_module,
),
)

edge_prog_mgr = edge_prog_mgr.to_backend(partitioner)
exec_prog_mgr = edge_prog_mgr.to_executorch(config=executorch_config)
with open(f"{work_space}/{pte_filename}.pte", "wb") as file:
@@ -520,7 +507,6 @@ def post_process():
"-P",
"--ptq",
help="If specified, will do PTQ quantization. default is 16bits activation and 4bits weight. Support 8a8w and 16a4w.",
required=True,
default="16a4w",
)

2 changes: 1 addition & 1 deletion examples/qualcomm/llama2/qaihub_runner/runner.cpp
@@ -9,9 +9,9 @@
// A simple llama2 runner that includes preprocessing and post processing logic.
// The module takes in a string as input and emits a string as output.

#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/examples/qualcomm/llama2/qaihub_runner/runner.h>
#include <executorch/extension/evalue_util/print_evalue.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/runner_util/managed_tensor.h>

#include <ctime>
2 changes: 1 addition & 1 deletion examples/qualcomm/llama2/qaihub_runner/runner.h
@@ -18,8 +18,8 @@
#include <unordered_map>

#include <executorch/examples/models/llama2/sampler/sampler.h>
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/examples/qualcomm/llama2/qaihub_runner/io_memory.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/runner_util/managed_tensor.h>

2 changes: 1 addition & 1 deletion examples/qualcomm/llama2/runner/runner.h
@@ -19,7 +19,7 @@

#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/examples/models/llama2/sampler/sampler.h>
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/runner_util/managed_tensor.h>

2 changes: 1 addition & 1 deletion examples/qualcomm/scripts/export_example.py
@@ -66,7 +66,7 @@
quantizer.set_bit8_op_quant_config(quant_config)

# Typical pytorch 2.0 quantization flow
m = torch._export.capture_pre_autograd_graph(model.eval(), example_inputs)
m = torch.export.export(model.eval(), example_inputs).module()
m = prepare_pt2e(m, quantizer)
# Calibration
m(*example_inputs)
2 changes: 1 addition & 1 deletion examples/qualcomm/scripts/utils.py
@@ -204,7 +204,7 @@ def build_executorch_binary(
else:
raise AssertionError(f"No support for QuantDtype {quant_dtype}.")

captured_model = torch._export.capture_pre_autograd_graph(model, inputs)
captured_model = torch.export.export(model, inputs).module()
annotated_model = prepare_pt2e(captured_model, quantizer)
print("Quantizing the model...")
# calibration
9 changes: 1 addition & 8 deletions exir/backend/backend_api.py
@@ -352,14 +352,7 @@ def to_backend(
ExportedProgram: The input program, with some portions targeted for delegation.
"""
# Use fake program, with FakeTensors in the state dict, to avoid copying large constant values.
# Fall back to deepcopy if no fake mode is found. TODO(T182910699): Remove this fallback.
try:
fake_edge_program = get_fake_program(edge_program)
except Exception as e:
logging.warning(
f"Error in get_fake_program for graph {edge_program.graph_module}, fallback to deepcopy: {e}"
)
fake_edge_program = copy.deepcopy(edge_program)
fake_edge_program = get_fake_program(edge_program)
Contributor:
This change could be split out; we may need to run broader tests in case some paths still rely on the fallback. Glad to see the QNN path can work with the fake edge program! I believe RAM usage will go down quite a bit now.

partitioner_result = partitioner_instance(fake_edge_program)
tagged_exported_program = partitioner_result.tagged_exported_program

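To illustrate the reviewer's memory point, a small hedged example (sizes are arbitrary): FakeTensors keep shape, dtype, and device metadata without allocating storage, so a fake edge program can be handed to the partitioner without duplicating real weights.

import torch
from torch._subclasses.fake_tensor import FakeTensorMode

fake_mode = FakeTensorMode()
real_weight = torch.randn(4096, 4096)            # ~64 MiB of real storage
fake_weight = fake_mode.from_tensor(real_weight)

# Metadata survives while storage does not, so the fake program stays cheap.
print(fake_weight.shape, fake_weight.dtype)      # torch.Size([4096, 4096]) torch.float32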
66 changes: 16 additions & 50 deletions sdk/CMakeLists.txt
@@ -84,25 +84,16 @@ option(EXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT
)

if(EXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT)
# Add the host project. We build this separately so that we can generate
Contributor:
Also, thanks for fixing the SDK build! cc @Olivia-liu @tarun292

# headers on the host during the build, even if we're cross-compiling the
# flatcc runtime to a different architecture.

# lint_cmake: -readability/wonkycase
ExternalProject_Add(
flatcc_project
PREFIX ${CMAKE_BINARY_DIR}/_host_build
SOURCE_DIR ${_flatcc_source_dir}
BINARY_DIR ${CMAKE_BINARY_DIR}/_host_build
CMAKE_CACHE_ARGS
-DFLATCC_TEST:BOOL=OFF -DFLATCC_REFLECTION:BOOL=OFF
# See above comment about POSITION_INDEPENDENT_CODE.
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
INSTALL_COMMAND "" # Prevent the install step
execute_process(
COMMAND ${CMAKE_COMMAND} ${_flatcc_source_dir}
-DFLATCC_TEST=OFF -DFLATCC_REFLECTION=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-B${CMAKE_BINARY_DIR}/_host_build
)
set(_etdump_schema_gen_dep flatcc_project)
execute_process(
COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR}/_host_build
)
set(_etdump_schema_gen_dep)
else()
# If we're not cross-compiling, we can just use the plain commandline target.
set(_etdump_schema_gen_dep flatcc_cli)
endif()

@@ -134,42 +125,11 @@ add_library(
bundled_program_schema INTERFACE ${_bundled_program_schema__outputs}
)

# Ensure the host tool is built before the main project
add_dependencies(etdump_schema flatcc_cli)

file(MAKE_DIRECTORY ${_program_schema__include_dir}/executorch/sdk/etdump)
file(MAKE_DIRECTORY
${_program_schema__include_dir}/executorch/sdk/bundled_program
)

if(EXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT)
# If we cross-compiling, we need to use the version of the commandline tool
# built for the host.
set(_etdump_schema_gen_dep flatcc_project)

# TODO(dbort): flatcc installs its files directly in its source directory
# instead of under CMAKE_BINARY_DIR, and it has no options to avoid doing
# this. We build flatcc twice in the executorch build: once to get the
# `flatcc` host commandline tool, and once to get the (potentially
# cross-compiled) target runtime library. The host build will put its outputs
# in the source tree, making the cross-compiling target build think that the
# outputs have already been built. It will then try to link against the
# host-architecture libraries, failing when cross-compiling. To work around
# this, delete the host outputs after running this command (which only runs
# when setting up the cmake files, not when actually building). This leaves
# room for the target build to put its own files in the source tree. We should
# try to remove this hack, ideally by submitting an upstream PR that adds an
# option to change the installation location.
set(_etdump_schema_cleanup_paths ${_flatcc_source_dir}/bin/*
${_flatcc_source_dir}/lib/*
)
else()
# If we're not cross-compiling we can use the plain commandline target, and we
# don't need to delete any files.
set(_etdump_schema_gen_dep flatcc_cli)
set(_etdump_schema_cleanup_paths "")
endif()

add_custom_command(
OUTPUT ${_etdump_schema__outputs}
COMMAND
@@ -179,13 +139,19 @@ add_custom_command(
${_flatcc_source_dir}/bin/flatcc -cwr -o
${_program_schema__include_dir}/executorch/sdk/etdump
${_etdump_schema__srcs}
COMMAND rm -f ${_etdump_schema_cleanup_paths}
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/sdk
DEPENDS ${_etdump_schema_gen_dep}
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/sdk
COMMENT "Generating etdump headers"
VERBATIM
)

add_custom_target(
etdump_schema_generated
DEPENDS ${_etdump_schema__outputs}
)

add_dependencies(etdump_schema etdump_schema_generated)

add_library(
etdump ${CMAKE_CURRENT_SOURCE_DIR}/etdump/etdump_flatcc.cpp
${CMAKE_CURRENT_SOURCE_DIR}/etdump/emitter.cpp