Qualcomm AI Engine Direct - add program validation (#4297)
Summary:
- update the graph signature so get_fake_program works properly
- make sure the program is valid after capture_program
- retire capture_pre_autograd_graph in favor of torch.export.export (see the sketch below)
- fix a release build error and make the cross-compiled flatcc build deterministic
- assorted minor fixes
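
A minimal sketch of the export-API migration applied across the examples and tests in this change (model and example_inputs stand in for whatever module and sample inputs a caller already has):

    import torch

    # before: graph capture through the now-retired private API
    # m = torch._export.capture_pre_autograd_graph(model, example_inputs)

    # after: capture with the public torch.export entry point, then unwrap the GraphModule
    m = torch.export.export(model, example_inputs).module()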

Pull Request resolved: #4297

Reviewed By: kirklandsign

Differential Revision: D59984308

Pulled By: cccclai

fbshipit-source-id: 9b2a3eaabbc175db6209b12af95639e96fe95b7b
haowhsu-quic authored and facebook-github-bot committed Jul 22, 2024
1 parent f0364e8 commit 0e2b205
Showing 14 changed files with 39 additions and 35 deletions.
3 changes: 0 additions & 3 deletions backends/qualcomm/CMakeLists.txt
@@ -63,9 +63,6 @@ endif()
if(CMAKE_BUILD_TYPE STREQUAL "Release")
# strip symbols
add_link_options("-s")
# hide dynamic symbols
set(CMAKE_C_VISIBILITY_PRESET hidden)
set(CMAKE_CXX_VISIBILITY_PRESET hidden)

# --gc-sections is added by torch.
add_compile_options(
4 changes: 4 additions & 0 deletions backends/qualcomm/builders/op_prelu.py
@@ -43,6 +43,10 @@ def define_node(
coeff_node = node.args[1]
coeff_tensor = torch.zeros(input_node.meta["val"].shape)
coeff = get_parameter(coeff_node, self.edge_program)
# param nodes will be FakeTensor when doing partition
# fill in random numeric for validation
if isinstance(coeff, torch._subclasses.fake_tensor.FakeTensor):
coeff = torch.ones(coeff.shape)
# per-channel activation
if coeff_node.meta["val"].shape[0] > 1:
for i in range(input_node.meta["val"].shape[1]):
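
During partitioning the coefficient parameter can arrive as a FakeTensor, which carries shape and dtype but no storage, so the hunk above substitutes a concrete tensor of ones before validation. A hedged sketch of that pattern as a standalone helper (the helper name is hypothetical):

    import torch
    from torch._subclasses.fake_tensor import FakeTensor

    def materialize_for_validation(param: torch.Tensor) -> torch.Tensor:
        # FakeTensors carry shape/dtype but no storage, so swap in a concrete
        # placeholder of the same shape before any value-based checks.
        if isinstance(param, FakeTensor):
            return torch.ones(param.shape)
        return param
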
16 changes: 11 additions & 5 deletions backends/qualcomm/builders/utils.py
@@ -7,7 +7,14 @@
from typing import Dict, Optional

import torch
from torch._export.utils import get_buffer, get_param, is_buffer, is_param
from torch._export.utils import (
get_buffer,
get_lifted_tensor_constant,
get_param,
is_buffer,
is_lifted_tensor_constant,
is_param,
)


def is_parameter(
@@ -16,7 +23,7 @@ def is_parameter(
return (
is_param(edge_program, node)
or is_buffer(edge_program, node)
or node.name in edge_program.graph_signature.inputs_to_lifted_tensor_constants
or is_lifted_tensor_constant(edge_program, node)
)


@@ -28,9 +35,8 @@ def get_parameter(
param = get_param(edge_program, node)
if is_buffer(edge_program, node):
param = get_buffer(edge_program, node)
if node.name in edge_program.graph_signature.inputs_to_lifted_tensor_constants:
name = edge_program.graph_signature.inputs_to_lifted_tensor_constants[node.name]
param = edge_program.constants[name]
if is_lifted_tensor_constant(edge_program, node):
param = get_lifted_tensor_constant(edge_program, node)
if param is not None:
# update node.meta["val"] to qualified QNN datatype (e.g. i64 to i32)
assert isinstance(param, torch.Tensor), "Expect parameter to be tensor"
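
With the new torch._export.utils helpers, lifted tensor constants are handled the same way as params and buffers. A minimal sketch of how a builder might use these wrappers when resolving a placeholder node's tensor (the import path and argument order are assumptions based on this file's location and usage):

    from executorch.backends.qualcomm.builders.utils import get_parameter, is_parameter

    # node: a placeholder torch.fx.Node; edge_program: the ExportedProgram being lowered
    if is_parameter(node, edge_program):
        tensor = get_parameter(node, edge_program)  # param, buffer, or lifted constant
    else:
        tensor = None  # runtime activation input; nothing to fetch at build time
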
2 changes: 1 addition & 1 deletion backends/qualcomm/passes/recompose_pixel_unshuffle.py
@@ -24,7 +24,7 @@ def __init__(self, quantization_capture=False):

self.quantization_capture = quantization_capture
if quantization_capture:
self.reshape_target = torch.ops.aten.reshape.default
self.reshape_target = torch.ops.aten._unsafe_view.default
self.permute_target = torch.ops.aten.permute.default
self.view_target = torch.ops.aten.view.default
self.op = torch.ops.aten.pixel_unshuffle.default
7 changes: 5 additions & 2 deletions backends/qualcomm/scripts/build.sh
@@ -29,6 +29,7 @@ CMAKE_X86_64="build_x86_64"
BUILD_AARCH64="true"
CMAKE_AARCH64="build_android"
CLEAN="true"
BUILD_TYPE="Debug"

if [ -z PYTHON_EXECUTABLE ]; then
PYTHON_EXECUTABLE="python3"
@@ -38,7 +39,7 @@ if [ -z BUCK2 ]; then
BUCK2="buck2"
fi

long_options=skip_x86_64,skip_aarch64,no_clean
long_options=skip_x86_64,skip_aarch64,no_clean,release

parsed_args=$(getopt -a --options '' --longoptions $long_options --name "$0" -- "$@")
eval set -- "$parsed_args"
@@ -49,6 +50,7 @@ while true ; do
--skip_x86_64) BUILD_X86_64="false"; shift;;
--skip_aarch64) BUILD_AARCH64="false"; shift;;
--no_clean) CLEAN="false"; shift;;
--release) BUILD_TYPE="Release"; shift;;
--) shift; break;;
esac
done
@@ -66,9 +68,9 @@ if [ "$BUILD_AARCH64" = true ]; then
fi

cd $BUILD_ROOT
# If we build debug type, we need to change flatcc to flatcc_d
cmake .. \
-DCMAKE_INSTALL_PREFIX=$BUILD_ROOT \
-DCMAKE_BUILD_TYPE=$BUILD_TYPE \
-DEXECUTORCH_BUILD_QNN=ON \
-DEXECUTORCH_BUILD_SDK=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
@@ -87,6 +89,7 @@

cmake $PRJ_ROOT/$EXAMPLE_ROOT \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
-DCMAKE_BUILD_TYPE=$BUILD_TYPE \
-DANDROID_ABI='arm64-v8a' \
-DANDROID_NATIVE_API_LEVEL=23 \
-DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
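
With the new flag, a release build of both the x86-64 and Android targets can presumably be requested as backends/qualcomm/scripts/build.sh --release; omitting the flag keeps the previous Debug behavior.
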
2 changes: 1 addition & 1 deletion backends/qualcomm/tests/utils.py
@@ -302,7 +302,7 @@ def get_qdq_module(
custom_quant_annotations: Tuple[Callable] = (),
quant_dtype: QuantDtype = QuantDtype.use_8a8w,
) -> torch.fx.GraphModule:
m = torch._export.capture_pre_autograd_graph(module, inputs)
m = torch.export.export(module, inputs).module()

quantizer = QnnQuantizer()
quantizer.add_custom_quant_annotations(custom_quant_annotations)
8 changes: 7 additions & 1 deletion backends/qualcomm/utils/utils.py
@@ -58,6 +58,7 @@
from executorch.exir import ExirExportedProgram
from executorch.exir.backend.compile_spec_schema import CompileSpec
from executorch.exir.lowered_backend_module import LoweredBackendModule
from executorch.exir.program._program import _get_updated_graph_signature
from torch._decomp import core_aten_decompositions as torch_core_aten_decompositions
from torch.export.exported_program import ExportedProgram
from torch.fx import passes
@@ -223,7 +224,12 @@ def capture_program(
core_ep.transform(ConvertBinaryOpsWithScalar())
edge_ep = core_ep.to_edge(qnn_edge_config())
_transform(edge_ep.exported_program)

# Since QDQ nodes are stripped, update graph signature again to validate program
edge_ep.exported_program._graph_signature = _get_updated_graph_signature(
edge_ep.exported_program.graph_signature,
edge_ep.exported_program.graph_module,
)
edge_ep.exported_program._validate()
return edge_ep


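
A minimal sketch of how the validated program might be consumed after this change (the module and sample inputs are placeholders, and the positional signature of capture_program is an assumption):

    from executorch.backends.qualcomm.utils.utils import capture_program

    # capture_program now refreshes the graph signature after QDQ nodes are
    # stripped and calls _validate(), so an inconsistent program fails here
    # rather than surfacing later during lowering.
    edge_ep = capture_program(model, example_inputs)
    exported_program = edge_ep.exported_program  # already re-signed and validated
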
4 changes: 3 additions & 1 deletion examples/qualcomm/CMakeLists.txt
@@ -106,7 +106,9 @@ target_link_libraries(
qnn_executor_runner qnn_executorch_backend full_portable_ops_lib etdump
${FLATCCRT_LIB} gflags
)
target_link_options(qnn_executor_runner PUBLIC -fsanitize=undefined)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
target_link_options(qnn_executor_runner PUBLIC -fsanitize=undefined)
endif()

# build llama runner
add_executable(qnn_llama_runner ${_qnn_llama_runner__srcs})
18 changes: 2 additions & 16 deletions examples/qualcomm/llama2/llama.py
@@ -8,8 +8,6 @@
import getpass
import json
import os
import shutil
import stat
import time
from multiprocessing.connection import Client

@@ -62,7 +60,6 @@ def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None:
"""
This function is specific for matmul op 16a8w.
"""
from typing import Sequence

from executorch.backends.qualcomm.quantizer.quantizer import (
get_16a8w_qnn_ptq_config,
@@ -294,9 +291,9 @@ def quantize(self, quant_dtype, custom_annotations=()):
fx_graph_module = None

with torch.no_grad():
fx_graph_module = torch._export.capture_pre_autograd_graph(
fx_graph_module = torch.export.export(
self.llama_model, self.inputs
)
).module()
fx_graph_module = prepare_pt2e(fx_graph_module, quantizer)
print("Quantizing the model...")
calibrate(
@@ -343,16 +340,6 @@ def lowering_modules(
constant_methods=self.llama_meta,
compile_config=EdgeCompileConfig(_check_ir_validity=False),
)

setattr(
edge_prog_mgr.exported_program(),
"_graph_signature",
_get_updated_graph_signature(
edge_prog_mgr.exported_program().graph_signature,
edge_prog_mgr.exported_program().graph_module,
),
)

edge_prog_mgr = edge_prog_mgr.to_backend(partitioner)
exec_prog_mgr = edge_prog_mgr.to_executorch(config=executorch_config)
with open(f"{work_space}/{pte_filename}.pte", "wb") as file:
@@ -520,7 +507,6 @@ def post_process():
"-P",
"--ptq",
help="If specified, will do PTQ quantization. default is 16bits activation and 4bits weight. Support 8a8w and 16a4w.",
required=True,
default="16a4w",
)

2 changes: 1 addition & 1 deletion examples/qualcomm/llama2/qaihub_runner/runner.cpp
@@ -9,9 +9,9 @@
// A simple llama2 runner that includes preprocessing and post processing logic.
// The module takes in a string as input and emits a string as output.

#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/examples/qualcomm/llama2/qaihub_runner/runner.h>
#include <executorch/extension/evalue_util/print_evalue.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/runner_util/managed_tensor.h>

#include <ctime>
2 changes: 1 addition & 1 deletion examples/qualcomm/llama2/qaihub_runner/runner.h
@@ -18,8 +18,8 @@
#include <unordered_map>

#include <executorch/examples/models/llama2/sampler/sampler.h>
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/examples/qualcomm/llama2/qaihub_runner/io_memory.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/runner_util/managed_tensor.h>

2 changes: 1 addition & 1 deletion examples/qualcomm/llama2/runner/runner.h
@@ -19,7 +19,7 @@

#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/examples/models/llama2/sampler/sampler.h>
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/runner_util/managed_tensor.h>

2 changes: 1 addition & 1 deletion examples/qualcomm/scripts/export_example.py
@@ -66,7 +66,7 @@
quantizer.set_bit8_op_quant_config(quant_config)

# Typical pytorch 2.0 quantization flow
m = torch._export.capture_pre_autograd_graph(model.eval(), example_inputs)
m = torch.export.export(model.eval(), example_inputs).module()
m = prepare_pt2e(m, quantizer)
# Calibration
m(*example_inputs)
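
For context, the "typical pytorch 2.0 quantization flow" referenced above now starts from torch.export rather than the private capture API; a hedged end-to-end sketch (quantizer configuration and calibration inputs are placeholders):

    import torch
    from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
    from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer

    quantizer = QnnQuantizer()
    m = torch.export.export(model.eval(), example_inputs).module()
    m = prepare_pt2e(m, quantizer)  # insert observers
    m(*example_inputs)              # calibration pass
    m = convert_pt2e(m)             # produce the quantized (QDQ) module
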
2 changes: 1 addition & 1 deletion examples/qualcomm/scripts/utils.py
@@ -204,7 +204,7 @@ def build_executorch_binary(
else:
raise AssertionError(f"No support for QuantDtype {quant_dtype}.")

captured_model = torch._export.capture_pre_autograd_graph(model, inputs)
captured_model = torch.export.export(model, inputs).module()
annotated_model = prepare_pt2e(captured_model, quantizer)
print("Quantizing the model...")
# calibration
