diff --git a/backends/xnnpack/cmake/Dependencies.cmake b/backends/xnnpack/cmake/Dependencies.cmake
index 40e4e72c38b..b76c54bee60 100644
--- a/backends/xnnpack/cmake/Dependencies.cmake
+++ b/backends/xnnpack/cmake/Dependencies.cmake
@@ -36,6 +36,10 @@ set(XNNPACK_ENABLE_AVXVNNI
     OFF
     CACHE BOOL ""
 )
+set(XNNPACK_ENABLE_KLEIDIAI
+    OFF
+    CACHE BOOL ""
+)
 add_subdirectory("${XNNPACK_SOURCE_DIR}")
 include_directories(SYSTEM ${XNNPACK_INCLUDE_DIR})
 list(APPEND xnnpack_third_party XNNPACK)
diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp
index ac53831b04c..6fae9f4221b 100644
--- a/backends/xnnpack/runtime/XNNCompiler.cpp
+++ b/backends/xnnpack/runtime/XNNCompiler.cpp
@@ -21,6 +21,25 @@ namespace executor {
 namespace xnnpack {
 namespace delegate {
 
+/*
+ * Provide compile-time allocation.
+ */
+class CompileAllocator {
+ public:
+  /*
+   * Allocate memory which will be automatically freed at the end
+   * of the compilation process.
+   */
+  void* allocateTemporary(size_t size) {
+    auto mem = new uint8_t[size];
+    temporaries_.emplace_back(mem);
+    return mem;
+  }
+
+ private:
+  std::vector<std::unique_ptr<uint8_t[]>> temporaries_;
+};
+
 // Flatbuffer types
 using ValuePtr = const fb_xnnpack::XValue*;
 using NodePtr = const fb_xnnpack::XNode*;
@@ -35,6 +54,23 @@ using DefineNodeFunc = Error (*)(
     const std::unordered_map<uint32_t, uint32_t>&,
     NodePtr) noexcept;
 
+/*
+Convert a tensor from fp32 to bf16.
+*/
+void convertF32TensorToBF16(
+    const float* f32_data,
+    uint16_t* bf16_data_out,
+    size_t numel) {
+  for (auto i = 0u; i < numel; i++) {
+    // Adjust the f32 value such that it rounds properly after truncation.
+    // Constant factor scales 1+2^-8 to 1+2^-7.
+    float f32_adjusted = f32_data[i] * 1.00389105f;
+    uint32_t f32_bits;
+    memcpy(&f32_bits, &f32_adjusted, sizeof(float));
+    bf16_data_out[i] = static_cast<uint16_t>(f32_bits >> 16);
+  }
+}
+
 /*
 Gets the output min and output max for a given node operator
 */
@@ -152,7 +188,8 @@ Error defineTensor(
     GraphPtr flatbuffer_graph,
     const uint8_t* constant_data_ptr,
     std::vector<uint32_t>& input_ids,
-    std::vector<uint32_t>& output_ids) {
+    std::vector<uint32_t>& output_ids,
+    CompileAllocator& allocator) {
   const fb_xnnpack::XNNTensorValue* tensor_value = nullptr;
   const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr;
 
@@ -356,12 +393,31 @@ Error defineTensor(
         size_t group_size = qparams->group_size();
         size_t output_channels = tensor_value->dims()->Get(0);
         size_t input_channels = tensor_value->dims()->Get(1);
+
+        const uint16_t* scale_data = nullptr;
+        uint32_t scale_numel = 0;
+
+        // Block scales are preferably serialized as bf16 but can also be
+        // serialized as fp32 for backward compatibility.
+        if (qparams->scale_bf16() != nullptr) {
+          scale_data =
+              static_cast<const uint16_t*>(qparams->scale_bf16()->data());
+          scale_numel = qparams->scale_bf16()->size();
+        } else {
+          // Read fp32 scales, convert to bf16.
+          auto conv_buffer = static_cast<uint16_t*>(allocator.allocateTemporary(
+              qparams->scale()->size() * sizeof(uint16_t)));
+          scale_numel = qparams->scale()->size();
+          convertF32TensorToBF16(
+              qparams->scale()->data(), conv_buffer, scale_numel);
+          scale_data = conv_buffer;
+        }
+
         ET_CHECK_OR_RETURN_ERROR(
-            qparams->scale()->size() ==
-                output_channels * input_channels / group_size,
+            scale_numel == output_channels * input_channels / group_size,
             Internal,
             "scale size %zu != output channels %zu * group size %zu",
-            (size_t)qparams->scale()->size(),
+            static_cast<size_t>(scale_numel),
             output_channels,
             group_size);
         int32_t zero_point =
@@ -370,18 +426,19 @@ Error defineTensor(
             Debug,
             "define quant tensor (per channel group): buffer_ptr: %p, scale.numel(): %u, channel_dim: %u, grpup_size: %zu, output_channels: %zu, dtype: %u, zero_point: %d, datatype: %d\n",
             buffer_ptr,
-            qparams->scale()->size(),
+            scale_numel,
             qparams->channel_dim(),
             group_size,
             output_channels,
             datatype,
             zero_point,
             datatype);
+
         status = xnn_define_blockwise_quantized_tensor_value(
             /*subgraph=*/subgraph_ptr,
             /*datatype=*/datatype,
             /*zero_point=*/zero_point,
-            /*scale=*/qparams->scale()->data(),
+            /*scale=*/scale_data,
             /*num_dims=*/tensor_value->num_dims(),
             /*channel_dim=*/qparams->channel_dim(),
             /*block_size=*/qparams->group_size(),
@@ -1617,6 +1674,7 @@ ET_NODISCARD Error XNNCompiler::compileModel(
   Result<XNNHeader> header = XNNHeader::Parse(buffer_pointer, num_bytes);
   const uint8_t* flatbuffer_data = nullptr;
   const uint8_t* constant_data = nullptr;
+  CompileAllocator compile_allocator;
 
   // Header status can only either be Error::Ok or Error::NotFound
   if (header.ok()) {
@@ -1688,7 +1746,8 @@ ET_NODISCARD Error XNNCompiler::compileModel(
         flatbuffer_graph,
         constant_data,
         input_ids,
-        output_ids);
+        output_ids,
+        compile_allocator);
 
     if (err != Error::Ok) {
       return err;
diff --git a/backends/xnnpack/serialization/runtime_schema.fbs b/backends/xnnpack/serialization/runtime_schema.fbs
index f32e7c60637..efe717e085e 100644
--- a/backends/xnnpack/serialization/runtime_schema.fbs
+++ b/backends/xnnpack/serialization/runtime_schema.fbs
@@ -63,6 +63,7 @@ table PerChannelGroupQuant {
   scale:[float];
   channel_dim:int;
   group_size:int;
+  scale_bf16:[ushort];
 }
 
 table XNNTensorValue {
diff --git a/backends/xnnpack/serialization/schema.fbs b/backends/xnnpack/serialization/schema.fbs
index 773a459bbf6..33571195d63 100644
--- a/backends/xnnpack/serialization/schema.fbs
+++ b/backends/xnnpack/serialization/schema.fbs
@@ -48,6 +48,7 @@ table PerChannelGroupQuant {
   scale:[float];
   channel_dim:int;
   group_size:int;
+  scale_bf16:[ushort];
 }
 
 table PerChannelQuant {
diff --git a/backends/xnnpack/test/ops/linear.py b/backends/xnnpack/test/ops/linear.py
index d886ce26694..d8de79f283d 100644
--- a/backends/xnnpack/test/ops/linear.py
+++ b/backends/xnnpack/test/ops/linear.py
@@ -407,8 +407,8 @@ def test_qd8_per_channel_linear_parallel_and_sequential(self):
         )
 
     def test_qd8_fp32_per_token_weight_per_channel_group_int4(self):
         M_sizes = [1, 2, 17, 31]
-        K_sizes = [8, 32, 64, 128]
-        bl_sizes = [8, 16, 16, 32]
+        K_sizes = [32, 32, 64, 128]
+        bl_sizes = [32, 32, 32, 64]
         N_sizes = [2, 17, 92, 128]
         for use_bias in [True, False]:
@@ -430,8 +430,8 @@ def test_qd8_fp32_per_token_weight_per_channel_group_int4(self):
                 )
 
     def test_qd8_fp16_per_token_weight_per_channel_group_int4(self):
         M_sizes = [1, 2, 17, 31]
-        K_sizes = [8, 32, 64, 128]
-        bl_sizes = [8, 16, 16, 32]
+        K_sizes = [32, 32, 64, 128]
+        bl_sizes = [32, 32, 32, 64]
         N_sizes = [2, 17, 92, 128]
         for use_bias in [True, False]:
@@ -602,8 +602,8 @@ def _test_groupwise_dq_linear(
         use_bias: bool = False,
         group_size: int = 8,
         num_linears: int = 1,
-        atol: float = 1e-3,
-        rtol: float = 1e-3,
+        atol: float = 5e-3,
+        rtol: float = 5e-3,
     ):
         quantize_(mod, int8_dynamic_activation_int4_weight(group_size=group_size))
         unwrap_tensor_subclass(mod)
diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK
index 1d139a3b4b7..87ee0b46b83 160000
--- a/backends/xnnpack/third-party/XNNPACK
+++ b/backends/xnnpack/third-party/XNNPACK
@@ -1 +1 @@
-Subproject commit 1d139a3b4b7155889c88c31f370a82c48e7ca89c
+Subproject commit 87ee0b46b834f67bad9025d4a82ed5654f3403d3
diff --git a/backends/xnnpack/third-party/generate-xnnpack-wrappers.py b/backends/xnnpack/third-party/generate-xnnpack-wrappers.py
index bda79527178..e9b23e4a784 100644
--- a/backends/xnnpack/third-party/generate-xnnpack-wrappers.py
+++ b/backends/xnnpack/third-party/generate-xnnpack-wrappers.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 
 from __future__ import print_function
+from pathlib import Path
 import collections
 import os
 import sys
@@ -36,8 +37,8 @@
     "PROD_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_AVX512SKX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_AVX512VBMI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-    "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_AVX512VNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_RVV_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)",
     "PROD_AVXVNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "AARCH32_ASM_MICROKERNEL_SRCS": "defined(__arm__)",
@@ -46,7 +47,7 @@
     # add non-prod microkernel sources here:
 }
 
-SRC_NAMES = set([
+SRC_NAMES = {
     "OPERATOR_SRCS",
     "SUBGRAPH_SRCS",
     "LOGGING_SRCS",
@@ -81,30 +82,42 @@
     "PROD_AVX512F_MICROKERNEL_SRCS",
     "PROD_AVX512SKX_MICROKERNEL_SRCS",
     "PROD_AVX512VBMI_MICROKERNEL_SRCS",
-    "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS",
     "PROD_AVX512VNNI_MICROKERNEL_SRCS",
+    "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS",
     "PROD_RVV_MICROKERNEL_SRCS",
     "PROD_AVXVNNI_MICROKERNEL_SRCS",
     "AARCH32_ASM_MICROKERNEL_SRCS",
     "AARCH64_ASM_MICROKERNEL_SRCS",
     # add non-prod microkernel sources here:
-])
+}
 
 
 def handle_singleline_parse(line):
     start_index = line.find("(")
     end_index = line.find(")")
     line = line[start_index+1:end_index]
     key_val = line.split(" ")
-    return key_val[0], list(map(lambda x: x[4:], key_val[1:]))
+    return key_val[0], [x[4:] for x in key_val[1:]]
 
 
 def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"):
+    print(f"Updating sources from {cmakefile}")
     sources = collections.defaultdict(list)
     with open(os.path.join(xnnpack_path, cmakefile)) as cmake:
         lines = cmake.readlines()
         i = 0
         while i < len(lines):
             line = lines[i]
+
+            if lines[i].startswith("INCLUDE"):
+                file, _ = handle_singleline_parse(line)
+                if file.startswith("cmake/gen/"):
+                    path = Path(xnnpack_path) / "XNNPACK" / file
+                    local_sources = update_sources(xnnpack_path, path.absolute().as_posix())
+                    for k,v in local_sources.items():
+                        if k in sources:
+                            sources[k] = sources[k] + local_sources[k]
+                        else:
+                            sources[k] = local_sources[k]
+
             if lines[i].startswith("SET") and "src/" in lines[i]:
                 name, val = handle_singleline_parse(line)
@@ -132,7 +145,7 @@ def gen_wrappers(xnnpack_path):
     xnnpack_sources = collections.defaultdict(list)
     sources = update_sources(xnnpack_path)
 
-    microkernels_sources = update_sources(xnnpack_path, "XNNPACK/cmake/microkernels.cmake")
+    microkernels_sources = update_sources(xnnpack_path, "XNNPACK/cmake/gen/microkernels.cmake")
     for key in microkernels_sources:
         sources[key] = microkernels_sources[key]
 
@@ -186,6 +199,8 @@ def gen_wrappers(xnnpack_path):
 
 
 def main(argv):
+    print("Generating wrappers...")
+
     if argv is None or len(argv) == 0:
         gen_wrappers(".")
     else:
diff --git a/backends/xnnpack/third-party/xnnpack.buck.bzl b/backends/xnnpack/third-party/xnnpack.buck.bzl
index a1add446643..d2446a47b2a 100644
--- a/backends/xnnpack/third-party/xnnpack.buck.bzl
+++ b/backends/xnnpack/third-party/xnnpack.buck.bzl
@@ -1,7 +1,6 @@
 load("//third-party:glob_defs.bzl", "subdir_glob")
 load(
     ":xnnpack_src_defs.bzl",
-    "JIT_SRCS",
     "LOGGING_SRCS",
     "OPERATOR_SRCS",
     "SUBGRAPH_SRCS",
@@ -69,27 +68,6 @@ def define_xnnpack():
         ],
     )
 
-    # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode.
-    native.cxx_library(
-        name = "jit_memory",
-        srcs = JIT_SRCS,
-        headers = subdir_glob([
-            ("XNNPACK/src", "**/*.h"),
-        ]),
-        header_namespace = "",
-        compiler_flags = [
-            "-std=c++17",
-        ],
-        preferred_linkage = "static",
-        preprocessor_flags = [
-            "-DXNN_LOG_LEVEL=0",
-        ],
-        exported_deps = [
-            ":clog",
-            ":interface",
-        ],
-    )
-
     # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode.
     native.cxx_library(
         name = "operators",
@@ -139,7 +117,6 @@ def define_xnnpack():
         preferred_linkage = "static",
         preprocessor_flags = [
             "-DXNN_LOG_LEVEL=0",
-            "-DXNN_ENABLE_JIT=0",
             "-DXNN_ENABLE_SPARSE=0",
             "-DXNN_ENABLE_GEMM_M_SPECIALIZATION=0",
             "-DXNN_ENABLE_MEMOPT",
@@ -1223,7 +1200,6 @@ def define_xnnpack():
     ]
 
     ARM_XNNPACK_DEPS = [
-        ":jit_memory",
         ":ukernels_armsimd32",
         ":ukernels_fp16arith",
        ":ukernels_asm",
@@ -1246,7 +1222,6 @@ def define_xnnpack():
             "XNNPACK/src/configs/hardware-config.c",
             "XNNPACK/src/microparams-init.c",
             "XNNPACK/src/operator-run.c",
-            "XNNPACK/src/operators/post-operation.c",
             "XNNPACK/src/microkernel-utils.c",
         ],
         headers = subdir_glob([
@@ -1271,7 +1246,6 @@ def define_xnnpack():
             "-DXNN_NO_X8_OPERATORS",
             "-DXNN_ENABLE_MEMOPT",
             "-DXNN_ENABLE_SPARSE=0",
-            "-DXNN_ENABLE_JIT=0",
             "-DXNN_ENABLE_ASSEMBLY",
             "-DXNN_ENABLE_GEMM_M_SPECIALIZATION",
             "-DXNN_ENABLE_ARM_DOTPROD",
diff --git a/backends/xnnpack/third-party/xnnpack_src_defs.bzl b/backends/xnnpack/third-party/xnnpack_src_defs.bzl
index 0a0beba7efd..751b372770d 100644
--- a/backends/xnnpack/third-party/xnnpack_src_defs.bzl
+++ b/backends/xnnpack/third-party/xnnpack_src_defs.bzl
@@ -493,30 +493,18 @@ AARCH64_ASM_MICROKERNEL_SRCS = [
     "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S",
     "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S",
     "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S",
-    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S",
-    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S",
     "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S",
     "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S",
     "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S",
"XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", ] XNNPACK_SRCS = [ diff --git a/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl b/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl index 2dbb41ff01b..76788517873 100644 --- a/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl +++ b/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl @@ -447,28 +447,16 @@ AARCH64_ASM_MICROKERNEL_SRCS = [ "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - 
"xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", ]