Skip to content

Commit

Permalink
Merge branch 'master' into img_patch29_qs8_vcvt
Browse files Browse the repository at this point in the history
  • Loading branch information
KaustubhIMG authored Sep 26, 2024
2 parents 699384e + 2286715 commit 0de259e
Show file tree
Hide file tree
Showing 49 changed files with 2,128 additions and 91 deletions.
2 changes: 2 additions & 0 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -938,6 +938,8 @@ xnnpack_cxx_library(
"@KleidiAI//kai/ukernels/matmul",
"@KleidiAI//kai/ukernels/matmul:rhs_pack_kxn_qsi4cxp_qsu4cxs1s0",
"@KleidiAI//kai/ukernels/matmul:rhs_pack_nxk_qsi4cxp_qsu4cxs1s0",
"@KleidiAI//kai/ukernels/matmul:rhs_pack_nxk_qsi4c32p_qsu4c32s1s0",
"@KleidiAI//kai/ukernels/matmul:rhs_pack_kxn_qsi4c32p_qsu4c32s1s0",
]),
)

Expand Down
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1596,6 +1596,7 @@ IF(XNNPACK_BUILD_TESTS)
qd8-f32-qc4w-gemm-minmax
qd8-f32-qc8w-igemm-minmax
qp8-f32-qc4w-gemm-minmax
qp8-f32-qb4w-gemm-minmax
qs8-qc8w-gemm-minmax-fp32
qs8-qc8w-igemm-minmax-fp32
qu8-gemm-minmax-fp32
Expand Down Expand Up @@ -2008,6 +2009,7 @@ IF(XNNPACK_BUILD_BENCHMARKS)
qd8-f32-qc4w-gemm
qd8-f32-qc8w-gemm
qp8-f32-qc4w-gemm
qp8-f32-qb4w-gemm
qs16-qs8-vcvt
qs8-dwconv
qs8-f16-vcvt
Expand Down
125 changes: 124 additions & 1 deletion bench/gemm-benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -879,6 +879,130 @@ void GEMMBenchmark(benchmark::State& state,
benchmark::Counter::kIsRate);
}


// Benchmarks a QP8 (packed, dynamically-quantized 8-bit LHS) x QB4W
// (blockwise 4-bit weights) GEMM microkernel.
//
// `gemm`          - the microkernel under test.
// `init_params`   - initializer for the f32 qb4w minmax params struct.
// `pack_weights`  - RHS weights-and-biases packing function.
// `packed_stride` - returns the per-N-tile stride of the packed RHS.
// `mr`/`nr`/`kr`/`sr` - kernel tile sizes; `mr_packed` is the M-tile the
//                   LHS packing routine uses (may differ from `mr` for
//                   `mstep` kernels).
void GEMMBenchmark(benchmark::State& state,
                   xnn_qp8_f32_qb4w_gemm_minmax_ukernel_fn gemm,
                   xnn_init_f32_qb4w_minmax_params_fn init_params,
                   xnn_pack_weights_and_biases_fn pack_weights,
                   xnn_packed_stride_weights_and_biases_fn packed_stride,
                   size_t mr, size_t nr, size_t kr, size_t sr, size_t mr_packed,
                   benchmark::utils::IsaCheckFunction isa_check) {
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t bl = state.range(3);
  // Round K up to even: packed 4-bit weights hold two nibbles per byte, so
  // every row must be byte-aligned.
  const size_t kc = round_up(state.range(2), 2UL);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f),
                          std::ref(rng));
  auto u8rng = std::bind(std::uniform_int_distribution<int32_t>(
                             0, std::numeric_limits<uint8_t>::max()),
                         std::ref(rng));
  auto scalerng = std::bind(std::uniform_real_distribution<float>(0.5f, 2.f),
                            std::ref(rng));

  const size_t k2 = round_up_po2(kc, 2);  // tester assumes byte aligned rows

  std::vector<float> a(mc * k2);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  // Two 4-bit weights per byte.
  std::vector<uint8_t> k(nc * k2 / 2);
  std::generate(k.begin(), k.end(), std::ref(u8rng));

  // Create a fake `gemm_config` for the packing functions.
  struct xnn_gemm_config gemm_config;
  gemm_config.mr = static_cast<uint8_t>(mr);
  gemm_config.mr_packed = static_cast<uint8_t>(mr_packed);
  gemm_config.nr = static_cast<uint8_t>(nr);
  // floor(log2(kr)) / floor(log2(sr)); kr and sr are powers of two.
  gemm_config.log2_kr = static_cast<uint8_t>(31 - math_clz_nonzero_u32(kr));
  gemm_config.log2_sr = static_cast<uint8_t>(31 - math_clz_nonzero_u32(sr));

  const size_t packed_w_stride =
      packed_stride(&gemm_config, k2, /*k_stride=*/bl, /*extra_bytes=*/0);
  const size_t packed_w_size = packed_w_stride * round_up(nc, nr);

  const size_t c_elements = mc * nc;
  // Enough buffers to exceed the last-level cache so W and C are cold on
  // every timed iteration.
  const size_t num_buffers =
      1 + benchmark::utils::DivideRoundUp<size_t>(
              benchmark::utils::GetMaxCacheSize(),
              sizeof(float) * (packed_w_size + c_elements));

  std::vector<char, AlignedAllocator<char, 64>> w(packed_w_size * num_buffers);
  std::fill(w.begin(), w.end(), 0);

  // Quantize the left-hand operand. Note that the LHS is packed with
  // `mr_packed`, so any offset into `input_qp8` below must be computed with
  // `mr_packed` as well.
  const size_t input_packed_size =
      xnn_x8_packq_f32qp8_packed_size(mc, k2, mr_packed, kr, sr);
  std::vector<int8_t> input_qp8(input_packed_size);
  xnn_x8_packq_f32qp8_ukernel__scalar_u1(mc, k2, mr_packed, kr, sr,
                                         /*m_idx_start=*/0, a.data(),
                                         /*lhs_stride=*/k2 * sizeof(float),
                                         input_qp8.data());

  // RHS packing: one scale per `bl`-sized block of K per output channel.
  // NOTE(review): scales are produced with math_cvt_bf16_fp32 but stored in a
  // vector of xnn_float16 — confirm the packing function expects bf16 bit
  // patterns here.
  std::vector<xnn_float16> kernel_scale2d(nc * k2 / bl);
  std::generate(kernel_scale2d.begin(), kernel_scale2d.end(),
                [&]() { return math_cvt_bf16_fp32(scalerng()); });
  const xnn_qs8_qc4w_packing_params packing_params = {/*input_zero_point=*/1,
                                                      /*kernel_zero_point=*/8};
  pack_weights(/*flags=*/0, &gemm_config, k2, nc,
               /*groups=*/1, /*k_stride=*/bl,
               /*accumulator_init=*/nullptr,
               /*weights=*/k.data(),
               /*int_extra_data0_fn=*/nullptr,
               /*extra_data0=*/nullptr,
               /*extra_data0_size=*/0,
               /*init_extra_data1_fn=*/
               nullptr,
               /*extra_data1=*/kernel_scale2d.data(),
               /*extra_data1_size=*/sizeof(float),
               /*packed_weights_ptr=*/w.data(), &packing_params);

  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  // Prepare parameters.
  xnn_f32_qb4w_minmax_params minmax_params;
  init_params(&minmax_params, std::numeric_limits<int8_t>::min(),
              std::numeric_limits<int8_t>::max(), 8, bl);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache
    // state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size());
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      // Offset into the packed LHS must use the same parameters the packing
      // call above used: row width `k2` and M-tile `mr_packed`. (The previous
      // code passed `kc` and `mr`, which points at the wrong rows for `mstep`
      // kernels where mr != mr_packed.)
      gemm(mb, nc, kc,
           input_qp8.data() +
               xnn_x8_packq_f32qp8_packed_offset(m, k2, mr_packed, kr, sr),
           w.data() + packed_w_size * buffer_index,
           c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float),
           sizeof(float), &minmax_params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 ops (multiply + add) per element of the M x N x K problem, per second.
  state.counters["OPS"] = benchmark::Counter(
      static_cast<uint64_t>(state.iterations()) * 2 * mc * nc * kc,
      benchmark::Counter::kIsRate);
}

void GEMMBenchmark(benchmark::State& state, xnn_qu8_gemm_minmax_ukernel_fn gemm,
xnn_init_qu8_conv_minmax_params_fn init_params,
xnn_pack_qu8_gemm_fn pack, size_t mr, size_t nr, size_t kr,
Expand Down Expand Up @@ -1194,4 +1318,3 @@ void GEMMBenchmark(benchmark::State& state, xnn_f16_gemm_minmax_ukernel_fn gemm,
benchmark::Counter(uint64_t(state.iterations()) * 2 * mc * nc * kc,
benchmark::Counter::kIsRate);
}

9 changes: 9 additions & 0 deletions bench/gemm-benchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,15 @@ void GEMMBenchmark(benchmark::State& state,
size_t nr, size_t kr, size_t sr, size_t mr_packed,
benchmark::utils::IsaCheckFunction isa_check);

// Benchmark entry point for QP8 (packed dynamically-quantized 8-bit LHS) x
// QB4W (blockwise 4-bit weight) GEMM microkernels. `pack_weights` and
// `packed_stride` pack the RHS; `mr_packed` is the M-tile used when packing
// the LHS, which may differ from the kernel's `mr` for `mstep` kernels.
void GEMMBenchmark(benchmark::State& state,
                   xnn_qp8_f32_qb4w_gemm_minmax_ukernel_fn gemm,
                   xnn_init_f32_qb4w_minmax_params_fn init_params,
                   xnn_pack_weights_and_biases_fn pack_weights,
                   xnn_packed_stride_weights_and_biases_fn packed_stride,
                   size_t mr,
                   size_t nr, size_t kr, size_t sr, size_t mr_packed,
                   benchmark::utils::IsaCheckFunction isa_check);

void GEMMBenchmark(benchmark::State& state, xnn_qu8_gemm_minmax_ukernel_fn gemm,
xnn_init_qu8_conv_minmax_params_fn init_params,
xnn_pack_qu8_gemm_fn pack, size_t mr, size_t nr, size_t kr,
Expand Down
4 changes: 2 additions & 2 deletions bench/models/qs8-mobilenet-v2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ xnn_subgraph_t QS8MobileNetV2() {
subgraph, xnn_datatype_fp32,
v0_dims.size(), v0_dims.data(),
/*data=*/nullptr,
1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0);
0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0);
if (status != xnn_status_success) {
std::cerr << "failed to create tensor v0" << std::endl;
return nullptr;
Expand Down Expand Up @@ -963,7 +963,7 @@ xnn_subgraph_t QS8MobileNetV2() {
subgraph, xnn_datatype_fp32,
v66_dims.size(), v66_dims.data(),
/*data=*/nullptr,
0, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v66);
1, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v66);
if (status != xnn_status_success) {
std::cerr << "failed to create tensor v66" << std::endl;
return nullptr;
Expand Down
86 changes: 86 additions & 0 deletions bench/qp8-f32-qb4w-gemm.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Copyright 2023 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// Auto-generated file. Do not edit!
// Specification: test/qp8-f32-qb4w-gemm-minmax.yaml
// Generator: tools/generate-gemm-test.py

#include <benchmark/benchmark.h>
#include "bench/gemm-benchmark.h"
#include "bench/utils.h"
#include "xnnpack/common.h"
#include "xnnpack/gemm.h"
#include "xnnpack/isa-checks.h"
#include "xnnpack/microfnptr.h"
#include "xnnpack/microparams-init.h"
#include "xnnpack/pack.h"
#include "xnnpack/packw.h"


// Register NEON I8MM qb4w GEMM benchmarks (KleidiAI-backed, AArch64 only).
#if XNN_ENABLE_ARM_I8MM && XNN_ARCH_ARM64
#if XNN_ENABLE_KLEIDIAI
  // 4x8 tile, kr=16, sr=2; LHS packed with mr_packed=4.
  static void qp8_f32_qb4w_gemm_minmax_ukernel_4x8c16s2__aarch64_neoni8mm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qp8_f32_qb4w_gemm_minmax_ukernel_4x8c16s2__neoni8mm,
      xnn_init_f32_qb4w_minmax_scalar_params,
      xnn_pack_kai_qb4_weights_and_biases,
      xnn_packed_stride_kai_qb4_weights_and_biases,
      /*mr=*/4, /*nr=*/8, /*kr=*/16, /*sr=*/2,
      /*mr_packed=*/4,
      benchmark::utils::CheckNEONI8MM);
  }

  BENCHMARK_GEMM_BL(qp8_f32_qb4w_gemm_minmax_ukernel_4x8c16s2__aarch64_neoni8mm)

  // 8x4 "mstep2" tile: the kernel consumes mr=8 rows, but the LHS is packed
  // in groups of mr_packed=4.
  static void qp8_f32_qb4w_gemm_minmax_ukernel_8x4c16s2__aarch64_neoni8mm_mstep2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qp8_f32_qb4w_gemm_minmax_ukernel_8x4c16s2__neoni8mm_mstep2,
      xnn_init_f32_qb4w_minmax_scalar_params,
      xnn_pack_kai_qb4_weights_and_biases,
      xnn_packed_stride_kai_qb4_weights_and_biases,
      /*mr=*/8, /*nr=*/4, /*kr=*/16, /*sr=*/2,
      /*mr_packed=*/4,
      benchmark::utils::CheckNEONI8MM);
  }

  BENCHMARK_GEMM_BL(qp8_f32_qb4w_gemm_minmax_ukernel_8x4c16s2__aarch64_neoni8mm_mstep2)
#endif  // XNN_ENABLE_KLEIDIAI
#endif  // XNN_ENABLE_ARM_I8MM && XNN_ARCH_ARM64


// Register NEON dot-product qb4w GEMM benchmarks (KleidiAI-backed,
// AArch64 only). Both kernels are single-row (mr=1, mr_packed=1).
#if XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64
#if XNN_ENABLE_KLEIDIAI
  // 1x4 tile, kr=16, sr=2.
  static void qp8_f32_qb4w_gemm_minmax_ukernel_1x4c16s2__aarch64_neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qp8_f32_qb4w_gemm_minmax_ukernel_1x4c16s2__aarch64_neondot,
      xnn_init_f32_qb4w_minmax_scalar_params,
      xnn_pack_kai_qb4_weights_and_biases,
      xnn_packed_stride_kai_qb4_weights_and_biases,
      /*mr=*/1, /*nr=*/4, /*kr=*/16, /*sr=*/2,
      /*mr_packed=*/1,
      benchmark::utils::CheckNEONDOT);
  }

  BENCHMARK_GEMM_BL(qp8_f32_qb4w_gemm_minmax_ukernel_1x4c16s2__aarch64_neondot)

  // 1x8 tile, kr=16, sr=2.
  static void qp8_f32_qb4w_gemm_minmax_ukernel_1x8c16s2__aarch64_neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qp8_f32_qb4w_gemm_minmax_ukernel_1x8c16s2__aarch64_neondot,
      xnn_init_f32_qb4w_minmax_scalar_params,
      xnn_pack_kai_qb4_weights_and_biases,
      xnn_packed_stride_kai_qb4_weights_and_biases,
      /*mr=*/1, /*nr=*/8, /*kr=*/16, /*sr=*/2,
      /*mr_packed=*/1,
      benchmark::utils::CheckNEONDOT);
  }

  BENCHMARK_GEMM_BL(qp8_f32_qb4w_gemm_minmax_ukernel_1x8c16s2__aarch64_neondot)
#endif  // XNN_ENABLE_KLEIDIAI
#endif  // XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64


#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
8 changes: 4 additions & 4 deletions build_params.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -436,17 +436,15 @@ XNNPACK_PARAMS_FOR_ARCH = {
"//build_config:aarch64": ["-march=armv8.2-a+dotprod"],
"//conditions:default": [],
}),
extra_deps = xnnpack_if_kleidiai_enabled([
"@KleidiAI//kai/ukernels/matmul:clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod",
"@KleidiAI//kai/ukernels/matmul:clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod",
]),
),
"neondot_aarch64": _create_params(
cond = "//:arm_aarch64_dotprod_enabled",
copts = ["-march=armv8.2-a+dotprod"],
extra_deps = xnnpack_if_kleidiai_enabled([
"@KleidiAI//kai/ukernels/matmul:clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod",
"@KleidiAI//kai/ukernels/matmul:clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod",
"@KleidiAI//kai/ukernels/matmul:clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod",
"@KleidiAI//kai/ukernels/matmul:clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod",
]),
),
"neoni8mm": _create_params(
Expand All @@ -457,6 +455,8 @@ XNNPACK_PARAMS_FOR_ARCH = {
"@KleidiAI//kai/ukernels/matmul:clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm",
"@KleidiAI//kai/ukernels/matmul:clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm",
"@KleidiAI//kai/ukernels/matmul:clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm",
"@KleidiAI//kai/ukernels/matmul:clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm",
"@KleidiAI//kai/ukernels/matmul:clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm",
]),
),
"aarch32": _create_params(
Expand Down
2 changes: 2 additions & 0 deletions cmake/gen/neondot_aarch64_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@


SET(PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS
src/qp8-f32-qb4w-gemm/qp8-f32-qb4w-gemm-minmax-1x4c16s2-aarch64-neondot.c
src/qp8-f32-qb4w-gemm/qp8-f32-qb4w-gemm-minmax-1x8c16s2-aarch64-neondot.c
src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-1x8c16s2-aarch64-neondot.c)

SET(NON_PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS
Expand Down
2 changes: 2 additions & 0 deletions cmake/gen/neoni8mm_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ SET(PROD_NEONI8MM_MICROKERNEL_SRCS
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c8-minmax-neoni8mm.c
src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x16c8-minmax-neoni8mm.c
src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c8-minmax-neoni8mm.c
src/qp8-f32-qb4w-gemm/qp8-f32-qb4w-gemm-minmax-8x4c16s2-mstep2-neoni8mm.c
src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-8x8c16s2-mstep2-neoni8mm.c
src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c8-minmax-fp32-neoni8mm.c
src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c8-minmax-fp32-neoni8mm.c
Expand Down Expand Up @@ -185,6 +186,7 @@ SET(NON_PROD_NEONI8MM_MICROKERNEL_SRCS
src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-6x16c8-minmax-neoni8mm.c
src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-8x8c8-minmax-neoni8mm.c
src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-8x16c8-minmax-neoni8mm.c
src/qp8-f32-qb4w-gemm/qp8-f32-qb4w-gemm-minmax-4x8c16s2-neoni8mm.c
src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-4x4c16s2-neoni8mm.c
src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-4x8c16s2-neoni8mm.c
src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-8x4c16s2-mstep2-neoni8mm.c
Expand Down
4 changes: 4 additions & 0 deletions cmake/gen/rvv_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,11 @@ SET(PROD_RVV_MICROKERNEL_SRCS
src/f32-vrnd/gen/f32-vrndu-rvv-u4v.c
src/f32-vrnd/gen/f32-vrndz-rvv-u4v.c
src/f32-vrsqrt/gen/f32-vrsqrt-rvv-rsqrt-u4v.c
src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u2v.c
src/qs8-vcvt/gen/qs8-vcvt-rvv-u2v.c
src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u2v.c
src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u2v.c
src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u2v.c
src/qu8-vcvt/gen/qu8-vcvt-rvv-u2v.c
src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c
src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c
Expand Down Expand Up @@ -180,9 +182,11 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x4v-minmax-rvv.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x4v-minmax-rvv.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x4v-minmax-rvv.c
src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u1v.c
src/qs8-vcvt/gen/qs8-vcvt-rvv-u1v.c
src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u1v.c
src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u1v.c
src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u1v.c
src/qu8-vcvt/gen/qu8-vcvt-rvv-u1v.c
src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c
src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c
Expand Down
2 changes: 2 additions & 0 deletions gen/neondot_aarch64_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ Auto-generated file. Do not edit!
"""

PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS = [
"src/qp8-f32-qb4w-gemm/qp8-f32-qb4w-gemm-minmax-1x4c16s2-aarch64-neondot.c",
"src/qp8-f32-qb4w-gemm/qp8-f32-qb4w-gemm-minmax-1x8c16s2-aarch64-neondot.c",
"src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-1x8c16s2-aarch64-neondot.c",
]

Expand Down
2 changes: 2 additions & 0 deletions gen/neoni8mm_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ PROD_NEONI8MM_MICROKERNEL_SRCS = [
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c8-minmax-neoni8mm.c",
"src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x16c8-minmax-neoni8mm.c",
"src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c8-minmax-neoni8mm.c",
"src/qp8-f32-qb4w-gemm/qp8-f32-qb4w-gemm-minmax-8x4c16s2-mstep2-neoni8mm.c",
"src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-8x8c16s2-mstep2-neoni8mm.c",
"src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c8-minmax-fp32-neoni8mm.c",
"src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c8-minmax-fp32-neoni8mm.c",
Expand Down Expand Up @@ -182,6 +183,7 @@ NON_PROD_NEONI8MM_MICROKERNEL_SRCS = [
"src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-6x16c8-minmax-neoni8mm.c",
"src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-8x8c8-minmax-neoni8mm.c",
"src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-8x16c8-minmax-neoni8mm.c",
"src/qp8-f32-qb4w-gemm/qp8-f32-qb4w-gemm-minmax-4x8c16s2-neoni8mm.c",
"src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-4x4c16s2-neoni8mm.c",
"src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-4x8c16s2-neoni8mm.c",
"src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-8x4c16s2-mstep2-neoni8mm.c",
Expand Down
Loading

0 comments on commit 0de259e

Please sign in to comment.