From c9c1afb30d2dfa66afdbb18ddad2858f8239da46 Mon Sep 17 00:00:00 2001
From: Charles Xu
Date: Thu, 17 Oct 2024 09:17:35 +0200
Subject: [PATCH] backend-cpu: add online flow for aarch64 Q4_0 GEMV/GEMM
 kernels

---
 common/arg.cpp                       |   7 +
 common/common.cpp                    |   3 +-
 common/common.h                      |   2 +
 examples/llama-bench/llama-bench.cpp | 196 +++++++++++++++------------
 ggml/include/ggml-backend.h          |   1 +
 ggml/src/ggml-aarch64.c              |  99 ++++++++++++++
 ggml/src/ggml-aarch64.h              |   2 +
 ggml/src/ggml-backend.cpp            |  26 ++++
 include/llama.h                      |  11 +-
 src/llama.cpp                        |   4 +
 10 files changed, 261 insertions(+), 90 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index d6a8e1f6ff0bf8..39ba41b3b16f4c 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1993,6 +1993,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             common_log_set_timestamps(common_log_main(), true);
         }
     ).set_env("LLAMA_LOG_TIMESTAMPS"));
+    add_opt(common_arg(
+        {"-rtrp", "--runtime-repack"},
+        string_format("Allow runtime requantization and repacking of Q4_0 to enable optimized GEMM and GEMV kernels (default: %d)", params.runtime_repack),
+        [](common_params & params) {
+            params.runtime_repack = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
 
     return ctx_arg;
 }
diff --git a/common/common.cpp b/common/common.cpp
index c08f01b429056e..fa950f98eb03a3 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -996,7 +996,7 @@ struct llama_model_params common_model_params_to_llama(const common_params & par
     mparams.main_gpu        = params.main_gpu;
     mparams.split_mode      = params.split_mode;
     mparams.tensor_split    = params.tensor_split;
-    mparams.use_mmap        = params.use_mmap;
+    mparams.use_mmap        = params.use_mmap && !params.runtime_repack;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
     if (params.kv_overrides.empty()) {
@@ -1066,6 +1066,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.offload_kqv       = !params.no_kv_offload;
     cparams.flash_attn        = params.flash_attn;
     cparams.no_perf           = params.no_perf;
+    cparams.runtime_repack    = params.runtime_repack;
 
     if (params.reranking) {
         cparams.embeddings = true;
diff --git a/common/common.h b/common/common.h
index 5ca8fd391ab742..b774dae1a37a40 100644
--- a/common/common.h
+++ b/common/common.h
@@ -265,6 +265,8 @@ struct common_params {
     bool warmup         = true;  // warmup run
     bool check_tensors  = false; // validate tensor data
 
+    bool runtime_repack = false; // runtime repack weight for optimized kernels
+
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
 
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index c22bdedcfa231e..95f6eb230be1a4 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -244,6 +244,7 @@ struct cmd_params {
     std::vector<std::vector<float>> tensor_split;
     std::vector<bool> use_mmap;
     std::vector<bool> embeddings;
+    std::vector<bool> runtime_repack;
     ggml_numa_strategy numa;
     int reps;
     ggml_sched_priority prio;
@@ -276,6 +277,7 @@ static const cmd_params cmd_params_defaults = {
     /* tensor_split         */ {std::vector<float>(llama_max_devices(), 0.0f)},
     /* use_mmap             */ {true},
     /* embeddings           */ {false},
+    /* runtime_repack       */ {false},
     /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps                 */ 5,
     /* prio                 */ GGML_SCHED_PRIO_NORMAL,
@@ -314,6 +316,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -mmp, --mmap <0|1>                        (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
    printf("  --numa <distribute|isolate|numactl>       (default: disabled)\n");
     printf("  -embd, --embeddings <0|1>                 (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
+    printf("  -rtrp, --runtime_repack <0|1>             (default: %s)\n", join(cmd_params_defaults.runtime_repack, ",").c_str());
     printf("  -ts, --tensor-split <ts0/ts1/..>          (default: 0)\n");
     printf("  -r, --repetitions <n>                     (default: %d)\n", cmd_params_defaults.reps);
     printf("  --prio <0|1|2|3>                          (default: %d)\n", cmd_params_defaults.prio);
@@ -573,6 +576,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<bool>(argv[i], split_delim);
             params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
+        } else if (arg == "-rtrp" || arg == "--runtime_repack") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.runtime_repack.insert(params.runtime_repack.end(), p.begin(), p.end());
         } else if (arg == "-ts" || arg == "--tensor-split") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
@@ -641,27 +651,28 @@
     }
 
     // set defaults
-    if (params.model.empty())        { params.model = cmd_params_defaults.model; }
-    if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
-    if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
-    if (params.n_pg.empty())         { params.n_pg = cmd_params_defaults.n_pg; }
-    if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
-    if (params.n_ubatch.empty())     { params.n_ubatch = cmd_params_defaults.n_ubatch; }
-    if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
-    if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
-    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
-    if (params.rpc_servers.empty())  { params.rpc_servers = cmd_params_defaults.rpc_servers; }
-    if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
-    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
-    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
-    if (params.flash_attn.empty())   { params.flash_attn = cmd_params_defaults.flash_attn; }
-    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
-    if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
-    if (params.embeddings.empty())   { params.embeddings = cmd_params_defaults.embeddings; }
-    if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
-    if (params.cpu_mask.empty())     { params.cpu_mask = cmd_params_defaults.cpu_mask; }
-    if (params.cpu_strict.empty())   { params.cpu_strict = cmd_params_defaults.cpu_strict; }
-    if (params.poll.empty())         { params.poll = cmd_params_defaults.poll; }
+    if (params.model.empty())         { params.model = cmd_params_defaults.model; }
+    if (params.n_prompt.empty())      { params.n_prompt = cmd_params_defaults.n_prompt; }
+    if (params.n_gen.empty())         { params.n_gen = cmd_params_defaults.n_gen; }
+    if (params.n_pg.empty())          { params.n_pg = cmd_params_defaults.n_pg; }
+    if (params.n_batch.empty())       { params.n_batch = cmd_params_defaults.n_batch; }
+    if (params.n_ubatch.empty())      { params.n_ubatch = cmd_params_defaults.n_ubatch; }
+    if (params.type_k.empty())        { params.type_k = cmd_params_defaults.type_k; }
+    if (params.type_v.empty())        { params.type_v = cmd_params_defaults.type_v; }
+    if (params.n_gpu_layers.empty())  { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
+    if (params.rpc_servers.empty())   { params.rpc_servers = cmd_params_defaults.rpc_servers; }
+    if (params.split_mode.empty())    { params.split_mode = cmd_params_defaults.split_mode; }
+    if (params.main_gpu.empty())      { params.main_gpu = cmd_params_defaults.main_gpu; }
+    if (params.no_kv_offload.empty()) { params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
+    if (params.flash_attn.empty())    { params.flash_attn = cmd_params_defaults.flash_attn; }
+    if (params.tensor_split.empty())  { params.tensor_split = cmd_params_defaults.tensor_split; }
+    if (params.use_mmap.empty())      { params.use_mmap = cmd_params_defaults.use_mmap; }
+    if (params.embeddings.empty())    { params.embeddings = cmd_params_defaults.embeddings; }
+    if (params.runtime_repack.empty()){ params.runtime_repack = cmd_params_defaults.runtime_repack; }
+    if (params.n_threads.empty())     { params.n_threads = cmd_params_defaults.n_threads; }
+    if (params.cpu_mask.empty())      { params.cpu_mask = cmd_params_defaults.cpu_mask; }
+    if (params.cpu_strict.empty())    { params.cpu_strict = cmd_params_defaults.cpu_strict; }
+    if (params.poll.empty())          { params.poll = cmd_params_defaults.poll; }
 
     return params;
 }
@@ -687,6 +698,7 @@ struct cmd_params_instance {
     std::vector<float> tensor_split;
     bool use_mmap;
     bool embeddings;
+    bool runtime_repack;
 
     llama_model_params to_llama_mparams() const {
         llama_model_params mparams = llama_model_default_params();
@@ -724,6 +736,7 @@ struct cmd_params_instance {
         cparams.offload_kqv = !no_kv_offload;
         cparams.flash_attn  = flash_attn;
         cparams.embeddings  = embeddings;
+        cparams.runtime_repack = runtime_repack;
 
         return cparams;
     }
@@ -741,6 +754,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & ts : params.tensor_split)
     for (const auto & mmp : params.use_mmap)
     for (const auto & embd : params.embeddings)
+    for (const auto & rtrp : params.runtime_repack)
     for (const auto & nb : params.n_batch)
     for (const auto & nub : params.n_ubatch)
     for (const auto & tk : params.type_k)
@@ -756,26 +770,27 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 continue;
             }
            cmd_params_instance instance = {
-                /* .model        = */ m,
-                /* .n_prompt     = */ n_prompt,
-                /* .n_gen        = */ 0,
-                /* .n_batch      = */ nb,
-                /* .n_ubatch     = */ nub,
-                /* .type_k       = */ tk,
-                /* .type_v       = */ tv,
-                /* .n_threads    = */ nt,
-                /* .cpu_mask     = */ cm,
-                /* .cpu_strict   = */ cs,
-                /* .poll         = */ pl,
-                /* .n_gpu_layers = */ nl,
-                /* .rpc_servers  = */ rpc,
-                /* .split_mode   = */ sm,
-                /* .main_gpu     = */ mg,
-                /* .no_kv_offload= */ nkvo,
-                /* .flash_attn   = */ fa,
-                /* .tensor_split = */ ts,
-                /* .use_mmap     = */ mmp,
-                /* .embeddings   = */ embd,
+                /* .model         = */ m,
+                /* .n_prompt      = */ n_prompt,
+                /* .n_gen         = */ 0,
+                /* .n_batch       = */ nb,
+                /* .n_ubatch      = */ nub,
+                /* .type_k        = */ tk,
+                /* .type_v        = */ tv,
+                /* .n_threads     = */ nt,
+                /* .cpu_mask      = */ cm,
+                /* .cpu_strict    = */ cs,
+                /* .poll          = */ pl,
+                /* .n_gpu_layers  = */ nl,
+                /* .rpc_servers   = */ rpc,
+                /* .split_mode    = */ sm,
+                /* .main_gpu      = */ mg,
+                /* .no_kv_offload = */ nkvo,
+                /* .flash_attn    = */ fa,
+                /* .tensor_split  = */ ts,
+                /* .use_mmap      = */ static_cast<bool>(mmp) && !static_cast<bool>(rtrp),
+                /* .embeddings    = */ embd,
+                /* .runtime_repack= */ rtrp,
             };
             instances.push_back(instance);
         }
@@ -785,26 +800,27 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 continue;
             }
             cmd_params_instance instance = {
-                /* .model        = */ m,
-                /* .n_prompt     = */ 0,
-                /* .n_gen        = */ n_gen,
-                /* .n_batch      = */ nb,
-                /* .n_ubatch     = */ nub,
-                /* .type_k       = */ tk,
-                /* .type_v       = */ tv,
-                /* .n_threads    = */ nt,
-                /* .cpu_mask     = */ cm,
-                /* .cpu_strict   = */ cs,
-                /* .poll         = */ pl,
-                /* .n_gpu_layers = */ nl,
-                /* .rpc_servers  = */ rpc,
-                /* .split_mode   = */ sm,
-                /* .main_gpu     = */ mg,
-                /* .no_kv_offload= */ nkvo,
-                /* .flash_attn   = */ fa,
-                /* .tensor_split = */ ts,
-                /* .use_mmap     = */ mmp,
-                /* .embeddings   = */ embd,
+                /* .model         = */ m,
+                /* .n_prompt      = */ 0,
+                /* .n_gen         = */ n_gen,
+                /* .n_batch       = */ nb,
+                /* .n_ubatch      = */ nub,
+                /* .type_k        = */ tk,
+                /* .type_v        = */ tv,
+                /* .n_threads     = */ nt,
+                /* .cpu_mask      = */ cm,
+                /* .cpu_strict    = */ cs,
+                /* .poll          = */ pl,
+                /* .n_gpu_layers  = */ nl,
+                /* .rpc_servers   = */ rpc,
+                /* .split_mode    = */ sm,
+                /* .main_gpu      = */ mg,
+                /* .no_kv_offload = */ nkvo,
+                /* .flash_attn    = */ fa,
+                /* .tensor_split  = */ ts,
+                /* .use_mmap      = */ static_cast<bool>(mmp) && !static_cast<bool>(rtrp),
+                /* .embeddings    = */ embd,
+                /* .runtime_repack= */ rtrp,
             };
             instances.push_back(instance);
         }
@@ -814,26 +830,27 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 continue;
             }
             cmd_params_instance instance = {
-                /* .model        = */ m,
-                /* .n_prompt     = */ n_pg.first,
-                /* .n_gen        = */ n_pg.second,
-                /* .n_batch      = */ nb,
-                /* .n_ubatch     = */ nub,
-                /* .type_k       = */ tk,
-                /* .type_v       = */ tv,
-                /* .n_threads    = */ nt,
-                /* .cpu_mask     = */ cm,
-                /* .cpu_strict   = */ cs,
-                /* .poll         = */ pl,
-                /* .n_gpu_layers = */ nl,
-                /* .rpc_servers  = */ rpc,
-                /* .split_mode   = */ sm,
-                /* .main_gpu     = */ mg,
-                /* .no_kv_offload= */ nkvo,
-                /* .flash_attn   = */ fa,
-                /* .tensor_split = */ ts,
-                /* .use_mmap     = */ mmp,
-                /* .embeddings   = */ embd,
+                /* .model         = */ m,
+                /* .n_prompt      = */ n_pg.first,
+                /* .n_gen         = */ n_pg.second,
+                /* .n_batch       = */ nb,
+                /* .n_ubatch      = */ nub,
+                /* .type_k        = */ tk,
+                /* .type_v        = */ tv,
+                /* .n_threads     = */ nt,
+                /* .cpu_mask      = */ cm,
+                /* .cpu_strict    = */ cs,
+                /* .poll          = */ pl,
+                /* .n_gpu_layers  = */ nl,
+                /* .rpc_servers   = */ rpc,
+                /* .split_mode    = */ sm,
+                /* .main_gpu      = */ mg,
+                /* .no_kv_offload = */ nkvo,
+                /* .flash_attn    = */ fa,
+                /* .tensor_split  = */ ts,
+                /* .use_mmap      = */ static_cast<bool>(mmp) && !static_cast<bool>(rtrp),
+                /* .embeddings    = */ embd,
+                /* .runtime_repack= */ rtrp,
             };
             instances.push_back(instance);
         }
@@ -875,6 +892,7 @@ struct test {
     std::vector<float> tensor_split;
     bool use_mmap;
     bool embeddings;
+    bool runtime_repack;
     int n_prompt;
     int n_gen;
     std::string test_time;
@@ -904,6 +922,7 @@ struct test {
        tensor_split = inst.tensor_split;
         use_mmap = inst.use_mmap;
         embeddings = inst.embeddings;
+        runtime_repack = inst.runtime_repack;
         n_prompt = inst.n_prompt;
         n_gen = inst.n_gen;
         // RFC 3339 date-time format
@@ -974,7 +993,7 @@ struct test {
             "type_k", "type_v",
             "n_gpu_layers", "split_mode",
             "main_gpu", "no_kv_offload", "flash_attn",
-            "tensor_split", "use_mmap", "embeddings",
+            "tensor_split", "use_mmap", "embeddings", "runtime_repack",
             "n_prompt", "n_gen", "test_time",
             "avg_ns", "stddev_ns",
             "avg_ts", "stddev_ts",
@@ -996,7 +1015,7 @@
         if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
             field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" ||
             field == "no_kv_offload" || field == "cpu_strict" ||
-            field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
+            field == "flash_attn" || field == "use_mmap" || field == "embeddings" || field == "runtime_repack") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -1032,7 +1051,7 @@ struct test {
            ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), split_mode_str(split_mode),
             std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
-            tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
+            tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), std::to_string(runtime_repack),
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
             std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -1220,6 +1239,9 @@ struct markdown_printer : public printer {
         if (field == "test") {
            return 13;
         }
+        if (field == "runtime_repack") {
+            return 6;
+        }
 
         int width = std::max((int)field.length(), 10);
 
@@ -1254,6 +1276,9 @@ struct markdown_printer : public printer {
         if (field == "tensor_split") {
             return "ts";
         }
+        if (field == "runtime_repack") {
+            return "repack";
+        }
         return field;
     }
 
@@ -1312,6 +1337,9 @@ struct markdown_printer : public printer {
         if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
             fields.emplace_back("embeddings");
         }
+        if (params.runtime_repack.size() > 1 || params.runtime_repack != cmd_params_defaults.runtime_repack) {
+            fields.emplace_back("runtime_repack");
+        }
         fields.emplace_back("test");
         fields.emplace_back("t/s");
 
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 5933b8e8f63ee2..4539c9c2e7a4f7 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -310,6 +310,7 @@ extern "C" {
     GGML_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
     GGML_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
     GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
+    GGML_API void ggml_backend_cpu_set_runtime_repack(ggml_backend_t backend_cpu, bool runtime_repack);
 
     // Create a backend buffer from an existing pointer
     GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index b27f411474f4c4..700e66a02d253d 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -3207,3 +3207,102 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         }
     }
 }
+
+static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor *t, int interleave_block, uint8_t **pmem, size_t *psize) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+    GGML_ASSERT(t->ne[0] % 8 == 0);
+    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+
+    // Do in-place transformation. Allocate scratch buffer
+    size_t size = sizeof(block_q4_0x4) * t->ne[0] / QK4_0;
+    if (size > *psize) {
+        uint8_t *new_mem = realloc(*pmem, size);
+        if (!new_mem) {
+            return -1;
+        }
+        *pmem = new_mem;
+        *psize = size;
+    }
+    block_q4_0x4 *dst = (block_q4_0x4*) *pmem;
+    block_q4_0 *src = (block_q4_0*) t->data;
+    block_q4_0 dst_tmp[4];
+    int n = t->ne[0];
+    int nrow = t->ne[1]; // Number of rows
+    int nrows_interleaved = 4;
+    int nblocks = t->ne[0] / QK4_0;
+    for (int b = 0; b < (nrow * n); b += nrows_interleaved * n) {
+        int cnt = 0;
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++ ) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            dst[cnt++] = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
+        }
+        memcpy(src, dst, size);
+        src += cnt * 4;
+    }
+    return 0;
+}
+
+static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, uint8_t **pmem, size_t *psize) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+    GGML_ASSERT(t->ne[0] % 8 == 0);
+    GGML_ASSERT(interleave_block == 8);
+
+    // Do in-place transformation. Allocate scratch buffer
+    size_t size = sizeof(block_q4_0x8) * t->ne[0] / QK4_0;
+    if (size > *psize) {
+        uint8_t *new_mem = realloc(*pmem, size);
+        if (!new_mem) {
+            return -1;
+        }
+        *pmem = new_mem;
+        *psize = size;
+    }
+    block_q4_0x8 *dst = (block_q4_0x8*) *pmem;
+    block_q4_0 *src = (block_q4_0*) t->data;
+    block_q4_0 dst_tmp[8];
+    int n = t->ne[0];
+    int nrow = t->ne[1]; // Number of rows
+    int nrows_interleaved = 8;
+    int nblocks = t->ne[0] / QK4_0;
+    for (int b = 0; b < (nrow * n); b += nrows_interleaved * n) {
+        int cnt = 0;
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++ ) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            dst[cnt++] = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
+        }
+        memcpy(src, dst, size);
+        src += cnt * 4;
+    }
+    return 0;
+}
+
+// Prepare for optimized kernels if applicable
+void ggml_prepare_optimal_kernel(struct ggml_tensor *cur, uint8_t **pmem, size_t *psize) {
+    UNUSED(cur);
+    UNUSED(pmem);
+    UNUSED(psize);
+
+#if defined(__ARM_ARCH)
+    if (cur->type == GGML_TYPE_Q4_0) {
+        if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
+            if (repack_q4_0_to_q4_0_8_bl(cur, 8, pmem, psize) == 0) {
+                cur->type = GGML_TYPE_Q4_0_8_8;
+            }
+        }
+        else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+            if (repack_q4_0_to_q4_0_4_bl(cur, 8, pmem, psize) == 0) {
+                cur->type = GGML_TYPE_Q4_0_4_8;
+            }
+        }
+        else if (ggml_cpu_has_neon()) {
+            if (repack_q4_0_to_q4_0_4_bl(cur, 4, pmem, psize) == 0) {
+                cur->type = GGML_TYPE_Q4_0_4_4;
+            }
+        }
+    }
+#endif
+}
diff --git a/ggml/src/ggml-aarch64.h b/ggml/src/ggml-aarch64.h
index 517babaf1691bb..f68d66f6dd43e0 100644
--- a/ggml/src/ggml-aarch64.h
+++ b/ggml/src/ggml-aarch64.h
@@ -33,6 +33,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
+void ggml_prepare_optimal_kernel(struct ggml_tensor *cur, uint8_t **pmem, size_t *psize);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index a3bc79a46b8711..4313f55e1faae0 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -11,6 +11,7 @@
 #include "ggml-backend-impl.h"
 #include "ggml-alloc.h"
 #include "ggml-impl.h"
+#include "ggml-aarch64.h"
 
 #include <assert.h>
 #include <limits.h>
@@ -882,6 +883,10 @@ struct ggml_backend_cpu_context {
     uint8_t *           work_data;
     size_t              work_size;
 
+    bool                runtime_repack;
+    uint8_t *           scratch_memory;
+    size_t              scratch_size;
+
     ggml_abort_callback abort_callback;
     void *              abort_callback_data;
 };
@@ -895,6 +900,7 @@ static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
 static void ggml_backend_cpu_free(ggml_backend_t backend) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
     delete[] cpu_ctx->work_data;
+    free(cpu_ctx->scratch_memory); // free the scratch memory allocated by C module
     delete cpu_ctx;
     delete backend;
 }
@@ -952,6 +958,16 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe
 static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
+    if (cpu_ctx->runtime_repack) {
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            struct ggml_tensor * node = cgraph->nodes[i];
+            if (node->op == GGML_OP_MUL_MAT && node->src[0]->type == GGML_TYPE_Q4_0) {
+                // Prepare for optimized kernels if applicable.
+                ggml_prepare_optimal_kernel(node->src[0], &cpu_ctx->scratch_memory, &cpu_ctx->scratch_size);
+            }
+        }
+    }
+
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
     if (cpu_ctx->work_size < cplan.work_size) {
@@ -1008,6 +1024,9 @@ ggml_backend_t ggml_backend_cpu_init(void) {
     ctx->work_size           = 0;
     ctx->abort_callback      = NULL;
     ctx->abort_callback_data = NULL;
+    ctx->runtime_repack      = false;
+    ctx->scratch_memory      = NULL;
+    ctx->scratch_size        = 0;
 
     ggml_backend_t cpu_backend = new ggml_backend {
         /* .guid      = */ ggml_backend_cpu_guid(),
@@ -1055,6 +1074,13 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_
     ctx->abort_callback_data = abort_callback_data;
 }
 
+void ggml_backend_cpu_set_runtime_repack(ggml_backend_t backend_cpu, bool runtime_repack) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->runtime_repack = runtime_repack;
+}
+
 ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
     GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
     return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
diff --git a/include/llama.h b/include/llama.h
index 02bc7f087c62b7..79f2a4a26ffb19 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -341,11 +341,12 @@ extern "C" {
 
         // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
         // TODO: move at the end of the struct
-        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embeddings;  // if true, extract embeddings (together with logits)
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        bool no_perf;     // whether to measure performance timings
+        bool logits_all;     // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        bool embeddings;     // if true, extract embeddings (together with logits)
+        bool offload_kqv;    // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;     // whether to use flash attention [EXPERIMENTAL]
+        bool no_perf;        // whether to measure performance timings
+        bool runtime_repack; // runtime repack weight for optimized kernels
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
diff --git a/src/llama.cpp b/src/llama.cpp
index 68479c6dba0495..ce66b7d84833d3 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2574,6 +2574,7 @@ struct llama_cparams {
     bool offload_kqv;
     bool flash_attn;
     bool no_perf;
+    bool runtime_repack;
 
     enum llama_pooling_type pooling_type;
 
@@ -17107,6 +17108,7 @@ static void llama_graph_compute(
         ggml_threadpool * threadpool) {
     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
+        ggml_backend_cpu_set_runtime_repack(lctx.backend_cpu, lctx.cparams.runtime_repack);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
 
@@ -19034,6 +19036,7 @@ struct llama_context_params llama_context_default_params() {
         /*.offload_kqv                 =*/ true,
         /*.flash_attn                  =*/ false,
        /*.no_perf                     =*/ true,
+        /*.runtime_repack              =*/ false,
         /*.abort_callback              =*/ nullptr,
         /*.abort_callback_data         =*/ nullptr,
     };
@@ -19292,6 +19295,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.flash_attn       = params.flash_attn;
     cparams.no_perf          = params.no_perf;
     cparams.pooling_type     = params.pooling_type;
+    cparams.runtime_repack   = params.runtime_repack;
 
     cparams.n_ctx            = params.n_ctx           == 0    ? hparams.n_ctx_train           : params.n_ctx;
     cparams.rope_freq_base   = params.rope_freq_base  == 0.0f ? hparams.rope_freq_base_train  : params.rope_freq_base;