From 5c1980d8d4c4e0c0af77359f81cc44d90b3f250b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jan 2024 09:10:34 +0200 Subject: [PATCH 001/131] server : fix build + rename enums (#4870) --- examples/server/server.cpp | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1cca634d5461f..4a0714997f17d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -147,15 +147,15 @@ static std::vector base64_decode(const std::string & encoded_string) // parallel // -enum ServerState { - LOADING_MODEL, // Server is starting up, model not fully loaded yet - READY, // Server is ready and model is loaded - ERROR // An error occurred, load_model failed +enum server_state { + SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet + SERVER_STATE_READY, // Server is ready and model is loaded + SERVER_STATE_ERROR // An error occurred, load_model failed }; enum task_type { - COMPLETION_TASK, - CANCEL_TASK + TASK_TYPE_COMPLETION, + TASK_TYPE_CANCEL, }; struct task_server { @@ -1402,7 +1402,7 @@ struct llama_server_context task.data = std::move(data); task.infill_mode = infill; task.embedding_mode = embedding; - task.type = COMPLETION_TASK; + task.type = TASK_TYPE_COMPLETION; task.multitask_id = multitask_id; // when a completion task's prompt array is not a singleton, we split it into multiple requests @@ -1524,7 +1524,7 @@ struct llama_server_context std::unique_lock lock(mutex_tasks); task_server task; task.id = id_gen++; - task.type = CANCEL_TASK; + task.type = TASK_TYPE_CANCEL; task.target_id = task_id; queue_tasks.push_back(task); condition_tasks.notify_one(); @@ -1560,7 +1560,7 @@ struct llama_server_context queue_tasks.erase(queue_tasks.begin()); switch (task.type) { - case COMPLETION_TASK: { + case TASK_TYPE_COMPLETION: { llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); if (slot == nullptr) { @@ -1589,7 +1589,7 @@ struct llama_server_context break; } } break; - case CANCEL_TASK: { // release slot linked with the task id + case TASK_TYPE_CANCEL: { // release slot linked with the task id for (auto & slot : slots) { if (slot.task_id == task.target_id) @@ -2798,24 +2798,24 @@ int main(int argc, char **argv) httplib::Server svr; - std::atomic server_state{LOADING_MODEL}; + std::atomic state{SERVER_STATE_LOADING_MODEL}; svr.set_default_headers({{"Server", "llama.cpp"}, {"Access-Control-Allow-Origin", "*"}, {"Access-Control-Allow-Headers", "content-type"}}); svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) { - ServerState current_state = server_state.load(); + server_state current_state = state.load(); switch(current_state) { - case READY: + case SERVER_STATE_READY: res.set_content(R"({"status": "ok"})", "application/json"); res.status = 200; // HTTP OK break; - case LOADING_MODEL: + case SERVER_STATE_LOADING_MODEL: res.set_content(R"({"status": "loading model"})", "application/json"); res.status = 503; // HTTP Service Unavailable break; - case ERROR: + case SERVER_STATE_ERROR: res.set_content(R"({"status": "error", "error": "Model failed to load"})", "application/json"); res.status = 500; // HTTP Internal Server Error break; @@ -2891,7 +2891,7 @@ int main(int argc, char **argv) { if (!svr.listen_after_bind()) { - server_state.store(ERROR); + state.store(SERVER_STATE_ERROR); return 1; } @@ -2901,11 +2901,11 @@ int main(int argc, char **argv) // load the model if (!llama.load_model(params)) { - 
server_state.store(ERROR); + state.store(SERVER_STATE_ERROR); return 1; } else { llama.initialize(); - server_state.store(READY); + state.store(SERVER_STATE_READY); } // Middleware for API key validation From 7a9f75c38b5e62fe27b8a5a3ed823b4a3714024b Mon Sep 17 00:00:00 2001 From: Behnam M <58621210+ibehnam@users.noreply.github.com> Date: Thu, 11 Jan 2024 02:12:05 -0500 Subject: [PATCH 002/131] server : update readme to document the new `/health` endpoint (#4866) * added /health endpoint to the server * added comments on the additional /health endpoint * Better handling of server state When the model is being loaded, the server state is `LOADING_MODEL`. If model-loading fails, the server state becomes `ERROR`, otherwise it becomes `READY`. The `/health` endpoint provides more granular messages now according to the server_state value. * initialized server_state * fixed a typo * starting http server before initializing the model * Update server.cpp * Update server.cpp * fixes * fixes * fixes * made ServerState atomic and turned two-line spaces into one-line * updated `server` readme to document the `/health` endpoint too --- examples/server/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/server/README.md b/examples/server/README.md index d85a14f891bc4..dc27e72b9d084 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -110,6 +110,10 @@ node index.js ``` ## API Endpoints +- **GET** `/health`: Returns the current state of the server: + - `{"status": "loading model"}` if the model is still being loaded. + - `{"status": "error"}` if the model failed to load. + - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below. - **POST** `/completion`: Given a `prompt`, it returns the predicted completion. From f34432ca1e0b288129390c1db8296a82aaf1e632 Mon Sep 17 00:00:00 2001 From: Erik Scholz Date: Fri, 5 Jan 2024 16:00:00 +0100 Subject: [PATCH 003/131] fix : cuda order of synchronization when setting a buffer (ggml/679) * fix : cuda order of synchronization when setting a buffer * also sync before memcpy --------- Co-authored-by: slaren --- ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e26260a35bcbd..900f7ba4afac4 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -10184,8 +10184,8 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaDeviceSynchronize()); } static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { From c910e3c28a1caee8cb1398143d582dd9ab697e68 Mon Sep 17 00:00:00 2001 From: Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com> Date: Tue, 9 Jan 2024 11:16:37 -0500 Subject: [PATCH 004/131] Fix execlp call (ggml/689) NULL can be an integer constant expression with the value zero, in this case the behavior would be undefined because of an incorrect type being passed to the variable arguments. 
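For illustration only (not part of the patch): the message above refers to the rule that execlp() is variadic and its argument list must end with a null pointer of type char *. A bare NULL may expand to the integer constant 0, which has the wrong type when passed through the variadic part of the call, hence the explicit cast in the fix. A minimal sketch, assuming a POSIX system; the gdb arguments here are placeholders, not the exact ones used in ggml.c:

    #include <unistd.h>

    static void backtrace_with_gdb(const char * pid_str) {
        /* The final argument must be a null pointer of type char *.
         * Passing a plain NULL that expands to int 0 gives the variadic
         * call an argument of the wrong type, which is undefined behavior. */
        execlp("gdb", "gdb", "--batch", "-p", pid_str,
               "-ex", "bt",
               (char *) NULL);
    }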
--- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index adb387100780e..4a0ec4c44b209 100644 --- a/ggml.c +++ b/ggml.c @@ -132,7 +132,7 @@ void ggml_print_backtrace(void) { "-ex", "bt -frame-info source-and-location", "-ex", "detach", "-ex", "quit", - NULL); + (char *) NULL); } else { waitpid(pid, NULL, 0); } From e739de790921e6abbc8c70398303cacd74913f61 Mon Sep 17 00:00:00 2001 From: leejet Date: Wed, 10 Jan 2024 21:13:42 +0800 Subject: [PATCH 005/131] ggml : change GGML_MAX_NAME at compile time (ggml/682) * change GGML_MAX_NAME to 128 * allow controlling the value of GGML_MAX_NAME through external macro definitions --- ggml.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml.h b/ggml.h index c55e598b4fea3..b6cc85952ff8d 100644 --- a/ggml.h +++ b/ggml.h @@ -218,7 +218,9 @@ #define GGML_MAX_PARAMS 2048 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 10 +#ifndef GGML_MAX_NAME #define GGML_MAX_NAME 64 +#endif #define GGML_MAX_OP_PARAMS 64 #define GGML_DEFAULT_N_THREADS 4 #define GGML_DEFAULT_GRAPH_SIZE 2048 From 5362e43962e84d61e20b91f34991d7ccaef4a7d5 Mon Sep 17 00:00:00 2001 From: Jack Mousseau Date: Wed, 10 Jan 2024 06:19:19 -0800 Subject: [PATCH 006/131] metal : wrap each operation in debug group (ggml/690) --- ggml-metal.m | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml-metal.m b/ggml-metal.m index 6c2a8d04e5292..1619068244b49 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1067,6 +1067,8 @@ bool ggml_metal_graph_compute( GGML_ASSERT(!"unsupported op"); } + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst)]]; + const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ? src0->ne[2] : 0; @@ -2423,6 +2425,8 @@ bool ggml_metal_graph_compute( GGML_ASSERT(false); } } + + [encoder popDebugGroup]; } if (encoder != nil) { From f85a973aa139ae6f37e8b8e1966f1d278b5e0372 Mon Sep 17 00:00:00 2001 From: Timothy Cronin <40186632+4imothy@users.noreply.github.com> Date: Thu, 11 Jan 2024 02:27:48 -0500 Subject: [PATCH 007/131] ggml : remove ggml_cpy_inplace and ggml_cont_inplace (ggml/693) --- ggml.c | 30 ++++++++---------------------- ggml.h | 11 ----------- 2 files changed, 8 insertions(+), 33 deletions(-) diff --git a/ggml.c b/ggml.c index 4a0ec4c44b209..9c42a45e3d852 100644 --- a/ggml.c +++ b/ggml.c @@ -4311,13 +4311,13 @@ struct ggml_tensor * ggml_set_2d_inplace( static struct ggml_tensor * ggml_cpy_impl( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { + struct ggml_tensor * b) { GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (a->grad || b->grad) { + // inplace is false and either one have a grad is_node = true; } @@ -4341,29 +4341,21 @@ struct ggml_tensor * ggml_cpy( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - return ggml_cpy_impl(ctx, a, b, false); -} - -struct ggml_tensor * ggml_cpy_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_cpy_impl(ctx, a, b, true); + return ggml_cpy_impl(ctx, a, b); } // ggml_cont static struct ggml_tensor * ggml_cont_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { + struct ggml_tensor * a) { bool is_node = false; - if (!inplace && a->grad) { + if (a->grad) { is_node = true; } - struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); ggml_format_name(result, "%s (cont)", a->name); result->op = GGML_OP_CONT; @@ -4376,13 +4368,7 @@ static struct ggml_tensor * ggml_cont_impl( struct ggml_tensor * ggml_cont( struct ggml_context * ctx, struct ggml_tensor * a) { - return ggml_cont_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_cont_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_cont_impl(ctx, a, true); + return ggml_cont_impl(ctx, a); } // make contiguous, with new shape diff --git a/ggml.h b/ggml.h index b6cc85952ff8d..127dcef1dedaa 100644 --- a/ggml.h +++ b/ggml.h @@ -1163,22 +1163,11 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); - // a -> b, in-place, return view(b) - GGML_API struct ggml_tensor * ggml_cpy_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - // make contiguous GGML_API struct ggml_tensor * ggml_cont( struct ggml_context * ctx, struct ggml_tensor * a); - // make contiguous, in-place - GGML_API struct ggml_tensor * ggml_cont_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - // make contiguous, with new shape GGML_API struct ggml_tensor * ggml_cont_1d( struct ggml_context * ctx, From 3267c2abc72e34608224408ace3c048831050f97 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jan 2024 09:34:59 +0200 Subject: [PATCH 008/131] metal : fix deprecation warning (ggml/690) --- ggml-metal.m | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-metal.m b/ggml-metal.m index 1619068244b49..82d68cd1bf1f1 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1067,7 +1067,7 @@ bool ggml_metal_graph_compute( GGML_ASSERT(!"unsupported op"); } - [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst)]]; + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]]; const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; From 64802ec00d6383784a9dacf616095eaced16c3c3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jan 2024 09:39:08 +0200 Subject: [PATCH 009/131] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index fe7f3202f4bb6..3e2c579d575cd 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -f96711108d55bdbbd277e6be07204dce6a94fb93 +979cc23b345006504cfc1f67c0fdf627805e3319 From 2a7c94db5fb67b2f8882d2d16a11bf5d8d12d397 Mon Sep 17 00:00:00 2001 From: Paul Tsochantaris Date: Thu, 11 Jan 2024 14:31:52 +0000 Subject: [PATCH 010/131] metal : put encoder debug group behind a define (#4873) --- ggml-metal.m | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml-metal.m b/ggml-metal.m index 82d68cd1bf1f1..9698e5a79cc7d 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1067,7 +1067,9 @@ bool ggml_metal_graph_compute( GGML_ASSERT(!"unsupported op"); } +#ifndef GGML_METAL_NDEBUG [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]]; +#endif const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? 
src0->ne[1] : 0; @@ -2426,7 +2428,9 @@ bool ggml_metal_graph_compute( } } +#ifndef GGML_METAL_NDEBUG [encoder popDebugGroup]; +#endif } if (encoder != nil) { From 2f043328e3116724d15b915b5c6078e2df860a69 Mon Sep 17 00:00:00 2001 From: Isaac McFadyen Date: Thu, 11 Jan 2024 09:33:26 -0500 Subject: [PATCH 011/131] server : fix typo in model name (#4876) --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 4a0714997f17d..860e4e9ae3d81 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2515,7 +2515,7 @@ json oaicompat_completion_params_parse( // // https://platform.openai.com/docs/api-reference/chat/create llama_sampling_params default_sparams; - llama_params["model"] = json_value(body, "model", std::string("uknown")); + llama_params["model"] = json_value(body, "model", std::string("unknown")); llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); llama_params["temperature"] = json_value(body, "temperature", 0.0); From 43f76bf1c362c067fce46bb8dcda0b64af8a9533 Mon Sep 17 00:00:00 2001 From: pudepiedj Date: Thu, 11 Jan 2024 16:14:52 +0000 Subject: [PATCH 012/131] main : print total token count and tokens consumed so far (#4874) * Token count changes * Add show token count * Updating before PR * Two requested changes * Move param def posn --- common/common.cpp | 8 ++++++++ common/common.h | 2 +- examples/main/main.cpp | 6 +++++- llama.cpp | 2 +- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 4e89fe516e0a9..bfcd6d4dfe5d1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -630,6 +630,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.ppl_stride = std::stoi(argv[i]); + } else if (arg == "-stc" || arg == "--show_token_count") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.token_interval = std::stoi(argv[i]); } else if (arg == "--ppl-output-type") { if (++i >= argc) { invalid_param = true; @@ -944,6 +950,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); + printf(" -stc N --show_token_count N\n"); + printf(" show consumed tokens every N tokens\n"); printf("\n"); #ifndef LOG_DISABLE_LOGS log_print_usage(); diff --git a/common/common.h b/common/common.h index e2bbfc258b646..a295e88b05044 100644 --- a/common/common.h +++ b/common/common.h @@ -64,6 +64,7 @@ struct gpt_params { int32_t n_beams = 0; // if non-zero then use beam search of given width. int32_t grp_attn_n = 1; // group-attention factor int32_t grp_attn_w = 512; // group-attention width + int32_t token_interval = 512; // show token count every 512 tokens float rope_freq_base = 0.0f; // RoPE base frequency float rope_freq_scale = 0.0f; // RoPE frequency scaling factor float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor @@ -242,4 +243,3 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); // Dump the KV cache view showing individual sequences in each cell (long output). 
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40); - diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 5ea67051f3654..1f35febbd181a 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -500,7 +500,7 @@ int main(int argc, char ** argv) { while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict if (!embd.empty()) { - // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via + // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via // --prompt or --file which uses the same value. int max_embd_size = n_ctx - 4; @@ -650,6 +650,10 @@ int main(int argc, char ** argv) { n_past += n_eval; LOG("n_past = %d\n", n_past); + // Display total tokens alongside total time + if (n_past % params.token_interval == 0) { + printf("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); + } } if (!embd.empty() && !path_session.empty()) { diff --git a/llama.cpp b/llama.cpp index e1f1932baecf1..aaadfa444637e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -10921,7 +10921,7 @@ void llama_print_timings(struct llama_context * ctx) { __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval); LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval); - LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms)); + LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval)); } void llama_reset_timings(struct llama_context * ctx) { From d8d90aa343c22fe01429d3540e47ded87e9dcb9d Mon Sep 17 00:00:00 2001 From: Someone Date: Thu, 11 Jan 2024 17:22:34 +0000 Subject: [PATCH 013/131] ci: nix-flake-update: new token with pr permissions (#4879) * ci: nix-flake-update: new token with pr permissions --------- Co-authored-by: Georgi Gerganov --- .github/workflows/nix-flake-update.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nix-flake-update.yml b/.github/workflows/nix-flake-update.yml index fa936084198fc..3a6a96e263e59 100644 --- a/.github/workflows/nix-flake-update.yml +++ b/.github/workflows/nix-flake-update.yml @@ -19,4 +19,4 @@ jobs: pr-labels: | nix pr-reviewers: philiptaron,SomeoneSerge - token: ${{ secrets.GITHUB_TOKEN }} + token: ${{ secrets.FLAKE_TOKEN }} From eab67950068e4b125007d027232c47d2a5831cd0 Mon Sep 17 00:00:00 2001 From: Behnam M <58621210+ibehnam@users.noreply.github.com> Date: Thu, 11 Jan 2024 12:41:39 -0500 Subject: [PATCH 014/131] server : add `LOG_INFO` when model is successfully loaded (#4881) * added /health endpoint to the server * added comments on the additional /health endpoint * Better handling of server state When the model is being loaded, the server state is `LOADING_MODEL`. If model-loading fails, the server state becomes `ERROR`, otherwise it becomes `READY`. The `/health` endpoint provides more granular messages now according to the server_state value. 
* initialized server_state * fixed a typo * starting http server before initializing the model * Update server.cpp * Update server.cpp * fixes * fixes * fixes * made ServerState atomic and turned two-line spaces into one-line * updated `server` readme to document the `/health` endpoint too * used LOG_INFO after successful model loading --- examples/server/server.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 860e4e9ae3d81..51a4b689f78b1 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2906,6 +2906,7 @@ int main(int argc, char **argv) } else { llama.initialize(); state.store(SERVER_STATE_READY); + LOG_INFO("model loaded", {}); } // Middleware for API key validation From 27379455c38cb13f24de92dbd6fcdd04eeb1b9d9 Mon Sep 17 00:00:00 2001 From: Michael Coppola Date: Thu, 11 Jan 2024 12:51:17 -0500 Subject: [PATCH 015/131] server : support for multiple api keys (#4864) * server: added support for multiple api keys, added loading api keys from file * minor: fix whitespace * added file error handling to --api-key-file, changed code to better reflect current style * server: update README.md for --api-key-file --------- Co-authored-by: Michael Coppola --- examples/server/README.md | 3 ++- examples/server/server.cpp | 36 ++++++++++++++++++++++++++++++------ 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index dc27e72b9d084..fd3034b99c3d2 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -23,7 +23,8 @@ Command line options: - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. - `--port`: Set the port to listen. Default: `8080`. - `--path`: path from which to serve static files (default examples/server/public) -- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. +- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys. +- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s. - `--embedding`: Enable embedding extraction, Default: disabled. - `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1) - `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 51a4b689f78b1..345004fa15f5b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -39,7 +39,7 @@ using json = nlohmann::json; struct server_params { std::string hostname = "127.0.0.1"; - std::string api_key; + std::vector api_keys; std::string public_path = "examples/server/public"; int32_t port = 8080; int32_t read_timeout = 600; @@ -2021,6 +2021,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); printf(" --api-key API_KEY optional api key to enhance server security. 
If set, requests must include this key for access.\n"); + printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n"); printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); @@ -2081,7 +2082,28 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, invalid_param = true; break; } - sparams.api_key = argv[i]; + sparams.api_keys.push_back(argv[i]); + } + else if (arg == "--api-key-file") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + std::ifstream key_file(argv[i]); + if (!key_file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + break; + } + std::string key; + while (std::getline(key_file, key)) { + if (key.size() > 0) { + sparams.api_keys.push_back(key); + } + } + key_file.close(); } else if (arg == "--timeout" || arg == "-to") { @@ -2881,8 +2903,10 @@ int main(int argc, char **argv) log_data["hostname"] = sparams.hostname; log_data["port"] = std::to_string(sparams.port); - if (!sparams.api_key.empty()) { - log_data["api_key"] = "api_key: ****" + sparams.api_key.substr(sparams.api_key.length() - 4); + if (sparams.api_keys.size() == 1) { + log_data["api_key"] = "api_key: ****" + sparams.api_keys[0].substr(sparams.api_keys[0].length() - 4); + } else if (sparams.api_keys.size() > 1) { + log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded"; } LOG_INFO("HTTP server listening", log_data); @@ -2912,7 +2936,7 @@ int main(int argc, char **argv) // Middleware for API key validation auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool { // If API key is not set, skip validation - if (sparams.api_key.empty()) { + if (sparams.api_keys.empty()) { return true; } @@ -2921,7 +2945,7 @@ int main(int argc, char **argv) std::string prefix = "Bearer "; if (auth_header.substr(0, prefix.size()) == prefix) { std::string received_api_key = auth_header.substr(prefix.size()); - if (received_api_key == sparams.api_key) { + if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) { return true; // API key is valid } } From 4330bd83feb39683de4bd7a34cfcf672ff8ac3e4 Mon Sep 17 00:00:00 2001 From: Laura Date: Thu, 11 Jan 2024 19:02:48 +0100 Subject: [PATCH 016/131] server : implement credentialed CORS (#4514) * Implement credentialed CORS according to MDN * Fix syntax error * Move validate_api_key up so it is defined before its first usage --- examples/server/server.cpp | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 345004fa15f5b..031824e145411 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2822,9 +2822,15 @@ int main(int argc, char **argv) std::atomic state{SERVER_STATE_LOADING_MODEL}; - svr.set_default_headers({{"Server", "llama.cpp"}, - {"Access-Control-Allow-Origin", "*"}, - {"Access-Control-Allow-Headers", "content-type"}}); + svr.set_default_headers({{"Server", "llama.cpp"}}); + + // CORS preflight + svr.Options(R"(.*)", [](const httplib::Request &req, httplib::Response &res) { + 
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + res.set_header("Access-Control-Allow-Credentials", "true"); + res.set_header("Access-Control-Allow-Methods", "POST"); + res.set_header("Access-Control-Allow-Headers", "*"); + }); svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) { server_state current_state = state.load(); @@ -2987,9 +2993,9 @@ int main(int argc, char **argv) return false; }); - svr.Get("/props", [&llama](const httplib::Request & /*req*/, httplib::Response &res) + svr.Get("/props", [&llama](const httplib::Request & req, httplib::Response &res) { - res.set_header("Access-Control-Allow-Origin", "*"); + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = { { "user_name", llama.name_user.c_str() }, { "assistant_name", llama.name_assistant.c_str() } @@ -2999,6 +3005,7 @@ int main(int argc, char **argv) svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); if (!validate_api_key(req, res)) { return; } @@ -3066,8 +3073,9 @@ int main(int argc, char **argv) } }); - svr.Get("/v1/models", [¶ms](const httplib::Request&, httplib::Response& res) + svr.Get("/v1/models", [¶ms](const httplib::Request& req, httplib::Response& res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); std::time_t t = std::time(0); json models = { @@ -3085,9 +3093,11 @@ int main(int argc, char **argv) res.set_content(models.dump(), "application/json; charset=utf-8"); }); + // TODO: add mount point without "/v1" prefix -- how? svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); if (!validate_api_key(req, res)) { return; } @@ -3161,6 +3171,7 @@ int main(int argc, char **argv) svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); if (!validate_api_key(req, res)) { return; } @@ -3233,6 +3244,7 @@ int main(int argc, char **argv) svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body); std::vector tokens; if (body.count("content") != 0) @@ -3245,6 +3257,7 @@ int main(int argc, char **argv) svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body); std::string content; if (body.count("tokens") != 0) @@ -3259,6 +3272,7 @@ int main(int argc, char **argv) svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body); json prompt; if (body.count("content") != 0) From 3ba5b8ca8e6181a5c712c5b77595a29f1d3e2b97 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jan 2024 21:31:31 +0200 Subject: [PATCH 017/131] swift : pin ggml commit + remove ggml.h from spm-headers (#4878) ggml-ci --- Package.swift | 2 +- spm-headers/ggml.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 120000 spm-headers/ggml.h diff --git a/Package.swift b/Package.swift 
index 583e2e276e471..59191da45c233 100644 --- a/Package.swift +++ b/Package.swift @@ -14,7 +14,7 @@ let package = Package( .library(name: "llama", targets: ["llama"]), ], dependencies: [ - .package(url: "https://github.com/ggerganov/ggml.git", .branch("master")) + .package(url: "https://github.com/ggerganov/ggml.git", .revision("979cc23b345006504cfc1f67c0fdf627805e3319")) ], targets: [ .target( diff --git a/spm-headers/ggml.h b/spm-headers/ggml.h deleted file mode 120000 index 39215298f981b..0000000000000 --- a/spm-headers/ggml.h +++ /dev/null @@ -1 +0,0 @@ -../ggml.h \ No newline at end of file From 49662cbed3e95f5976c070b85b9fd53fd577038d Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Thu, 11 Jan 2024 20:39:39 +0100 Subject: [PATCH 018/131] ggml : SOTA 2-bit quants (add IQ2_XS) (#4856) * iq2_xs: basics * iq2_xs: this should have been in the basics * iq2_xs: CUDA and scalar CPU works * iq2_xs: WIP Metal * iq2_xs: Metal now works * iq2_xs: working, but dog slow, ARM_NEON dot product * iq2_xs: better ARM_NEON dot product We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when running on the CPU. * iq2_xs: AVX2 dot product - 19.5 t/s * iq2_xs: faster AVX2 dit product 21.4 t/s for TG-128, 59.2 t/s for PP-512. The latter is 2x compared to the previous version. * iq2_xs: had forgotten to delete iq2-data.h * Add llama enum for IQ2_XS --------- Co-authored-by: Iwan Kawrakow --- ggml-cuda.cu | 232 +++++++++++++++++++++- ggml-metal.m | 42 +++- ggml-metal.metal | 378 +++++++++++++++++++++++++++++++++++- ggml-quants.c | 360 +++++++++++++++++++++++++++++++++- ggml-quants.h | 12 ++ ggml.c | 30 ++- ggml.h | 3 + llama.cpp | 3 + llama.h | 1 + tests/test-quantize-fns.cpp | 5 +- 10 files changed, 1038 insertions(+), 28 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 900f7ba4afac4..dd19699f6669c 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -486,6 +486,15 @@ typedef struct { } block_iq2_xxs; static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding"); +#define QR2_XS 8 +#define QI2_XS (QK_K / (4*QR2_XS)) +typedef struct { + half d; + uint16_t qs[QK_K/8]; + uint8_t scales[QK_K/32]; +} block_iq2_xs; +static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); + #define WARP_SIZE 32 #define MATRIX_ROW_PADDING 512 // last row of quant. 
matrices is a multiple of this to avoid out-of-bounds memory accesses @@ -1328,7 +1337,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t #endif } -static const __device__ uint64_t kgrid_iq2xxs[256] = { +static const __device__ uint64_t iq2xxs_grid[256] = { 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, @@ -1395,6 +1404,137 @@ static const __device__ uint64_t kgrid_iq2xxs[256] = { 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908, }; +static const __device__ uint64_t iq2xs_grid[512] = { + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, + 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, + 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, + 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, + 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, + 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819, + 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819, + 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, + 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b, + 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b, + 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908, + 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908, + 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919, + 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808, + 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919, + 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908, + 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, + 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, + 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08, + 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808, + 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808, + 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819, + 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908, + 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819, + 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808, + 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b, + 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819, + 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808, + 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908, + 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19, + 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b, + 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b, + 
0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919, + 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808, + 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819, + 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819, + 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b, + 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908, + 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808, + 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819, + 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808, + 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, + 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808, + 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808, + 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908, + 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908, + 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808, + 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b, + 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819, + 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, + 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908, + 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808, + 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908, + 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919, + 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08, + 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19, + 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b, + 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b, + 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808, + 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08, + 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b, + 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908, + 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b, + 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, + 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, + 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808, + 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808, + 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08, + 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819, + 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919, + 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808, + 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808, + 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819, + 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819, + 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908, + 0x1908191908080819, 
0x1908191908081908, 0x1908191908190808, 0x19081919082b1908, + 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b, + 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908, + 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908, + 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908, + 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808, + 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, + 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819, + 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819, + 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808, + 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b, + 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819, + 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819, + 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08, + 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808, + 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19, + 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919, + 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, + 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19, + 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b, + 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808, + 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b, + 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b, + 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, + 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b, + 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808, + 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819, + 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808, + 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808, + 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, + 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b, + 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19, + 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08, + 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919, + 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08, + 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08, + 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908, + 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908, + 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b, + 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908, + 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808, + 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b, + 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808, + 0x2b2b08080808082b, 0x2b2b080808082b08, 
0x2b2b080808082b2b, 0x2b2b0808082b0808, + 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19, + 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08, + 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808, + 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, + 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808, + 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b, + 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, +}; + static const __device__ uint8_t ksigns_iq2xs[128] = { 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, @@ -1439,7 +1579,7 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds dst_t * y = yy + i*QK_K + 32*ib + 8*il; const uint16_t * q2 = x[i].qs + 4*ib; const uint8_t * aux8 = (const uint8_t *)q2; - const uint8_t * grid = (const uint8_t *)(kgrid_iq2xxs + aux8[il]); + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]); const uint32_t aux32 = q2[2] | (q2[3] << 16); const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f; const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; @@ -1450,6 +1590,28 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds } +template +static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int i = blockIdx.x; + const block_iq2_xs * x = (const block_iq2_xs *) vx; + + const int tid = threadIdx.x; +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * q2 = x[i].qs + 4*ib; + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511)); + const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; + const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); +#else + assert(false); +#endif + +} + static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); @@ -3996,7 +4158,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( uint32_t aux32 = q2[2] | (q2[3] << 16); int sumi = 0; for (int l = 0; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(kgrid_iq2xxs + aux8[l]); + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); const uint8_t signs = ksigns_iq2xs[aux32 & 127]; for (int j = 0; j < 8; ++j) { sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? 
-1 : 1); @@ -4012,8 +4174,8 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( const int il = iqs%2; const uint16_t * q2 = bq2->qs + 4*ib32; const uint8_t * aux8 = (const uint8_t *)q2; - const uint8_t * grid1 = (const uint8_t *)(kgrid_iq2xxs + aux8[2*il+0]); - const uint8_t * grid2 = (const uint8_t *)(kgrid_iq2xxs + aux8[2*il+1]); + const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]); + const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]); const uint32_t aux32 = q2[2] | (q2[3] << 16); const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (float)bq8_1[ib32].ds.x * 0.25f; const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127]; @@ -4032,6 +4194,42 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( #endif } +static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if QK_K == 256 + const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq; + + const int ib32 = iqs; + const uint16_t * q2 = bq2->qs + 4*ib32; + const int8_t * q8 = bq8_1[ib32].qs; + const uint8_t ls1 = bq2->scales[ib32] & 0xf; + const uint8_t ls2 = bq2->scales[ib32] >> 4; + int sumi1 = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + int sumi2 = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi2 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + const float d = (float)bq2->d * (float)bq8_1[ib32].ds.x * 0.25f; + return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); +#else + assert(false); + return 0.f; +#endif +} + template static __device__ __forceinline__ void mul_mat_q( @@ -6035,6 +6233,12 @@ static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, dequantize_block_iq2_xxs<<>>(vx, y); } +template +static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_iq2_xs<<>>(vx, y); +} + template static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; @@ -6065,6 +6269,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_q6_K_cuda; case GGML_TYPE_IQ2_XXS: return dequantize_row_iq2_xxs_cuda; + case GGML_TYPE_IQ2_XS: + return dequantize_row_iq2_xs_cuda; case GGML_TYPE_F32: return convert_unary_cuda; default: @@ -6096,6 +6302,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_q6_K_cuda; case GGML_TYPE_IQ2_XXS: return dequantize_row_iq2_xxs_cuda; + case GGML_TYPE_IQ2_XS: + return dequantize_row_iq2_xs_cuda; case GGML_TYPE_F16: return convert_unary_cuda; default: @@ -6299,6 +6507,15 @@ static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, floa <<>>(vx, vy, dst, ncols, nrows); } +static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + 
const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + static void ggml_mul_mat_q4_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { @@ -7871,6 +8088,7 @@ static int64_t get_row_rounding(ggml_type type) { case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: return max_compute_capability >= CC_RDNA2 ? 128 : 64; default: GGML_ASSERT(false); @@ -7892,6 +8110,7 @@ static int64_t get_row_rounding(ggml_type type) { case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: return max_compute_capability >= CC_VOLTA ? 128 : 64; case GGML_TYPE_Q6_K: return 64; @@ -7945,6 +8164,9 @@ static void ggml_cuda_op_mul_mat_vec_q( case GGML_TYPE_IQ2_XXS: mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; + case GGML_TYPE_IQ2_XS: + mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; default: GGML_ASSERT(false); break; diff --git a/ggml-metal.m b/ggml-metal.m index 9698e5a79cc7d..6e5594432b21a 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -89,6 +89,7 @@ GGML_METAL_DECL_KERNEL(get_rows_q6_K); GGML_METAL_DECL_KERNEL(get_rows_i32); GGML_METAL_DECL_KERNEL(get_rows_iq2_xxs); + GGML_METAL_DECL_KERNEL(get_rows_iq2_xs); GGML_METAL_DECL_KERNEL(rms_norm); GGML_METAL_DECL_KERNEL(group_norm); GGML_METAL_DECL_KERNEL(norm); @@ -108,6 +109,7 @@ GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32); GGML_METAL_DECL_KERNEL(mul_mv_iq2_xxs_f32); + GGML_METAL_DECL_KERNEL(mul_mv_iq2_xs_f32); GGML_METAL_DECL_KERNEL(mul_mv_id_f32_f32); //GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f16); GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32); @@ -124,6 +126,7 @@ GGML_METAL_DECL_KERNEL(mul_mv_id_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mv_id_q6_K_f32); GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xxs_f32); + GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xs_f32); GGML_METAL_DECL_KERNEL(mul_mm_f32_f32); GGML_METAL_DECL_KERNEL(mul_mm_f16_f32); GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32); @@ -137,6 +140,7 @@ GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_iq2_xxs_f32); + GGML_METAL_DECL_KERNEL(mul_mm_iq2_xs_f32); GGML_METAL_DECL_KERNEL(mul_mm_id_f32_f32); GGML_METAL_DECL_KERNEL(mul_mm_id_f16_f32); GGML_METAL_DECL_KERNEL(mul_mm_id_q4_0_f32); @@ -150,6 +154,7 @@ GGML_METAL_DECL_KERNEL(mul_mm_id_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_id_q6_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xxs_f32); + GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xs_f32); GGML_METAL_DECL_KERNEL(rope_f32); GGML_METAL_DECL_KERNEL(rope_f16); GGML_METAL_DECL_KERNEL(alibi_f32); @@ -385,6 +390,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(get_rows_q6_K); GGML_METAL_ADD_KERNEL(get_rows_i32); GGML_METAL_ADD_KERNEL(get_rows_iq2_xxs); + GGML_METAL_ADD_KERNEL(get_rows_iq2_xs); GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(group_norm); GGML_METAL_ADD_KERNEL(norm); @@ -404,6 +410,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32); GGML_METAL_ADD_KERNEL(mul_mv_iq2_xxs_f32); + GGML_METAL_ADD_KERNEL(mul_mv_iq2_xs_f32); GGML_METAL_ADD_KERNEL(mul_mv_id_f32_f32); 
//GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f16); GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32); @@ -420,6 +427,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(mul_mv_id_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mv_id_q6_K_f32); GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xxs_f32); + GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xs_f32); if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { GGML_METAL_ADD_KERNEL(mul_mm_f32_f32); GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); @@ -434,6 +442,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_iq2_xxs_f32); + GGML_METAL_ADD_KERNEL(mul_mm_iq2_xs_f32); GGML_METAL_ADD_KERNEL(mul_mm_id_f32_f32); GGML_METAL_ADD_KERNEL(mul_mm_id_f16_f32); GGML_METAL_ADD_KERNEL(mul_mm_id_q4_0_f32); @@ -447,6 +456,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(mul_mm_id_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_id_q6_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xxs_f32); + GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xs_f32); } GGML_METAL_ADD_KERNEL(rope_f32); GGML_METAL_ADD_KERNEL(rope_f16); @@ -513,6 +523,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(get_rows_q6_K); GGML_METAL_DEL_KERNEL(get_rows_i32); GGML_METAL_DEL_KERNEL(get_rows_iq2_xxs); + GGML_METAL_DEL_KERNEL(get_rows_iq2_xs); GGML_METAL_DEL_KERNEL(rms_norm); GGML_METAL_DEL_KERNEL(group_norm); GGML_METAL_DEL_KERNEL(norm); @@ -532,6 +543,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32); GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32); GGML_METAL_DEL_KERNEL(mul_mv_iq2_xxs_f32); + GGML_METAL_DEL_KERNEL(mul_mv_iq2_xs_f32); GGML_METAL_DEL_KERNEL(mul_mv_id_f32_f32); //GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f16); GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32); @@ -548,6 +560,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul_mv_id_q5_K_f32); GGML_METAL_DEL_KERNEL(mul_mv_id_q6_K_f32); GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xxs_f32); + GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xs_f32); if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { GGML_METAL_DEL_KERNEL(mul_mm_f32_f32); GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); @@ -562,6 +575,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_iq2_xxs_f32); + GGML_METAL_DEL_KERNEL(mul_mm_iq2_xs_f32); GGML_METAL_DEL_KERNEL(mul_mm_id_f32_f32); GGML_METAL_DEL_KERNEL(mul_mm_id_f16_f32); GGML_METAL_DEL_KERNEL(mul_mm_id_q4_0_f32); @@ -575,6 +589,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul_mm_id_q5_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_id_q6_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xxs_f32); + GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xs_f32); } GGML_METAL_DEL_KERNEL(rope_f32); GGML_METAL_DEL_KERNEL(rope_f16); @@ -1561,6 +1576,7 @@ bool ggml_metal_graph_compute( case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break; case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break; case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xxs_f32]; break; + case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xs_f32]; break; default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); } [encoder setBuffer:id_src0 
offset:offs_src0 atIndex:0]; @@ -1679,6 +1695,12 @@ bool ggml_metal_graph_compute( nth1 = 16; [encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xxs_f32]; } break; + case GGML_TYPE_IQ2_XS: + { + nth0 = 4; + nth1 = 16; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xs_f32]; + } break; default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); @@ -1712,12 +1734,12 @@ bool ggml_metal_graph_compute( if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || - //src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src0t == GGML_TYPE_IQ2_XXS) { - [encoder setThreadgroupMemoryLength:(256*8+128) atIndex:0]; + else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { + const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q4_K) { @@ -1810,6 +1832,7 @@ bool ggml_metal_graph_compute( case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_K_f32]; break; case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break; case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xxs_f32]; break; + case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xs_f32]; break; default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1931,6 +1954,12 @@ bool ggml_metal_graph_compute( nth1 = 16; [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xxs_f32]; } break; + case GGML_TYPE_IQ2_XS: + { + nth0 = 4; + nth1 = 16; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xs_f32]; + } break; default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); @@ -1980,12 +2009,12 @@ bool ggml_metal_graph_compute( if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 || src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 || - //src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src2t == GGML_TYPE_IQ2_XXS) { - [encoder setThreadgroupMemoryLength:(256*8+128) atIndex:0]; + else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) { + const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 
256*8+128 : 512*8+128; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src2t == GGML_TYPE_Q4_K) { @@ -2026,6 +2055,7 @@ bool ggml_metal_graph_compute( case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break; case GGML_TYPE_I32: [encoder setComputePipelineState:ctx->pipeline_get_rows_i32]; break; case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xxs]; break; + case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xs]; break; default: GGML_ASSERT(false && "not implemented"); } diff --git a/ggml-metal.metal b/ggml-metal.metal index 229efb8b69db1..029578dc54dbd 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -2452,6 +2452,13 @@ typedef struct { } block_iq2_xxs; // 66 bytes / block for QK_K = 256, so 2.0625 bpw +typedef struct { + half d; + uint16_t qs[QK_K/8]; + uint8_t scales[QK_K/32]; +} block_iq2_xs; +// 74 bytes / block for QK_K = 256, so 2.3125 bpw + //====================================== dot products ========================= void kernel_mul_mv_q2_K_f32_impl( @@ -3476,7 +3483,7 @@ kernel void kernel_mul_mv_q6_K_f32( // ======================= "True" 2-bit -constexpr constant static uint64_t kgrid_iq2xxs[256] = { +constexpr constant static uint64_t iq2xxs_grid[256] = { 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, @@ -3543,6 +3550,137 @@ constexpr constant static uint64_t kgrid_iq2xxs[256] = { 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908, }; +constexpr constant static uint64_t iq2xs_grid[512] = { + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, + 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, + 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, + 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, + 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, + 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819, + 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819, + 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, + 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b, + 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b, + 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908, + 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908, + 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919, + 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808, + 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919, + 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908, + 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, + 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, + 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08, + 
0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808, + 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808, + 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819, + 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908, + 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819, + 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808, + 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b, + 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819, + 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808, + 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908, + 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19, + 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b, + 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b, + 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919, + 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808, + 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819, + 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819, + 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b, + 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908, + 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808, + 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819, + 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808, + 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, + 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808, + 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808, + 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908, + 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908, + 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808, + 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b, + 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819, + 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, + 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908, + 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808, + 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908, + 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919, + 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08, + 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19, + 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b, + 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b, + 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808, + 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08, + 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b, + 0x082b2b2b2b082b08, 
0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908, + 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b, + 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, + 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, + 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808, + 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808, + 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08, + 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819, + 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919, + 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808, + 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808, + 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819, + 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819, + 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908, + 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908, + 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b, + 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908, + 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908, + 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908, + 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808, + 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, + 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819, + 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819, + 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808, + 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b, + 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819, + 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819, + 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08, + 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808, + 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19, + 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919, + 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, + 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19, + 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b, + 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808, + 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b, + 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b, + 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, + 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b, + 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808, + 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819, + 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808, + 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808, + 0x2b08082b2b080808, 0x2b08082b2b08082b, 
0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, + 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b, + 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19, + 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08, + 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919, + 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08, + 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08, + 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908, + 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908, + 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b, + 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908, + 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808, + 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b, + 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808, + 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808, + 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19, + 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08, + 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808, + 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, + 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808, + 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b, + 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, +}; + constexpr constant static uint8_t ksigns_iq2xs[128] = { 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, @@ -3600,7 +3738,7 @@ void kernel_mul_mv_iq2_xxs_f32_impl( { int nval = 4; int pos = (32*sgitg + tiisg)*nval; - for (int i = 0; i < nval; ++i) values[pos + i] = kgrid_iq2xxs[pos + i]; + for (int i = 0; i < nval; ++i) values[pos + i] = iq2xxs_grid[pos + i]; nval = 2; pos = (32*sgitg + tiisg)*nval; for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i]; @@ -3689,6 +3827,149 @@ kernel void kernel_mul_mv_iq2_xxs_f32( kernel_mul_mv_iq2_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } +void kernel_mul_mv_iq2_xs_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + + device const block_iq2_xs * x = (device const block_iq2_xs *) src0 + ib_row + offset0; + device const float * y = (device const float *) 
src1 + r1*ne10 + im*ne00*ne1; + + float yl[32]; + float sumf[N_DST]={0.f}, all_sum; + + const int nb32 = nb * (QK_K / 32); + + threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values; + threadgroup uint8_t * shared_signs = (threadgroup uint8_t *)(values + 512); + { + int nval = 8; + int pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) values[pos + i] = iq2xs_grid[pos + i]; + nval = 2; + pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i]; + threadgroup_barrier(mem_flags::mem_threadgroup); + } + +#if QK_K == 256 + const int ix = tiisg; + + device const float * y4 = y + 32 * ix; + + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { + + for (int i = 0; i < 32; ++i) { + yl[i] = y4[i]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq2_xs * xr = x + ibl; + device const uint16_t * q2 = xr->qs + 4 * ib; + device const uint8_t * sc = xr->scales + ib; + device const half * dh = &xr->d; + + for (int row = 0; row < N_DST; row++) { + + const float db = dh[0]; + const uint8_t ls1 = sc[0] & 0xf; + const uint8_t ls2 = sc[0] >> 4; + const float d1 = db * (0.5f + ls1); + const float d2 = db * (0.5f + ls2); + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < 2; ++l) { + const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511)); + const uint8_t signs = shared_signs[(q2[l] >> 9)]; + for (int j = 0; j < 8; ++j) { + sum1 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); + } + } + for (int l = 2; l < 4; ++l) { + const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511)); + const uint8_t signs = shared_signs[(q2[l] >> 9)]; + for (int j = 0; j < 8; ++j) { + sum2 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? 
-1.f : 1.f); + } + } + sumf[row] += d1 * sum1 + d2 * sum2; + + dh += nb*sizeof(block_iq2_xs)/2; + q2 += nb*sizeof(block_iq2_xs)/2; + sc += nb*sizeof(block_iq2_xs); + } + + y4 += 32 * 32; + } +#else + // TODO +#endif + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f; + } + } +} + +[[host_name("kernel_mul_mv_iq2_xs_f32")]] +kernel void kernel_mul_mv_iq2_xs_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); +} + //============================= templates and their specializations ============================= // NOTE: this is not dequantizing - we are simply fitting the template @@ -3973,18 +4254,39 @@ void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x const uint32_t aux32_s = q2[2] | (q2[3] << 16); thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g; const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f; - constant uint8_t * grid = (constant uint8_t *)(kgrid_iq2xxs + aux8[2*il+0]); + constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]); uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127]; for (int i = 0; i < 8; ++i) { reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); } - grid = (constant uint8_t *)(kgrid_iq2xxs + aux8[2*il+1]); + grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]); signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127]; for (int i = 0; i < 8; ++i) { reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); } } +template +void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint16_t * q2 = xb->qs + 4*ib32; + const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f; + constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511)); + uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9]; + for (int i = 0; i < 8; ++i) { + reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } + grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511)); + signs = ksigns_iq2xs[q2[2*il+1] >> 9]; + for (int i = 0; i < 8; ++i) { + reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? 
-1.f : 1.f); + } +} + template kernel void kernel_get_rows( device const void * src0, @@ -4525,6 +4827,7 @@ template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows; // // matrix-matrix multiplication @@ -4562,6 +4865,7 @@ template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm; // // indirect matrix-matrix multiplication @@ -4611,6 +4915,7 @@ template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mu template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; // // matrix-vector multiplication @@ -5448,3 +5753,68 @@ kernel void kernel_mul_mv_id_iq2_xxs_f32( tiisg, sgitg); } + +[[host_name("kernel_mul_mv_id_iq2_xs_f32")]] +kernel void kernel_mul_mv_id_iq2_xs_f32( + device const char * ids, + device const char * src1, + device float * dst, + constant uint64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_iq2_xs_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + dst + bid*ne0, + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + shared_values, + tgpig, + tiisg, + sgitg); +} diff --git a/ggml-quants.c b/ggml-quants.c index d497e6de9ceb5..a24b4b2441e02 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -2342,15 +2342,7 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * // ====================== "True" 2-bit (de)-quantization -void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) { - 
(void)x; - (void)y; - (void)k; - assert(k % QK_K == 0); - //fprintf(stderr, "=========================== %s: not implemented\n", __func__); -} - -static const uint64_t iq2xxs_grid[256] = { +static const uint64_t iq2xxs_grid[256] = { 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, @@ -2417,6 +2409,137 @@ static const uint64_t iq2xxs_grid[256] = { 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908, }; +static const uint64_t iq2xs_grid[512] = { + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, + 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, + 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, + 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, + 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, + 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819, + 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819, + 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, + 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b, + 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b, + 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908, + 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908, + 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919, + 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808, + 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919, + 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908, + 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, + 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, + 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08, + 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808, + 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808, + 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819, + 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908, + 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819, + 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808, + 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b, + 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819, + 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808, + 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908, + 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19, + 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b, + 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b, + 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919, + 
0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808, + 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819, + 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819, + 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b, + 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908, + 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808, + 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819, + 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808, + 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, + 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808, + 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808, + 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908, + 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908, + 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808, + 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b, + 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819, + 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, + 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908, + 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808, + 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908, + 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919, + 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08, + 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19, + 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b, + 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b, + 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808, + 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08, + 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b, + 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908, + 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b, + 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, + 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, + 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808, + 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808, + 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08, + 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819, + 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919, + 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808, + 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808, + 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819, + 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819, + 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908, + 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908, + 0x1908191919080808, 
0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b, + 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908, + 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908, + 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908, + 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808, + 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, + 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819, + 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819, + 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808, + 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b, + 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819, + 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819, + 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08, + 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808, + 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19, + 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919, + 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, + 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19, + 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b, + 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808, + 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b, + 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b, + 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, + 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b, + 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808, + 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819, + 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808, + 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808, + 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, + 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b, + 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19, + 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08, + 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919, + 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08, + 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08, + 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908, + 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908, + 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b, + 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908, + 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808, + 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b, + 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808, + 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808, + 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 
0x2b2b081919190819, 0x2b2b081919192b19, + 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08, + 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808, + 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, + 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808, + 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b, + 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, +}; + static const uint8_t ksigns_iq2xs[128] = { 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, @@ -2427,8 +2550,17 @@ static const uint8_t ksigns_iq2xs[128] = { 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111, 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, }; + static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128}; +void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) { + (void)x; + (void)y; + (void)k; + assert(k % QK_K == 0); + //fprintf(stderr, "=========================== %s: not implemented\n", __func__); +} + void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2472,6 +2604,58 @@ size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_ return (n/QK_K*sizeof(block_iq2_xxs)); } +// ====================== 2.3125 bpw (de)-quantization + +void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) { + (void)x; + (void)y; + (void)k; + assert(k % QK_K == 0); + //fprintf(stderr, "=========================== %s: not implemented\n", __func__); +} + +void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + float db[2]; + + for (int i = 0; i < nb; i++) { + + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f; + db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (x[i].qs[4*ib32 + l] & 511)); + const uint8_t signs = ksigns_iq2xs[x[i].qs[4*ib32 + l] >> 9]; + for (int j = 0; j < 8; ++j) { + y[j] = db[l/2] * grid[j] * (signs & kmask_iq2xs[j] ? 
-1.f : 1.f); + } + y += 8; + } + } + } +} + +void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) { + assert(k % QK_K == 0); + block_iq2_xs * restrict y = vy; + quantize_row_iq2_xs_reference(x, y, k); +} + +size_t ggml_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK_K == 0); + (void)hist; // TODO: collect histograms + + for (int j = 0; j < n; j += k) { + block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K; + quantize_row_iq2_xs_reference(src + j, y, k); + } + return (n/QK_K*sizeof(block_iq2_xs)); +} + //===================================== Q8_K ============================================== void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { @@ -7357,3 +7541,161 @@ void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * res *s = 0.125f * sumf; #endif } + +void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_iq2_xs * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + int8x16x4_t q2u; + int8x16x4_t q2s; + int8x16x4_t q8b; + + int32x4x4_t scales32; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + const uint8x8_t scales8 = vld1_u8(x[i].scales); + const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf)); + const uint8x8_t scales_h = vshr_n_u8(scales8, 4); + uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h)); + scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1)); + const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales)); + const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales)); + scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1))); + scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1))); + scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2))); + scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2))); + int32x4_t sumi = vdupq_n_s32(0); + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + q8b = vld1q_s8_x4(q8); q8 += 64; + q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511)))); + q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511)))); + q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511)))); + q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511)))); + q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9)))); + q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9)))); + q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9)))); + q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const void *)(signs64 + (q2[7] >> 9)))); + q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]); + q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]); + q2u.val[2] = 
vmulq_s8(q2u.val[2], q2s.val[2]); + q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]); + const int32x4_t p1 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]); + const int32x4_t p2 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]); + const int32x4_t p3 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]); + const int32x4_t p4 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]); + const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4)); + sumi = vmlaq_s32(sumi, p, scales32.val[ib64]); + q2 += 8; + } + sumf += d*vaddvq_s32(sumi); + } + *s = 0.125f * sumf; + +#elif defined(__AVX2__) + + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m1 = _mm_set1_epi8(1); + const __m128i m511 = _mm_set1_epi16(511); + const __m128i m127 = _mm_set1_epi16(127); + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint64_t aux64; + + // somewhat hacky, but gives a significant boost in performance + __m128i aux_gindex, aux_sindex; + const uint16_t * gindex = (const uint16_t *)&aux_gindex; + const uint16_t * sindex = (const uint16_t *)&aux_sindex; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + __m128i stmp = _mm_set1_epi64x(aux64); + stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); + const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); + + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m128i q2_data = _mm_loadu_si128((const __m128i*)q2); q2 += 8; + aux_gindex = _mm_and_si128(q2_data, m511); + aux_sindex = _mm_and_si128(_mm_srli_epi16(q2_data, 9), m127); + const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]], iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]); + const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]], iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]); + const __m256i s2_1 = _mm256_set_epi64x(signs64[sindex[3]], signs64[sindex[2]], signs64[sindex[1]], signs64[sindex[0]]); + const __m256i s2_2 = _mm256_set_epi64x(signs64[sindex[7]], signs64[sindex[6]], signs64[sindex[5]], signs64[sindex[4]]); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + + const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0))); + const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1))); + + sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1)); + sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2)); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const uint8_t * restrict sc = x[i].scales; + const int8_t * restrict q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + 
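+            // Generic fallback: decode one group of 32 weights per iteration.
+            // scales[ib32] packs two 4-bit sub-scales: the low nibble covers the first 16 weights
+            // (codes q2[0], q2[1]), the high nibble the last 16 (codes q2[2], q2[3]).
+            // Each uint16_t code keeps a 9-bit index into the 512-entry iq2xs_grid (8 packed
+            // magnitudes) in bits 0-8 and a 7-bit index into ksigns_iq2xs in bits 9-15.
+            // Together with the final *0.125f, the effective sub-scale is d*(0.5f + s)*0.25f,
+            // the same scaling used by dequantize_row_iq2_xs above.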
const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; + const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls1; + sumi = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls2; + q2 += 4; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} diff --git a/ggml-quants.h b/ggml-quants.h index 8dd911d4182fa..df5e7ae807f5f 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -174,6 +174,14 @@ typedef struct { } block_iq2_xxs; static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding"); +// 2.3125 bpw quants +typedef struct { + ggml_fp16_t d; + uint16_t qs[QK_K/8]; + uint8_t scales[QK_K/32]; +} block_iq2_xs; +static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); + // Quantization void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); @@ -189,6 +197,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k); +void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs * restrict y, int k); void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); @@ -204,6 +213,7 @@ void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k); +void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k); // Dequantization void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); @@ -220,6 +230,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k); +void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k); // Dot product void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); @@ -234,3 +245,4 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * 
restrict vy); void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy); diff --git a/ggml.c b/ggml.c index 9c42a45e3d852..d2a8c0478ab72 100644 --- a/ggml.c +++ b/ggml.c @@ -584,6 +584,17 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq2_xxs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, + [GGML_TYPE_IQ2_XS] = { + .type_name = "iq2_xs", + .blck_size = QK_K, + .type_size = sizeof(block_iq2_xs), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq2_xs, + .from_float = quantize_row_iq2_xs, + .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xs_reference, + .vec_dot = ggml_vec_dot_iq2_xs_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }, [GGML_TYPE_Q8_K] = { .type_name = "q8_K", .blck_size = QK_K, @@ -2123,6 +2134,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break; case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break; case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break; + case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -7435,6 +7447,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: { ggml_compute_forward_add_q_f32(params, src0, src1, dst); } break; @@ -7700,6 +7713,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: { ggml_compute_forward_add1_q_f32(params, src0, src1, dst); } break; @@ -7815,6 +7829,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: default: { GGML_ASSERT(false); @@ -10457,6 +10472,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: { ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); } break; @@ -10632,6 +10648,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: default: { GGML_ASSERT(false); @@ -10827,6 +10844,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: { ggml_compute_forward_get_rows_q(params, src0, src1, dst); } break; @@ -11464,6 +11482,7 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -11539,6 +11558,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -18660,6 +18680,12 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K; result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist); } break; + case GGML_TYPE_IQ2_XS: + { + GGML_ASSERT(start % QK_K == 0); + block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K; + result = ggml_quantize_iq2_xs(src + start, block, n, n, hist); + } break; case GGML_TYPE_F16: { 
int elemsize = sizeof(ggml_fp16_t); @@ -19015,8 +19041,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p (int64_t) info->ne[3]; if (ne % ggml_blck_size(info->type) != 0) { - fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n", - __func__, info->name.data, ne, ggml_blck_size(info->type)); + fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n", + __func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type)); fclose(file); gguf_free(ctx); return NULL; diff --git a/ggml.h b/ggml.h index 127dcef1dedaa..93b42a27da53d 100644 --- a/ggml.h +++ b/ggml.h @@ -342,6 +342,7 @@ extern "C" { GGML_TYPE_Q6_K = 14, GGML_TYPE_Q8_K = 15, GGML_TYPE_IQ2_XXS = 16, + GGML_TYPE_IQ2_XS = 17, GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, @@ -377,6 +378,7 @@ extern "C" { GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors }; // available tensor operations: @@ -2061,6 +2063,7 @@ extern "C" { GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); diff --git a/llama.cpp b/llama.cpp index aaadfa444637e..bd219d49c7ac7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2223,6 +2223,7 @@ struct llama_model_loader { case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; + case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -2595,6 +2596,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; default: return "unknown, may not work"; } @@ -9050,6 +9052,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break; case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break; case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break; + case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } diff --git a/llama.h b/llama.h index c11075bbcd693..6fde113ffd458 100644 --- a/llama.h +++ b/llama.h @@ -104,6 +104,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/tests/test-quantize-fns.cpp 
b/tests/test-quantize-fns.cpp index cee712618be3d..31a78c6323134 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -134,8 +134,9 @@ int main(int argc, char * argv[]) { continue; } - if ((ggml_type)i == GGML_TYPE_IQ2_XXS) { - printf("Skip %s due to missing quantization functionality\n", ggml_type_name((ggml_type) i)); + const ggml_type ei = (ggml_type)i; + if (ei == GGML_TYPE_IQ2_XXS || ei == GGML_TYPE_IQ2_XS) { + printf("Skip %s due to missing quantization functionality\n", ggml_type_name(ei)); continue; } From 469e75d0a35b08de549a4fd87f082ca7a8a539ba Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Thu, 11 Jan 2024 20:43:15 +0100 Subject: [PATCH 019/131] llama : restore intended k-quants mixes for MoE models (#4872) * Restore intended k-quants quantization mixes for MoE models * Update Q2_K_S values in the quantize tool Still using LLaMA-v1 PPL values in the quant description today does not make much sense. But let's leave this update for another PR. --------- Co-authored-by: Iwan Kawrakow Co-authored-by: Georgi Gerganov --- examples/quantize/quantize.cpp | 1 + llama.cpp | 24 +++++++++++++++--------- llama.h | 1 + 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index d27ea5e9132fd..f878f6911420a 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -18,6 +18,7 @@ static const std::vector QUANT_OPTIONS = { { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", }, { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, + { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", }, diff --git a/llama.cpp b/llama.cpp index bd219d49c7ac7..d39ff94c7fae6 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2586,7 +2586,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; // K-quants - case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K"; + case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small"; case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small"; case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; @@ -8955,10 +8956,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty // TODO: explore better strategies new_type = GGML_TYPE_Q8_0; } - } else if (name.find("ffn_down.weight") != std::string::npos) { + } else if (name.find("ffn_down") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { + if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K + new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? 
GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } @@ -8967,14 +8971,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { if (arch == LLM_ARCH_FALCON) { - new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K : + new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K : use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else { if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; } } else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) { new_type = GGML_TYPE_Q5_K; } ++qs.i_feed_forward_w2; @@ -8992,9 +8996,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; } - else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; - } + // IK: let's remove this, else Q2_K is almost the same as Q3_K_S + //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) { + // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + //} // This can be used to reduce the size of the Q5_K_S model. // The associated PPL increase is fully in line with the size reduction //else { @@ -9043,6 +9048,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // K-quants case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break; + case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break; case LLAMA_FTYPE_MOSTLY_Q3_K_S: case LLAMA_FTYPE_MOSTLY_Q3_K_M: case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break; @@ -9101,7 +9107,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) { ++qs.n_attention_wv; } - else if (name.find("ffn_down.weight") != std::string::npos) { + else if (name.find("ffn_down") != std::string::npos) { ++qs.n_feed_forward_w2; } } diff --git a/llama.h b/llama.h index 6fde113ffd458..43d41b8f642b5 100644 --- a/llama.h +++ b/llama.h @@ -105,6 +105,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; From b0377875488b33f7114138687d828da1de61775d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jan 2024 21:58:28 +0200 Subject: [PATCH 020/131] swift : track ggml release branch (#4867) --- Package.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Package.swift b/Package.swift index 59191da45c233..37524edee8cd4 100644 --- a/Package.swift +++ b/Package.swift @@ -14,7 +14,7 @@ let package = Package( .library(name: "llama", targets: ["llama"]), ], dependencies: [ - .package(url: "https://github.com/ggerganov/ggml.git", 
.revision("979cc23b345006504cfc1f67c0fdf627805e3319")) + .package(url: "https://github.com/ggerganov/ggml.git", .branch("release")) ], targets: [ .target( From 3ca63b4538dfc78aaec88cd2c3e3f8417c1924e3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jan 2024 22:43:05 +0200 Subject: [PATCH 021/131] main : disable token count by default (#4874) --- common/common.cpp | 6 +++--- common/common.h | 2 +- examples/main/main.cpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index bfcd6d4dfe5d1..287e8bd5ad5fd 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -630,7 +630,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.ppl_stride = std::stoi(argv[i]); - } else if (arg == "-stc" || arg == "--show_token_count") { + } else if (arg == "-stc" || arg == "--show-token-count") { if (++i >= argc) { invalid_param = true; break; @@ -950,8 +950,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); - printf(" -stc N --show_token_count N\n"); - printf(" show consumed tokens every N tokens\n"); + printf(" -stc N --show-token-count N\n"); + printf(" show consumed tokens every N tokens (default: %d)\n", params.token_interval); printf("\n"); #ifndef LOG_DISABLE_LOGS log_print_usage(); diff --git a/common/common.h b/common/common.h index a295e88b05044..82d23cf545da4 100644 --- a/common/common.h +++ b/common/common.h @@ -64,7 +64,7 @@ struct gpt_params { int32_t n_beams = 0; // if non-zero then use beam search of given width. 
int32_t grp_attn_n = 1; // group-attention factor int32_t grp_attn_w = 512; // group-attention width - int32_t token_interval = 512; // show token count every 512 tokens + int32_t token_interval = -1; // show token count every 512 tokens (-1 = disabled) float rope_freq_base = 0.0f; // RoPE base frequency float rope_freq_scale = 0.0f; // RoPE frequency scaling factor float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 1f35febbd181a..6953d107c03ff 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -651,8 +651,8 @@ int main(int argc, char ** argv) { LOG("n_past = %d\n", n_past); // Display total tokens alongside total time - if (n_past % params.token_interval == 0) { - printf("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); + if (params.token_interval > 0 && n_past % params.token_interval == 0) { + LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); } } From 7edefbd79cc6dea96640edc54c6b94b2b2496d8b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jan 2024 22:46:26 +0200 Subject: [PATCH 022/131] main : better name for variable n_print (#4874) --- common/common.cpp | 8 ++++---- common/common.h | 2 +- examples/main/main.cpp | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 287e8bd5ad5fd..b2cb0e257a817 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -630,12 +630,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.ppl_stride = std::stoi(argv[i]); - } else if (arg == "-stc" || arg == "--show-token-count") { + } else if (arg == "-ptc" || arg == "--print-token-count") { if (++i >= argc) { invalid_param = true; break; } - params.token_interval = std::stoi(argv[i]); + params.n_print = std::stoi(argv[i]); } else if (arg == "--ppl-output-type") { if (++i >= argc) { invalid_param = true; @@ -950,8 +950,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); - printf(" -stc N --show-token-count N\n"); - printf(" show consumed tokens every N tokens (default: %d)\n", params.token_interval); + printf(" -stc N --print-token-count N\n"); + printf(" print token count every N tokens (default: %d)\n", params.n_print); printf("\n"); #ifndef LOG_DISABLE_LOGS log_print_usage(); diff --git a/common/common.h b/common/common.h index 82d23cf545da4..1359e76ab4648 100644 --- a/common/common.h +++ b/common/common.h @@ -64,7 +64,7 @@ struct gpt_params { int32_t n_beams = 0; // if non-zero then use beam search of given width. 
int32_t grp_attn_n = 1; // group-attention factor int32_t grp_attn_w = 512; // group-attention width - int32_t token_interval = -1; // show token count every 512 tokens (-1 = disabled) + int32_t n_print = -1; // print token count every n tokens (-1 = disabled) float rope_freq_base = 0.0f; // RoPE base frequency float rope_freq_scale = 0.0f; // RoPE frequency scaling factor float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 6953d107c03ff..c53b29978657c 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -651,7 +651,7 @@ int main(int argc, char ** argv) { LOG("n_past = %d\n", n_past); // Display total tokens alongside total time - if (params.token_interval > 0 && n_past % params.token_interval == 0) { + if (params.n_print > 0 && n_past % params.n_print == 0) { LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); } } From 1d118386fea031f01550f8cd47a5c86296e5333f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jan 2024 23:23:49 +0200 Subject: [PATCH 023/131] server : fix infill when prompt is empty (#4833) --- examples/server/server.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 031824e145411..1d30a15a6cc1e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1406,7 +1406,7 @@ struct llama_server_context task.multitask_id = multitask_id; // when a completion task's prompt array is not a singleton, we split it into multiple requests - if (task.data.at("prompt").size() > 1) + if (task.data.count("prompt") && task.data.at("prompt").size() > 1) { lock.unlock(); // entering new func scope return split_multiprompt_task(task); @@ -1577,9 +1577,9 @@ struct llama_server_context slot->reset(); - slot->infill = task.infill_mode; - slot->embedding = task.embedding_mode; - slot->task_id = task.id; + slot->infill = task.infill_mode; + slot->embedding = task.embedding_mode; + slot->task_id = task.id; slot->multitask_id = task.multitask_id; if (!launch_slot_with_data(slot, task.data)) @@ -1731,7 +1731,8 @@ struct llama_server_context const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get().empty()) || !slot.images.empty(); // empty prompt passed -> release the slot and send empty response - if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt) + // note: infill mode allows empty prompt + if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt && !slot.infill) { slot.release(); slot.print_timings(); @@ -2609,8 +2610,8 @@ static json format_final_response_oaicompat(const json &request, const task_resu {"object", streaming ? 
"chat.completion.chunk" : "chat.completion"}, {"usage", json{{"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, + {"prompt_tokens", num_prompt_tokens}, + {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, {"id", gen_chatcmplid()}}; if (server_verbose) { From 326b418b59b6d48d854c4461a2303e8ac0a311e6 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Fri, 12 Jan 2024 06:59:57 +0100 Subject: [PATCH 024/131] Importance Matrix calculation (#4861) * imatrix: 1st version * imatrix: WIP * Cleanup * Update examples/imatrix/imatrix.cpp Co-authored-by: Georgi Gerganov --------- Co-authored-by: Iwan Kawrakow Co-authored-by: Georgi Gerganov --- Makefile | 5 +- examples/CMakeLists.txt | 1 + examples/imatrix/CMakeLists.txt | 5 + examples/imatrix/imatrix.cpp | 380 ++++++++++++++++++++++++++++++++ ggml.c | 14 ++ ggml.h | 6 + 6 files changed, 410 insertions(+), 1 deletion(-) create mode 100644 examples/imatrix/CMakeLists.txt create mode 100644 examples/imatrix/imatrix.cpp diff --git a/Makefile b/Makefile index 4c7e175bf6cb3..05fe9a0f6a0d2 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ - main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ + main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \ speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o @@ -614,6 +614,9 @@ quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml. 
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 0c71cbdf72a65..fa127a3aa7c9e 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -36,6 +36,7 @@ else() add_subdirectory(lookahead) add_subdirectory(lookup) add_subdirectory(train-text-from-scratch) + add_subdirectory(imatrix) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt new file mode 100644 index 0000000000000..d688a16209049 --- /dev/null +++ b/examples/imatrix/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET imatrix) +add_executable(${TARGET} imatrix.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp new file mode 100644 index 0000000000000..1461bc96376a7 --- /dev/null +++ b/examples/imatrix/imatrix.cpp @@ -0,0 +1,380 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +struct Stats { + std::vector values; + int ncall = 0; +}; + +struct StatParams { + std::string ofile = "imatrix.dat"; + int n_output_frequency = 10; + int verbosity = 1; + bool collect_output_weight = false; +}; + +class IMatrixCollector { +public: + IMatrixCollector() = default; + void set_parameters(StatParams&& params) { m_params = std::move(params); } + void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1); + void save_imatrix() const; +private: + std::unordered_map m_stats; + StatParams m_params; + std::mutex m_mutex; + int m_last_call = 0; +}; + +void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) { + if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return; + if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return; + std::lock_guard lock(m_mutex); + auto& e = m_stats[src0->name]; + if (e.values.empty()) { + e.values.resize(src1->ne[0], 0); + } + else if (e.values.size() != (size_t)src1->ne[0]) { + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + exit(1); //GGML_ASSERT(false); + } + ++e.ncall; + if (m_params.verbosity > 1) { + printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type); + } + for (int row = 0; row < (int)src1->ne[1]; ++row) { + const float * x = (const float *)src1->data + row * src1->ne[0]; + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[j] += x[j]*x[j]; + } + } + if (e.ncall > m_last_call) { + m_last_call = e.ncall; + if (m_last_call % m_params.n_output_frequency == 0) { + save_imatrix(); + } + } +} + +void IMatrixCollector::save_imatrix() const { + const char * fname = m_params.ofile.empty() ? 
"imatrix.dat" : m_params.ofile.c_str(); + std::ofstream out(fname, std::ios::binary); + int n_entries = m_stats.size(); + out.write((const char*)&n_entries, sizeof(n_entries)); + for (auto& p : m_stats) { + int len = p.first.size(); + out.write((const char*)&len, sizeof(len)); + out.write(p.first.c_str(), len); + out.write((const char*)&p.second.ncall, sizeof(p.second.ncall)); + int nval = p.second.values.size(); + out.write((const char*)&nval, sizeof(nval)); + if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float)); + } + if (m_params.verbosity > 0) { + fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname); + } +} + +static IMatrixCollector g_collector; + +static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) { + g_collector.collect_imatrix(src0, src1); +} + + +struct results_log_softmax { + double log_softmax; + float logit; + float prob; +}; + +static std::vector softmax(const std::vector& logits) { + std::vector probs(logits.size()); + float max_logit = logits[0]; + for (float v : logits) { + max_logit = std::max(max_logit, v); + } + double sum_exp = 0.0; + for (size_t i = 0; i < logits.size(); i++) { + // Subtract the maximum logit value from the current logit value for numerical stability + const float logit = logits[i] - max_logit; + const float exp_logit = expf(logit); + sum_exp += exp_logit; + probs[i] = exp_logit; + } + for (size_t i = 0; i < probs.size(); i++) { + probs[i] /= sum_exp; + } + return probs; +} + +static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { + float max_logit = logits[0]; + for (int i = 1; i < n_vocab; ++i) { + max_logit = std::max(max_logit, logits[i]); + } + double sum_exp = 0.0; + for (int i = 0; i < n_vocab; ++i) { + sum_exp += expf(logits[i] - max_logit); + } + return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; +} + +static void process_logits( + int n_vocab, const float * logits, const int * tokens, int n_token, std::vector & workers, + double & nll, double & nll2, float * logit_history, float * prob_history +) { + std::mutex mutex; + int counter = 0; + auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { + double local_nll = 0; + double local_nll2 = 0; + while (true) { + std::unique_lock lock(mutex); + int i = counter++; + if (i >= n_token) { + nll += local_nll; nll2 += local_nll2; + break; + } + lock.unlock(); + const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); + const double v = -results.log_softmax; + local_nll += v; + local_nll2 += v*v; + + logit_history[i] = results.logit; + prob_history[i] = results.prob; + } + }; + for (auto & w : workers) { + w = std::thread(compute); + } + compute(); + for (auto & w : workers) { + w.join(); + } +} + +static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { + + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + const int n_ctx = llama_n_ctx(ctx); + + auto tim1 = std::chrono::high_resolution_clock::now(); + fprintf(stderr, "%s: tokenizing the input ..\n", __func__); + + std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + + auto tim2 = std::chrono::high_resolution_clock::now(); + fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + + if (int(tokens.size()) < 2*n_ctx) { + fprintf(stderr, "%s: 
you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx, + n_ctx); + fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + return false; + } + + std::vector logit_history; + logit_history.resize(tokens.size()); + + std::vector prob_history; + prob_history.resize(tokens.size()); + + const int n_chunk_max = tokens.size() / n_ctx; + + const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int n_batch = params.n_batch; + + int count = 0; + double nll = 0.0; + double nll2 = 0.0; + + fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch); + + std::vector workers(std::thread::hardware_concurrency() - 1); + + for (int i = 0; i < n_chunk; ++i) { + const int start = i * n_ctx; + const int end = start + n_ctx; + + const int num_batches = (n_ctx + n_batch - 1) / n_batch; + + std::vector logits; + + const auto t_start = std::chrono::high_resolution_clock::now(); + + // clear the KV cache + llama_kv_cache_clear(ctx); + + for (int j = 0; j < num_batches; ++j) { + const int batch_start = start + j * n_batch; + const int batch_size = std::min(end - batch_start, n_batch); + + // save original token and restore it after eval + const auto token_org = tokens[batch_start]; + + // add BOS token for the first batch of each chunk + if (add_bos && j == 0) { + tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); + } + + if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + + // restore the original token in case it was set to BOS + tokens[batch_start] = token_org; + + const auto * batch_logits = llama_get_logits(ctx); + logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + } + + const auto t_end = std::chrono::high_resolution_clock::now(); + + if (i == 0) { + const float t_total = std::chrono::duration(t_end - t_start).count(); + fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + int total_seconds = (int)(t_total * n_chunk); + if (total_seconds >= 60*60) { + fprintf(stderr, "%d hours ", total_seconds / (60*60)); + total_seconds = total_seconds % (60*60); + } + fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + } + + const int first = n_ctx/2; + process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, + workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); + count += n_ctx - first - 1; + + printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + fflush(stdout); + } + printf("\n"); + + nll2 /= count; + nll /= count; + const double ppl = exp(nll); + nll2 -= nll * nll; + if (nll2 > 0) { + nll2 = sqrt(nll2/(count-1)); + printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + } else { + printf("Unexpected negative standard deviation of log(prob)\n"); + } + + return true; +} + +int main(int argc, char ** argv) { + + StatParams sparams; + std::vector args; + args.push_back(argv[0]); + int iarg = 1; + for (; iarg < argc-1; ++iarg) { + std::string arg{argv[iarg]}; + if (arg == "-o" || arg == "--output-file") { + sparams.ofile = argv[++iarg]; + } + else if (arg == "-ofreq" || arg == "--output-frequency") { + sparams.n_output_frequency = std::stoi(argv[++iarg]); + } + else if (arg == "-ow" || arg == "--output-weight") { + 
sparams.collect_output_weight = std::stoi(argv[++iarg]); + } + else if (arg == "--verbosity") { + sparams.verbosity = std::stoi(argv[++iarg]); + } else { + args.push_back(argv[iarg]); + } + } + if (iarg < argc) { + args.push_back(argv[iarg]); + } + + gpt_params params; + params.n_batch = 512; + if (!gpt_params_parse(args.size(), args.data(), params)) { + return 1; + } + + g_collector.set_parameters(std::move(sparams)); + + ggml_set_imatrix_collection(ik_collect_imatrix); + + params.logits_all = true; + params.n_batch = std::min(params.n_batch, params.n_ctx); + + print_build_info(); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_backend_init(params.numa); + + llama_model * model; + llama_context * ctx; + + // load the model and apply lora adapter, if any + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == NULL) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return 1; + } + + const int n_ctx_train = llama_n_ctx_train(model); + if (params.n_ctx > n_ctx_train) { + fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, params.n_ctx); + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", get_system_info(params).c_str()); + } + + bool OK = compute_imatrix(ctx, params); + if (!OK) { + return 1; + } + + g_collector.save_imatrix(); + + llama_print_timings(ctx); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + return 0; +} diff --git a/ggml.c b/ggml.c index d2a8c0478ab72..f5caeba082ea9 100644 --- a/ggml.c +++ b/ggml.c @@ -394,6 +394,12 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); +ggml_collect_imatrix_t g_imatrix_collect = NULL; + +void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) { + g_imatrix_collect = imatrix_collect; +} + static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { .type_name = "i8", @@ -9763,6 +9769,10 @@ static void ggml_compute_forward_mul_mat( const int ith = params->ith; const int nth = params->nth; + if (ith == 1 && g_imatrix_collect) { + g_imatrix_collect(src0, src1); + } + const enum ggml_type type = src0->type; const bool src1_cont = ggml_is_contiguous(src1); @@ -10066,6 +10076,10 @@ static void ggml_compute_forward_mul_mat_id( const struct ggml_tensor * src0_cur = dst->src[cur_a + 2]; + if (ith == 1 && g_imatrix_collect) { + g_imatrix_collect(src0_cur, src1); + } + const void * wdata = (src1->type == vec_dot_type) ? 
src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
diff --git a/ggml.h b/ggml.h
index 93b42a27da53d..4c2ff6c661ea3 100644
--- a/ggml.h
+++ b/ggml.h
@@ -2067,6 +2067,12 @@ extern "C" {
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
+    //
+    // Importance matrix
+    //
+    typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
+    GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
+
     //
     // gguf
     //

From f445c0e68cf8e1faca0b2aa8dfb9d48231cec301 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 12 Jan 2024 13:01:56 +0200
Subject: [PATCH 025/131] llama : fix llm_build_k_shift to use correct n_rot (#4889)

* llama : fix llm_build_k_shift to use correct n_rot

ggml-ci

* llama : always use hparams.n_rot for ggml_rope_custom

ggml-ci

* convert : fix persimmon conversion to write correct n_rot
---
 common/common.cpp              |  3 ++
 convert-hf-to-gguf.py          |  9 ++++-
 gguf-py/gguf/tensor_mapping.py |  7 ++++
 llama.cpp                      | 65 +++++++++++++++++-----------------
 4 files changed, 51 insertions(+), 33 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index b2cb0e257a817..3aefed01d3049 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1055,6 +1055,9 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
 }
 
 static ggml_type kv_cache_type_from_str(const std::string & s) {
+    if (s == "f32") {
+        return GGML_TYPE_F32;
+    }
     if (s == "f16") {
         return GGML_TYPE_F16;
     }
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 203eaf64b3fc3..813aeeed680f8 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -817,10 +817,17 @@ def set_gguf_parameters(self):
         hidden_size = self.hparams["hidden_size"]
 
         self.gguf_writer.add_name('persimmon-8b-chat')
+        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hidden_size)
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
+
+        # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
+        # than the head size?
+        # ref: https://github.com/ggerganov/llama.cpp/pull/4889
+        #self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
+        self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
+
         self.gguf_writer.add_head_count(head_count)
         self.gguf_writer.add_head_count_kv(head_count_kv)
         self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 80c1d5449cc74..24a0890378496 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -57,6 +57,7 @@ class TensorNameMap:
             "transformer.norm_f", # mpt
             "ln_f", # refact bloom qwen gpt2
             "language_model.encoder.final_layernorm", # persimmon
+            "model.final_layernorm", # persimmon
             "lm_head.ln", # phi2
         ),
 
@@ -98,6 +99,7 @@ class TensorNameMap:
             "transformer.h.{bid}.self_attention.query_key_value", # falcon
             "h.{bid}.self_attention.query_key_value", # bloom
             "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
+            "model.layers.{bid}.self_attn.query_key_value", # persimmon
             "h.{bid}.attn.c_attn", # gpt2
             "transformer.h.{bid}.mixer.Wqkv", # phi2
         ),
@@ -141,6 +143,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.attention.output.dense", # bert
             "transformer.h.{bid}.attn.out_proj", # gpt-j
             "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
+            "model.layers.{bid}.self_attn.dense", # persimmon
             "h.{bid}.attn.c_proj", # gpt2
             "transformer.h.{bid}.mixer.out_proj", # phi2
             "model.layers.layers.{bid}.self_attn.o_proj", # plamo
@@ -184,6 +187,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.intermediate.dense", # bert
             "transformer.h.{bid}.mlp.fc_in", # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
+            "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon
             "transformer.h.{bid}.mlp.w1", # qwen
             "h.{bid}.mlp.c_fc", # gpt2
             "transformer.h.{bid}.mlp.fc1", # phi2
@@ -225,6 +229,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.output.dense", # bert
             "transformer.h.{bid}.mlp.fc_out", # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
+            "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon
             "h.{bid}.mlp.c_proj", # gpt2
             "transformer.h.{bid}.mlp.fc2", # phi2
             "model.layers.layers.{bid}.mlp.down_proj", # plamo
@@ -237,10 +242,12 @@ class TensorNameMap:
 
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
+            "model.layers.{bid}.self_attn.q_layernorm", # persimmon
         ),
 
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
+            "model.layers.{bid}.self_attn.k_layernorm", # persimmon
         ),
 
         MODEL_TENSOR.ROPE_FREQS: (
diff --git a/llama.cpp b/llama.cpp
index d39ff94c7fae6..0bab95563a226 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4104,7 +4104,6 @@ static void llm_build_k_shift(
         struct ggml_cgraph * graph,
         llm_rope_type type,
         int64_t n_ctx,
-        int n_rot,
         float freq_base,
         float freq_scale,
         const llm_build_cb & cb) {
@@ -4112,14 +4111,13 @@ static void llm_build_k_shift(
     const int64_t n_head_kv = hparams.n_head_kv;
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
     const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const int32_t n_rot = hparams.n_rot;
     const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
 
     const float ext_factor = cparams.yarn_ext_factor;
     const float attn_factor = cparams.yarn_attn_factor;
     const float beta_fast = cparams.yarn_beta_fast;
     const float beta_slow = cparams.yarn_beta_slow;
 
-    GGML_ASSERT(n_embd_head_k % n_rot == 0);
-
     struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32,
n_ctx); cb(K_shift, "K_shift", -1); @@ -4523,7 +4521,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4561,14 +4559,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -4691,6 +4689,7 @@ struct llm_build_context { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -4708,7 +4707,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4734,12 +4733,12 @@ struct llm_build_context { case MODEL_7B: Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); break; @@ -4812,6 +4811,7 @@ struct llm_build_context { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -4829,7 +4829,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4870,13 +4870,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5033,9 +5033,8 @@ struct llm_build_context { struct ggml_cgraph * gf = 
ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - const int64_t n_rot = n_embd_head_k / 2; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head/2 == hparams.n_rot); struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -5052,7 +5051,7 @@ struct llm_build_context { cb(KQ_mask, "KQ_mask", -1); if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5112,7 +5111,7 @@ struct llm_build_context { // RoPE the first n_rot of q/k, pass the other half, and concat. struct ggml_tensor * qrot = ggml_view_3d( - ctx0, tmpq, n_rot, n_head, n_tokens, + ctx0, tmpq, hparams.n_rot, n_head, n_tokens, ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head * n_head, 0 @@ -5120,7 +5119,7 @@ struct llm_build_context { cb(qrot, "qrot", il); struct ggml_tensor * krot = ggml_view_3d( - ctx0, tmpk, n_rot, n_head, n_tokens, + ctx0, tmpk, hparams.n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, 0 @@ -5129,29 +5128,29 @@ struct llm_build_context { // get the second half of tmpq, e.g tmpq[n_rot:, :, :] struct ggml_tensor * qpass = ggml_view_3d( - ctx0, tmpq, n_rot, n_head, n_tokens, + ctx0, tmpq, hparams.n_rot, n_head, n_tokens, ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head * n_head, - ggml_element_size(tmpq) * n_rot + ggml_element_size(tmpq) * hparams.n_rot ); cb(qpass, "qpass", il); struct ggml_tensor * kpass = ggml_view_3d( - ctx0, tmpk, n_rot, n_head, n_tokens, + ctx0, tmpk, hparams.n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, - ggml_element_size(tmpk) * n_rot + ggml_element_size(tmpk) * hparams.n_rot ); cb(kpass, "kpass", il); struct ggml_tensor * qrotated = ggml_rope_custom( - ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx, + ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(qrotated, "qrotated", il); struct ggml_tensor * krotated = ggml_rope_custom( - ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx, + ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(krotated, "krotated", il); @@ -5531,6 +5530,7 @@ struct llm_build_context { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -5548,7 +5548,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5661,7 +5661,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, 
cb); } for (int il = 0; il < n_layer; ++il) { @@ -5693,13 +5693,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5778,7 +5778,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5874,6 +5874,7 @@ struct llm_build_context { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -5891,7 +5892,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5917,13 +5918,13 @@ struct llm_build_context { cb(Vcur, "Vcur", il); Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos, n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos, n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); From 2d00741e12c5db4a33dfccd1125f5de4adec9a5b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 12 Jan 2024 13:03:38 +0200 Subject: [PATCH 026/131] py : fix lint (#4889) --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 813aeeed680f8..a1c79fd478c22 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -825,7 +825,7 @@ def set_gguf_parameters(self): # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller # than the head size? 
# ref: https://github.com/ggerganov/llama.cpp/pull/4889 - #self.gguf_writer.add_rope_dimension_count(hidden_size // head_count) + # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count) self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2) self.gguf_writer.add_head_count(head_count) From 4315a94366708828f949f9db89d2a8d99b634459 Mon Sep 17 00:00:00 2001 From: howlger Date: Fri, 12 Jan 2024 12:05:32 +0100 Subject: [PATCH 027/131] common : streamline the formatting of help (#4890) * common : streamline the formatting of help - Separate alternative parameters by a comma - Do not indent `--version` differently * Update common/common.cpp --------- Co-authored-by: Georgi Gerganov --- common/common.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 3aefed01d3049..062a8b4deb7d1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -818,7 +818,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf("\n"); printf("options:\n"); printf(" -h, --help show this help message and exit\n"); - printf(" --version show version and build info\n"); + printf(" --version show version and build info\n"); printf(" -i, --interactive run in interactive mode\n"); printf(" --interactive-first run in interactive mode and wait for input right away\n"); printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); @@ -915,7 +915,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" number of layers to store in VRAM\n"); printf(" -ngld N, --n-gpu-layers-draft N\n"); printf(" number of layers to store in VRAM for the draft model\n"); - printf(" -ts SPLIT --tensor-split SPLIT\n"); + printf(" -ts SPLIT, --tensor-split SPLIT\n"); printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); #ifdef GGML_USE_CUBLAS @@ -950,7 +950,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); printf(" types: int, float, bool. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); - printf(" -stc N --print-token-count N\n"); + printf(" -ptc N, --print-token-count N\n"); printf(" print token count every N tokens (default: %d)\n", params.n_print); printf("\n"); #ifndef LOG_DISABLE_LOGS From 3cabe80630c7eeb57713cd02249053a8cf6894fa Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 12 Jan 2024 13:10:19 +0200 Subject: [PATCH 028/131] llama : fix typo "imp_embd" -> "inp_embd" --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 0bab95563a226..29f8873f629e5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5040,7 +5040,7 @@ struct llm_build_context { struct ggml_tensor * inpL; inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); - cb(inpL, "imp_embd", -1); + cb(inpL, "inp_embd", -1); // inp_pos - contains the positions struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); From 1b280c9fffd682b6924010a4437f0275f2921fa9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 12 Jan 2024 12:30:41 +0100 Subject: [PATCH 029/131] CUDA: fix softmax compile for old CUDA versions (#4862) --- ggml-cuda.cu | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index dd19699f6669c..a345b0c4a70ac 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -116,6 +116,8 @@ #include "ggml.h" #include "ggml-backend-impl.h" +#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed) + #define CC_PASCAL 600 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products #define CC_VOLTA 700 @@ -605,16 +607,16 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { } static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { -#if __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) - (void) a; - bad_arch(); -#else +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32)); } return a; -#endif // __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#else + (void) a; + bad_arch(); +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL } static __device__ __forceinline__ float warp_reduce_max(float x) { @@ -626,16 +628,16 @@ static __device__ __forceinline__ float warp_reduce_max(float x) { } static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { -#if __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) - (void) x; - bad_arch(); -#else +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); } return x; -#endif // __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#else + (void) x; + bad_arch(); +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX } static __device__ __forceinline__ float op_repeat(const float a, const float b) { @@ -5613,7 +5615,7 @@ static __global__ void 
diag_mask_inf_f32(const float * x, float * dst, const int template static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) { -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template; const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2; @@ -5738,7 +5740,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds #else (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale; bad_arch(); -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX } template @@ -8574,15 +8576,15 @@ static void ggml_cuda_op_soft_max( float scale = 1.0f; memcpy(&scale, dst->op_params, sizeof(float)); -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - const bool use_f16_soft_max = false; -#else +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX #ifdef GGML_CUDA_F16 const bool use_f16_soft_max = true; #else const bool use_f16_soft_max = false; #endif // GGML_CUDA_F16 -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#else + const bool use_f16_soft_max = false; +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX if (use_f16_soft_max) { soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream); From 5537d9d36bfdb4379555431f574d3d78ce6e7955 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 12 Jan 2024 14:33:21 +0200 Subject: [PATCH 030/131] gitignore : imatrix --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index cf1b692e9c27c..fba207045344c 100644 --- a/.gitignore +++ b/.gitignore @@ -43,6 +43,7 @@ models-mnt /embedding /gguf /gguf-llama-simple +/imatrix /infill /libllama.so /llama-bench From e790eef21ce659f5c16d59f8a5c8dcf6cde0692a Mon Sep 17 00:00:00 2001 From: Zay <95888118+isaiahbjork@users.noreply.github.com> Date: Fri, 12 Jan 2024 05:48:00 -0700 Subject: [PATCH 031/131] llama.swiftui : update models layout (#4826) * Updated Models Layout - Added a models drawer - Added downloading directly from Hugging Face - Load custom models from local folder - Delete models by swiping left * trimmed trailing white space * Updated Models Layout --- .../llama.swiftui.xcodeproj/project.pbxproj | 8 +- .../llama.swiftui/Models/LlamaState.swift | 89 ++++++++ .../llama.swiftui/UI/ContentView.swift | 203 +++++++++--------- .../llama.swiftui/UI/DownloadButton.swift | 2 + .../llama.swiftui/UI/InputButton.swift | 131 +++++++++++ 5 files changed, 333 insertions(+), 100 deletions(-) create mode 100644 examples/llama.swiftui/llama.swiftui/UI/InputButton.swift diff --git a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj index a8848a49fce6d..3950b9e9df843 100644 --- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj +++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj @@ -8,6 +8,7 @@ /* Begin PBXBuildFile section */ 549479CB2AC9E16000E0F78B /* 
Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; }; + 79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; }; 7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; }; 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; }; 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; }; @@ -22,6 +23,7 @@ /* Begin PBXFileReference section */ 549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; }; + 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputButton.swift; sourceTree = ""; }; 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = ""; }; 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; }; 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = ""; }; @@ -119,6 +121,7 @@ 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */, 8A1C83782AC328BD0096AF73 /* ContentView.swift */, F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */, + 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */, ); path = UI; sourceTree = ""; @@ -213,6 +216,7 @@ 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */, 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */, 7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */, + 79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -345,7 +349,7 @@ CLANG_ENABLE_MODULES = YES; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = STLSG3FG8Q; + DEVELOPMENT_TEAM = K5UQJPP73A; ENABLE_PREVIEWS = YES; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; @@ -377,7 +381,7 @@ CLANG_ENABLE_MODULES = YES; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = STLSG3FG8Q; + DEVELOPMENT_TEAM = K5UQJPP73A; ENABLE_PREVIEWS = YES; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift index 17cb5b9dde942..5bde1891727ce 100644 --- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift +++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift @@ -1,9 +1,19 @@ import Foundation +struct Model: Identifiable { + var id = UUID() + var name: String + var url: String + var filename: String + var status: String? 
+} + @MainActor class LlamaState: ObservableObject { @Published var messageLog = "" @Published var cacheCleared = false + @Published var downloadedModels: [Model] = [] + @Published var undownloadedModels: [Model] = [] let NS_PER_S = 1_000_000_000.0 private var llamaContext: LlamaContext? @@ -13,23 +23,102 @@ class LlamaState: ObservableObject { } init() { + loadModelsFromDisk() + loadDefaultModels() + } + + private func loadModelsFromDisk() { + do { + let documentsURL = getDocumentsDirectory() + let modelURLs = try FileManager.default.contentsOfDirectory(at: documentsURL, includingPropertiesForKeys: nil, options: [.skipsHiddenFiles, .skipsSubdirectoryDescendants]) + for modelURL in modelURLs { + let modelName = modelURL.deletingPathExtension().lastPathComponent + downloadedModels.append(Model(name: modelName, url: "", filename: modelURL.lastPathComponent, status: "downloaded")) + } + } catch { + print("Error loading models from disk: \(error)") + } + } + + private func loadDefaultModels() { do { try loadModel(modelUrl: defaultModelUrl) } catch { messageLog += "Error!\n" } + + for model in defaultModels { + let fileURL = getDocumentsDirectory().appendingPathComponent(model.filename) + if FileManager.default.fileExists(atPath: fileURL.path) { + + } else { + var undownloadedModel = model + undownloadedModel.status = "download" + undownloadedModels.append(undownloadedModel) + } + } } + func getDocumentsDirectory() -> URL { + let paths = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask) + return paths[0] + } + private let defaultModels: [Model] = [ + Model(name: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",url: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf", status: "download"), + Model( + name: "TinyLlama-1.1B Chat (Q8_0, 1.1 GiB)", + url: "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q8_0.gguf?download=true", + filename: "tinyllama-1.1b-chat-v1.0.Q8_0.gguf", status: "download" + ), + + Model( + name: "TinyLlama-1.1B (F16, 2.2 GiB)", + url: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true", + filename: "tinyllama-1.1b-f16.gguf", status: "download" + ), + + Model( + name: "Phi-2.7B (Q4_0, 1.6 GiB)", + url: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true", + filename: "phi-2-q4_0.gguf", status: "download" + ), + + Model( + name: "Phi-2.7B (Q8_0, 2.8 GiB)", + url: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true", + filename: "phi-2-q8_0.gguf", status: "download" + ), + + Model( + name: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)", + url: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true", + filename: "mistral-7b-v0.1.Q4_0.gguf", status: "download" + ), + Model( + name: "OpenHermes-2.5-Mistral-7B (Q3_K_M, 3.52 GiB)", + url: "https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q3_K_M.gguf?download=true", + filename: "openhermes-2.5-mistral-7b.Q3_K_M.gguf", status: "download" + ) + ] func loadModel(modelUrl: URL?) 
throws { if let modelUrl { messageLog += "Loading model...\n" llamaContext = try LlamaContext.create_context(path: modelUrl.path()) messageLog += "Loaded model \(modelUrl.lastPathComponent)\n" + + // Assuming that the model is successfully loaded, update the downloaded models + updateDownloadedModels(modelName: modelUrl.lastPathComponent, status: "downloaded") } else { messageLog += "Load a model from the list below\n" } } + + private func updateDownloadedModels(modelName: String, status: String) { + undownloadedModels.removeAll { $0.name == modelName } + } + + func complete(text: String) async { guard let llamaContext else { return diff --git a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift index 7c81ea256ffd7..30c2dc4310210 100644 --- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift +++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift @@ -2,115 +2,57 @@ import SwiftUI struct ContentView: View { @StateObject var llamaState = LlamaState() - @State private var multiLineText = "" - - private static func cleanupModelCaches() { - // Delete all models (*.gguf) - let fileManager = FileManager.default - let documentsUrl = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0] - do { - let fileURLs = try fileManager.contentsOfDirectory(at: documentsUrl, includingPropertiesForKeys: nil) - for fileURL in fileURLs { - if fileURL.pathExtension == "gguf" { - try fileManager.removeItem(at: fileURL) - } - } - } catch { - print("Error while enumerating files \(documentsUrl.path): \(error.localizedDescription)") - } - } + @State private var showingHelp = false // To track if Help Sheet should be shown var body: some View { - VStack { - ScrollView(.vertical, showsIndicators: true) { - Text(llamaState.messageLog) - .font(.system(size: 12)) - .frame(maxWidth: .infinity, alignment: .leading) - .padding() - .onTapGesture { - UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil) + NavigationView { + VStack { + ScrollView(.vertical, showsIndicators: true) { + Text(llamaState.messageLog) + .font(.system(size: 12)) + .frame(maxWidth: .infinity, alignment: .leading) + .padding() + .onTapGesture { + UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil) + } } - } - TextEditor(text: $multiLineText) - .frame(height: 80) - .padding() - .border(Color.gray, width: 0.5) + TextEditor(text: $multiLineText) + .frame(height: 80) + .padding() + .border(Color.gray, width: 0.5) - HStack { - Button("Send") { - sendText() - } + HStack { + Button("Send") { + sendText() + } - Button("Bench") { - bench() - } + Button("Bench") { + bench() + } - Button("Clear") { - clear() - } + Button("Clear") { + clear() + } - Button("Copy") { - UIPasteboard.general.string = llamaState.messageLog + Button("Copy") { + UIPasteboard.general.string = llamaState.messageLog + } } - }.buttonStyle(.bordered) - - VStack(alignment: .leading) { - DownloadButton( - llamaState: llamaState, - modelName: "TinyLlama-1.1B (Q4_0, 0.6 GiB)", - modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true", - filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf" - ) - - DownloadButton( - llamaState: llamaState, - modelName: "TinyLlama-1.1B (Q8_0, 1.1 GiB)", - modelUrl: 
"https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true", - filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf" - ) - - DownloadButton( - llamaState: llamaState, - modelName: "TinyLlama-1.1B (F16, 2.2 GiB)", - modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true", - filename: "tinyllama-1.1b-f16.gguf" - ) - - DownloadButton( - llamaState: llamaState, - modelName: "Phi-2.7B (Q4_0, 1.6 GiB)", - modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true", - filename: "phi-2-q4_0.gguf" - ) - - DownloadButton( - llamaState: llamaState, - modelName: "Phi-2.7B (Q8_0, 2.8 GiB)", - modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true", - filename: "phi-2-q8_0.gguf" - ) - - DownloadButton( - llamaState: llamaState, - modelName: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)", - modelUrl: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true", - filename: "mistral-7b-v0.1.Q4_0.gguf" - ) - - Button("Clear downloaded models") { - ContentView.cleanupModelCaches() - llamaState.cacheCleared = true + .buttonStyle(.bordered) + .padding() + + NavigationLink(destination: DrawerView(llamaState: llamaState)) { + Text("View Models") } + .padding() - LoadCustomButton(llamaState: llamaState) } - .padding(.top, 4) - .font(.system(size: 12)) - .frame(maxWidth: .infinity, alignment: .leading) + .padding() + .navigationBarTitle("Model Settings", displayMode: .inline) + } - .padding() } func sendText() { @@ -131,8 +73,73 @@ struct ContentView: View { await llamaState.clear() } } + struct DrawerView: View { + + @ObservedObject var llamaState: LlamaState + @State private var showingHelp = false + func delete(at offsets: IndexSet) { + offsets.forEach { offset in + let model = llamaState.downloadedModels[offset] + let fileURL = getDocumentsDirectory().appendingPathComponent(model.filename) + do { + try FileManager.default.removeItem(at: fileURL) + } catch { + print("Error deleting file: \(error)") + } + } + + // Remove models from downloadedModels array + llamaState.downloadedModels.remove(atOffsets: offsets) + } + + func getDocumentsDirectory() -> URL { + let paths = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask) + return paths[0] + } + var body: some View { + List { + Section(header: Text("Download Models From Hugging Face")) { + HStack { + InputButton(llamaState: llamaState) + } + } + Section(header: Text("Downloaded Models")) { + ForEach(llamaState.downloadedModels) { model in + DownloadButton(llamaState: llamaState, modelName: model.name, modelUrl: model.url, filename: model.filename) + } + .onDelete(perform: delete) + } + Section(header: Text("Default Models")) { + ForEach(llamaState.undownloadedModels) { model in + DownloadButton(llamaState: llamaState, modelName: model.name, modelUrl: model.url, filename: model.filename) + } + } + + } + .listStyle(GroupedListStyle()) + .navigationBarTitle("Model Settings", displayMode: .inline).toolbar { + ToolbarItem(placement: .navigationBarTrailing) { + Button("Help") { + showingHelp = true + } + } + }.sheet(isPresented: $showingHelp) { // Sheet for help modal + VStack(alignment: .leading) { + VStack(alignment: .leading) { + Text("1. Make sure the model is in GGUF Format") + .padding() + Text("2. 
Copy the download link of the quantized model") + .padding() + } + Spacer() + } + } + } + } } -//#Preview { -// ContentView() -//} +struct ContentView_Previews: PreviewProvider { + static var previews: some View { + ContentView() + } +} diff --git a/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift b/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift index c9f322ca14e72..4584d6eaa3d32 100644 --- a/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift +++ b/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift @@ -53,6 +53,8 @@ struct DownloadButton: View { llamaState.cacheCleared = false + let model = Model(name: modelName, url: modelUrl, filename: filename, status: "downloaded") + llamaState.downloadedModels.append(model) status = "downloaded" } } catch let err { diff --git a/examples/llama.swiftui/llama.swiftui/UI/InputButton.swift b/examples/llama.swiftui/llama.swiftui/UI/InputButton.swift new file mode 100644 index 0000000000000..c5ffbad4ec331 --- /dev/null +++ b/examples/llama.swiftui/llama.swiftui/UI/InputButton.swift @@ -0,0 +1,131 @@ +import SwiftUI + +struct InputButton: View { + @ObservedObject var llamaState: LlamaState + @State private var inputLink: String = "" + @State private var status: String = "download" + @State private var filename: String = "" + + @State private var downloadTask: URLSessionDownloadTask? + @State private var progress = 0.0 + @State private var observation: NSKeyValueObservation? + + private static func extractModelInfo(from link: String) -> (modelName: String, filename: String)? { + guard let url = URL(string: link), + let lastPathComponent = url.lastPathComponent.components(separatedBy: ".").first, + let modelName = lastPathComponent.components(separatedBy: "-").dropLast().joined(separator: "-").removingPercentEncoding, + let filename = lastPathComponent.removingPercentEncoding else { + return nil + } + + return (modelName, filename) + } + + private static func getFileURL(filename: String) -> URL { + FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename) + } + + private func download() { + guard let extractedInfo = InputButton.extractModelInfo(from: inputLink) else { + // Handle invalid link or extraction failure + return + } + + let (modelName, filename) = extractedInfo + self.filename = filename // Set the state variable + + status = "downloading" + print("Downloading model \(modelName) from \(inputLink)") + guard let url = URL(string: inputLink) else { return } + let fileURL = InputButton.getFileURL(filename: filename) + + downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in + if let error = error { + print("Error: \(error.localizedDescription)") + return + } + + guard let response = response as? 
HTTPURLResponse, (200...299).contains(response.statusCode) else { + print("Server error!") + return + } + + do { + if let temporaryURL = temporaryURL { + try FileManager.default.copyItem(at: temporaryURL, to: fileURL) + print("Writing to \(filename) completed") + + llamaState.cacheCleared = false + + let model = Model(name: modelName, url: self.inputLink, filename: filename, status: "downloaded") + llamaState.downloadedModels.append(model) + status = "downloaded" + } + } catch let err { + print("Error: \(err.localizedDescription)") + } + } + + observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in + self.progress = progress.fractionCompleted + } + + downloadTask?.resume() + } + + var body: some View { + VStack { + HStack { + TextField("Paste Quantized Download Link", text: $inputLink) + .textFieldStyle(RoundedBorderTextFieldStyle()) + + Button(action: { + downloadTask?.cancel() + status = "download" + }) { + Text("Cancel") + } + } + + if status == "download" { + Button(action: download) { + Text("Download Custom Model") + } + } else if status == "downloading" { + Button(action: { + downloadTask?.cancel() + status = "download" + }) { + Text("Downloading \(Int(progress * 100))%") + } + } else if status == "downloaded" { + Button(action: { + let fileURL = InputButton.getFileURL(filename: self.filename) + if !FileManager.default.fileExists(atPath: fileURL.path) { + download() + return + } + do { + try llamaState.loadModel(modelUrl: fileURL) + } catch let err { + print("Error: \(err.localizedDescription)") + } + }) { + Text("Load Custom Model") + } + } else { + Text("Unknown status") + } + } + .onDisappear() { + downloadTask?.cancel() + } + .onChange(of: llamaState.cacheCleared) { newValue in + if newValue { + downloadTask?.cancel() + let fileURL = InputButton.getFileURL(filename: self.filename) + status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download" + } + } + } +} From 930f907d3ece1eb5b0a1ec5e209983a66dcbfa68 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 12 Jan 2024 18:54:53 +0100 Subject: [PATCH 032/131] export-lora : use LLAMA_FILE_MAGIC_GGLA (#4894) This commit replaces the magic number used in export-lora.cpp with the one defined in llama.h, which is indirectly included via common.h. 
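As a rough standalone sketch (not part of this patch), the same check can be written without the local constant; has_ggla_magic is a hypothetical helper, and 0x67676C61 is the value that llama.h exposes as LLAMA_FILE_MAGIC_GGLA (the bytes 'g','g','l','a'):

    #include <cstdint>
    #include <cstdio>

    // Hypothetical helper: returns true if the file at `path` begins with the
    // LoRA magic. Uses a plain 4-byte native-endian read, the same way the
    // loader's file.read_u32() reads it.
    static bool has_ggla_magic(const char * path) {
        std::FILE * f = std::fopen(path, "rb");
        if (f == nullptr) {
            return false;
        }
        std::uint32_t magic = 0;
        const std::size_t n = std::fread(&magic, sizeof(magic), 1, f);
        std::fclose(f);
        // prefer LLAMA_FILE_MAGIC_GGLA when llama.h is on the include path
        return n == 1 && magic == 0x67676C61u;
    }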
Signed-off-by: Daniel Bevenius --- examples/export-lora/export-lora.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 58fbe204d3bbb..4cd5d99bb21ec 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -245,9 +245,8 @@ static struct lora_data * load_lora(struct lora_info * info) { params_ggml.no_alloc = true; result->ctx = ggml_init(params_ggml); - uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla' uint32_t magic = file.read_u32(); - if (magic != LLAMA_FILE_MAGIC_LORA) { + if (magic != LLAMA_FILE_MAGIC_GGLA) { die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str()); } uint32_t version = file.read_u32(); From 584d674be622fbf1578694ada6e62eebedbfd377 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 12 Jan 2024 20:54:12 +0200 Subject: [PATCH 033/131] llama : remove redundant assert for StableLM (#4901) --- llama.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 29f8873f629e5..ce413f605163c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5530,7 +5530,6 @@ struct llm_build_context { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); struct ggml_tensor * cur; struct ggml_tensor * inpL; From e7e4df031b9e29d4b55a4e0b0295187f6b213db1 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 12 Jan 2024 20:07:38 +0100 Subject: [PATCH 034/131] llama : ggml-backend integration (#4766) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * llama : ggml-backend integration * ggml-backend : add names to buffers * fix unmap after loading * batched-bench : add tensor_split param * llama : check for null tensor_split * ggml-backend : increase GGML_MAX_BACKENDS * improve graph splitting, partial fix for --no-kv-offload * cuda : add ggml-backend split buffer support * cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available) * ggml : fix null backend dereference (#4807) * ggml : fix null backend dereference * ggml : also check ggml_backend_is_cpu * test-backend-ops : check buffer allocation failures * llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row) * ggml : fix mul_mat_id work size * llama : rewrite session kv load/set without graphs * minor * llama : only initialize used backends, free backends on context free * llama : abort ctx if cuda backend init fails * llama : rewrite lora with ggml-backend and compute on CPU ggml-ci * llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer * opencl : add ggml-backend buffer type * cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf) * llama : on Metal, by default offload the full model ggml-ci * metal : page align the data ptr (#4854) * Apply suggestions from code review Co-authored-by: Johannes Gäßler * cuda : fix split buffer free * address review comments * llama-bench : add split-mode parameter * fix whitespace * opencl : fix double initialization * server : add --split-mode parameter * use async copy and compute to improve multi-gpu performance ggml-ci * use async memcpys to copy the graph outputs to the CPU * fix opencl * use a host buffer for the cpu compute buffer for faster copies to the gpu --------- Co-authored-by: Georgi Gerganov 
Co-authored-by: Johannes Gäßler --- common/common.cpp | 65 +- common/common.h | 1 + examples/batched-bench/batched-bench.cpp | 3 + examples/llama-bench/llama-bench.cpp | 146 +- examples/server/server.cpp | 40 +- ggml-alloc.c | 34 +- ggml-alloc.h | 4 +- ggml-backend-impl.h | 38 +- ggml-backend.c | 685 ++++--- ggml-backend.h | 60 +- ggml-cuda.cu | 901 +++++---- ggml-cuda.h | 26 +- ggml-impl.h | 2 + ggml-metal.m | 55 +- ggml-opencl.cpp | 335 +++- ggml-opencl.h | 16 +- ggml.c | 30 +- ggml.h | 9 +- llama.cpp | 2314 +++++++++------------- llama.h | 18 +- tests/test-backend-ops.cpp | 26 +- 21 files changed, 2523 insertions(+), 2285 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 062a8b4deb7d1..322b9f91e5041 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -543,9 +543,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD params.n_gpu_layers = std::stoi(argv[i]); -#else +#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif @@ -554,9 +553,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD params.n_gpu_layers_draft = std::stoi(argv[i]); -#else +#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif @@ -565,25 +563,44 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifdef GGML_USE_CUBLAS params.main_gpu = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); -#endif +#ifndef GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n"); +#endif // GGML_USE_CUBLAS + } else if (arg == "--split-mode" || arg == "-sm") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string arg_next = argv[i]; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_NONE; + } else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_LAYER; + } else if (arg_next == "row") { + params.split_mode = LLAMA_SPLIT_ROW; + } else { + invalid_param = true; + break; + } +#ifndef GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUBLAS } else if (arg == "--tensor-split" || arg == "-ts") { if (++i >= argc) { invalid_param = true; break; } -#ifdef GGML_USE_CUBLAS std::string arg_next = argv[i]; // split string by , and / const std::regex regex{R"([,/]+)"}; std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - + if (split_arg.size() >= LLAMA_MAX_DEVICES) { + invalid_param = true; + break; + } for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { if (i < split_arg.size()) { params.tensor_split[i] = std::stof(split_arg[i]); @@ -591,14 +608,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.tensor_split[i] = 0.0f; } } -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. 
It is not possible to set a tensor split.\n"); -#endif // GGML_USE_CUBLAS - } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { -#ifdef GGML_USE_CUBLAS - params.mul_mat_q = false; -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n"); +#ifndef GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n"); #endif // GGML_USE_CUBLAS } else if (arg == "--no-mmap") { params.use_mmap = false; @@ -915,14 +926,15 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" number of layers to store in VRAM\n"); printf(" -ngld N, --n-gpu-layers-draft N\n"); printf(" number of layers to store in VRAM for the draft model\n"); + printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); + printf(" how to split the model across multiple GPUs, one of:\n"); + printf(" - none: use one GPU only\n"); + printf(" - layer (default): split layers and KV across GPUs\n"); + printf(" - row: split rows across GPUs\n"); printf(" -ts SPLIT, --tensor-split SPLIT\n"); - printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); -#ifdef GGML_USE_CUBLAS - printf(" -nommq, --no-mul-mat-q\n"); - printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n"); - printf(" Not recommended since this is both slower and uses more VRAM.\n"); -#endif // GGML_USE_CUBLAS + printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); + printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); #endif printf(" -gan N, --grp-attn-n N\n"); printf(" group-attention factor (default: %d)\n", params.grp_attn_n); @@ -1041,6 +1053,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & mparams.n_gpu_layers = params.n_gpu_layers; } mparams.main_gpu = params.main_gpu; + mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; diff --git a/common/common.h b/common/common.h index 1359e76ab4648..f29be5b5ab87f 100644 --- a/common/common.h +++ b/common/common.h @@ -59,6 +59,7 @@ struct gpt_params { float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs int32_t n_beams = 0; // if non-zero then use beam search of given width. 
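To make the new split options concrete, here is a minimal caller-side sketch (not part of this patch; load_with_row_split is a hypothetical example function) that sets the split mode and tensor split on llama_model_params before loading a model, assuming the llama.h declarations introduced above:

    #include "llama.h"

    #include <vector>

    static int load_with_row_split(const char * path) {
        llama_model_params mparams = llama_model_default_params();

        mparams.n_gpu_layers = 99;              // offload all layers
        mparams.split_mode   = LLAMA_SPLIT_ROW; // or LLAMA_SPLIT_LAYER (default) / LLAMA_SPLIT_NONE
        mparams.main_gpu     = 0;               // with row split: device for intermediate results and KV

        // per-device proportions, e.g. 3:1 across two GPUs
        std::vector<float> tensor_split(LLAMA_MAX_DEVICES, 0.0f);
        tensor_split[0] = 3.0f;
        if (LLAMA_MAX_DEVICES > 1) {
            tensor_split[1] = 1.0f;
        }
        mparams.tensor_split = tensor_split.data();

        llama_model * model = llama_load_model_from_file(path, mparams);
        if (model == nullptr) {
            return 1;
        }
        llama_free_model(model);
        return 0;
    }

The same configuration from the command line corresponds to the new flags `-sm row -ts 3,1 -mg 0`.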
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 57596ed986050..7924db267401c 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -88,7 +88,10 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); + const std::vector t_split (LLAMA_MAX_DEVICES, 0.0f); + model_params.n_gpu_layers = n_gpu_layers; + model_params.tensor_split = t_split.data(); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 7f7186cded527..97325b5bd634f 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -128,6 +128,25 @@ static std::string get_gpu_info() { // command line params enum output_formats {CSV, JSON, MARKDOWN, SQL}; +static const char * output_format_str(output_formats format) { + switch (format) { + case CSV: return "csv"; + case JSON: return "json"; + case MARKDOWN: return "md"; + case SQL: return "sql"; + default: GGML_ASSERT(!"invalid output format"); + } +} + +static const char * split_mode_str(llama_split_mode mode) { + switch (mode) { + case LLAMA_SPLIT_NONE: return "none"; + case LLAMA_SPLIT_LAYER: return "layer"; + case LLAMA_SPLIT_ROW: return "row"; + default: GGML_ASSERT(!"invalid split mode"); + } +} + struct cmd_params { std::vector model; std::vector n_prompt; @@ -137,6 +156,7 @@ struct cmd_params { std::vector type_v; std::vector n_threads; std::vector n_gpu_layers; + std::vector split_mode; std::vector main_gpu; std::vector no_kv_offload; std::vector mul_mat_q; @@ -155,6 +175,7 @@ static const cmd_params cmd_params_defaults = { /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, + /* split_mode */ {LLAMA_SPLIT_LAYER}, /* main_gpu */ {0}, /* no_kv_offload */ {false}, /* mul_mat_q */ {true}, @@ -169,21 +190,22 @@ static void print_usage(int /* argc */, char ** argv) { printf("\n"); printf("options:\n"); printf(" -h, --help\n"); - printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); - printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); - printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); - printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); - printf(" -ctk , --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); - printf(" -ctv , --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); - printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); - printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); - printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); - printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); - printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); - printf(" -ts, --tensor_split \n"); - printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); - printf(" -o, --output (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? 
"json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql"); - printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); + printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); + printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); + printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); + printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); + printf(" -ctk , --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); + printf(" -ctv , --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); + printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); + printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); + printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); + printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); + printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); + printf(" -ts, --tensor_split (default: 0)\n"); + printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); + printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); + printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf("\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); } @@ -306,6 +328,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = split(argv[i], split_delim); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); + } else if (arg == "-sm" || arg == "--split-mode") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + std::vector modes; + for (const auto & m : p) { + llama_split_mode mode; + if (m == "none") { + mode = LLAMA_SPLIT_NONE; + } else if (m == "layer") { + mode = LLAMA_SPLIT_LAYER; + } else if (m == "row") { + mode = LLAMA_SPLIT_ROW; + } else { + invalid_param = true; + break; + } + modes.push_back(mode); + } + params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end()); } else if (arg == "-mg" || arg == "--main-gpu") { if (++i >= argc) { invalid_param = true; @@ -392,6 +436,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } + if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; } @@ -410,6 +455,7 @@ struct cmd_params_instance { ggml_type type_v; int n_threads; int n_gpu_layers; + llama_split_mode split_mode; int main_gpu; bool 
no_kv_offload; bool mul_mat_q; @@ -419,6 +465,7 @@ struct cmd_params_instance { llama_model_params mparams = llama_model_default_params(); mparams.n_gpu_layers = n_gpu_layers; + mparams.split_mode = split_mode; mparams.main_gpu = main_gpu; mparams.tensor_split = tensor_split.data(); @@ -428,6 +475,7 @@ struct cmd_params_instance { bool equal_mparams(const cmd_params_instance & other) const { return model == other.model && n_gpu_layers == other.n_gpu_layers && + split_mode == other.split_mode && main_gpu == other.main_gpu && tensor_split == other.tensor_split; } @@ -446,45 +494,13 @@ struct cmd_params_instance { } }; -static std::vector get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) { - std::vector instances; - - for (const auto & m : params.model) - for (const auto & nl : params.n_gpu_layers) - for (const auto & mg : params.main_gpu) - for (const auto & ts : params.tensor_split) - for (const auto & nb : params.n_batch) - for (const auto & tk : params.type_k) - for (const auto & tv : params.type_v) - for (const auto & mmq : params.mul_mat_q) - for (const auto & nkvo : params.no_kv_offload) - for (const auto & nt : params.n_threads) { - cmd_params_instance instance = { - /* .model = */ m, - /* .n_prompt = */ n_prompt, - /* .n_gen = */ n_gen, - /* .n_batch = */ nb, - /* .type_k = */ tk, - /* .type_v = */ tv, - /* .n_threads = */ nt, - /* .n_gpu_layers = */ nl, - /* .main_gpu = */ mg, - /* .no_kv_offload= */ nkvo, - /* .mul_mat_q = */ mmq, - /* .tensor_split = */ ts, - }; - instances.push_back(instance); - } - return instances; -} - static std::vector get_cmd_params_instances(const cmd_params & params) { std::vector instances; -#if 1 // this ordering minimizes the number of times that each model needs to be reloaded for (const auto & m : params.model) for (const auto & nl : params.n_gpu_layers) + for (const auto & sm : params.split_mode) for (const auto & mg : params.main_gpu) for (const auto & ts : params.tensor_split) for (const auto & nb : params.n_batch) @@ -506,6 +522,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .type_v = */ tv, /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, + /* .split_mode = */ sm, /* .main_gpu = */ mg, /* .no_kv_offload= */ nkvo, /* .mul_mat_q = */ mmq, @@ -527,6 +544,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .type_v = */ tv, /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, + /* .split_mode = */ sm, /* .main_gpu = */ mg, /* .no_kv_offload= */ nkvo, /* .mul_mat_q = */ mmq, @@ -535,24 +553,6 @@ static std::vector get_cmd_params_instances(const cmd_param instances.push_back(instance); } } -#else - // this ordering separates the prompt and generation tests - for (const auto & n_prompt : params.n_prompt) { - if (n_prompt == 0) { - continue; - } - auto instances_prompt = get_cmd_params_instances_int(params, 0, n_prompt); - instances.insert(instances.end(), instances_prompt.begin(), instances_prompt.end()); - } - - for (const auto & n_gen : params.n_gen) { - if (n_gen == 0) { - continue; - } - auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0); - instances.insert(instances.end(), instances_gen.begin(), instances_gen.end()); - } -#endif return instances; } @@ -576,6 +576,7 @@ struct test { ggml_type type_k; ggml_type type_v; int n_gpu_layers; + llama_split_mode split_mode; int main_gpu; bool no_kv_offload; bool mul_mat_q; @@ -597,6 +598,7 @@ struct test { type_k = inst.type_k; type_v = inst.type_v; n_gpu_layers = inst.n_gpu_layers; + split_mode = inst.split_mode; main_gpu = 
inst.main_gpu; no_kv_offload = inst.no_kv_offload; mul_mat_q = inst.mul_mat_q; @@ -660,7 +662,8 @@ struct test { "cpu_info", "gpu_info", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads", "type_k", "type_v", - "n_gpu_layers", "main_gpu", "no_kv_offload", + "n_gpu_layers", "split_mode", + "main_gpu", "no_kv_offload", "mul_mat_q", "tensor_split", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", @@ -711,7 +714,8 @@ struct test { cpu_info, gpu_info, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), - std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(no_kv_offload), + std::to_string(n_gpu_layers), split_mode_str(split_mode), + std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(mul_mat_q), tensor_split_str, std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(avg_ns()), std::to_string(stdev_ns()), @@ -867,6 +871,9 @@ struct markdown_printer : public printer { if (field == "n_gpu_layers") { return "ngl"; } + if (field == "split_mode") { + return "sm"; + } if (field == "n_threads") { return "threads"; } @@ -907,6 +914,9 @@ struct markdown_printer : public printer { if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { fields.push_back("main_gpu"); } + if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) { + fields.push_back("split_mode"); + } if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) { fields.push_back("mul_mat_q"); } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1d30a15a6cc1e..c1ab8f9dc477c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2005,12 +2005,15 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD printf(" -ngl N, --n-gpu-layers N\n"); printf(" number of layers to store in VRAM\n"); + printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); + printf(" how to split the model across multiple GPUs, one of:\n"); + printf(" - none: use one GPU only\n"); + printf(" - layer (default): split layers and KV across GPUs\n"); + printf(" - row: split rows across GPUs\n"); printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); - printf(" -nommq, --no-mul-mat-q\n"); - printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n"); - printf(" Not recommended since this is both slower and uses more VRAM.\n"); + printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 
3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); + printf(" or for intermediate results and KV (with split-mode = row)\n"); #endif printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); @@ -2253,6 +2256,33 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, "See main README.md for information on enabling GPU BLAS support", {{"n_gpu_layers", params.n_gpu_layers}}); #endif + } + else if (arg == "--split-mode" || arg == "-sm") + { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string arg_next = argv[i]; + if (arg_next == "none") + { + params.split_mode = LLAMA_SPLIT_NONE; + } + else if (arg_next == "layer") + { + params.split_mode = LLAMA_SPLIT_LAYER; + } + else if (arg_next == "row") + { + params.split_mode = LLAMA_SPLIT_ROW; + } + else { + invalid_param = true; + break; + } +#ifndef GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUBLAS } else if (arg == "--tensor-split" || arg == "-ts") { diff --git a/ggml-alloc.c b/ggml-alloc.c index a27dd54b0eb06..89b85d34870d7 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -102,8 +102,6 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { } } - AT_PRINTF("block %d\n", best_fit_block); - if (best_fit_block == -1) { // the last block is our last resort struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1]; @@ -117,6 +115,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { return; } } + struct free_block * block = &alloc->free_blocks[best_fit_block]; void * addr = block->addr; block->addr = (char*)block->addr + size; @@ -129,6 +128,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { } } + AT_PRINTF("block %d, addr %p\n", best_fit_block, addr); + tensor->data = addr; tensor->buffer = alloc->buffer; if (!alloc->measure) { @@ -229,6 +230,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) { alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows } else { alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset; + ggml_backend_buffer_reset(alloc->buffer); } } @@ -263,9 +265,9 @@ ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) { return alloc; } -ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) { +ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) { // create a backend buffer to get the correct tensor allocation sizes - ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1); + ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1); // TODO: move alloc initialization to a common ggml_tallocr_new_impl function ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer); @@ -275,13 +277,22 @@ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backe return alloc; } -ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) { - ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size); +ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) { + return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend)); +} + +ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * 
buft, size_t size) { + // create a backend buffer to get the correct tensor allocation sizes + ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size); ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer); alloc->buffer_owned = true; return alloc; } +ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) { + return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size); +} + ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) { ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr)); @@ -779,10 +790,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte if (nbytes == 0) { // all the tensors in the context are already allocated +#ifndef NDEBUG + fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__); +#endif return NULL; } ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes); + if (buffer == NULL) { + // failed to allocate buffer +#ifndef NDEBUG + fprintf(stderr, "%s: failed to allocate buffer\n", __func__); +#endif + return NULL; + } + ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer); for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { diff --git a/ggml-alloc.h b/ggml-alloc.h index 64a412468915b..4e59975213406 100644 --- a/ggml-alloc.h +++ b/ggml-alloc.h @@ -52,8 +52,10 @@ typedef struct ggml_tallocr * ggml_tallocr_t; GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment); GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment); -GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer); +GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size); GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer +GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer); +GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft); GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend); GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc); diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index ca21b474372a6..1db32901fe6c7 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -16,9 +16,10 @@ extern "C" { typedef void * ggml_backend_buffer_type_context_t; struct ggml_backend_buffer_type_i { + const char * (*get_name) (ggml_backend_buffer_type_t buft); ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment - size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding + size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend // check if tensor data is in host memory // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) @@ -34,16 +35,15 @@ extern "C" { typedef void * ggml_backend_buffer_context_t; struct ggml_backend_buffer_i { - void 
(*free_buffer) (ggml_backend_buffer_t buffer); - //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras - void * (*get_base) (ggml_backend_buffer_t buffer); - void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - // (optional) copy tensor between different buffer-type, allow for single-copy tranfers - void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); + const char * (*get_name) (ggml_backend_buffer_t buffer); + void (*free_buffer)(ggml_backend_buffer_t buffer); + void * (*get_base) (ggml_backend_buffer_t buffer); + void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer + void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); + void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras }; struct ggml_backend_buffer { @@ -51,6 +51,7 @@ extern "C" { ggml_backend_buffer_type_t buft; ggml_backend_buffer_context_t context; size_t size; + enum ggml_backend_buffer_usage usage; }; ggml_backend_buffer_t ggml_backend_buffer_init( @@ -59,6 +60,8 @@ extern "C" { ggml_backend_buffer_context_t context, size_t size); + // do not use directly, use ggml_backend_tensor_copy instead + bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); // // Backend @@ -74,22 +77,20 @@ extern "C" { // buffer allocation ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend); - // (optional) asynchroneous tensor data access + // (optional) asynchronous tensor data access void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + bool (*cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst); - // (optional) asynchroneous tensor copy - void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); - + // (optional) complete all pending operations void (*synchronize)(ggml_backend_t backend); // compute graph with a plan - ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*graph_plan_free) (ggml_backend_t backend, 
ggml_backend_graph_plan_t plan); void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - // compute graph without a plan + // compute graph without a plan (async) bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); // check if the backend supports an operation @@ -102,7 +103,6 @@ extern "C" { ggml_backend_context_t context; }; - // // Backend registry // diff --git a/ggml-backend.c b/ggml-backend.c index 53e741cb892f8..4c2d8b0b26f18 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -15,6 +15,10 @@ // backend buffer type +const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name(buft); +} + ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { return buft->iface.alloc_buffer(buft, size); } @@ -58,11 +62,16 @@ ggml_backend_buffer_t ggml_backend_buffer_init( /* .buft = */ buft, /* .context = */ context, /* .size = */ size, + /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY }; return buffer; } +const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name(buffer); +} + void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { if (buffer == NULL) { return; @@ -94,11 +103,11 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t } size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) { - return ggml_backend_buft_get_alignment(ggml_backend_buffer_type(buffer)); + return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer)); } size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { - return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor); + return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); } void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { @@ -106,13 +115,31 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { } bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) { - return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer)); + return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer)); } -ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) { +void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { + buffer->usage = usage; +} + +ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) { return buffer->buft; } +void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) { + if (buffer->iface.reset) { + buffer->iface.reset(buffer); + } +} + +bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_buffer_t dst_buf = dst->view_src ? 
dst->view_src->buffer : dst->buffer; + if (dst_buf->iface.cpy_tensor) { + return src->buffer->iface.cpy_tensor(dst_buf, src, dst); + } + return false; +} + // backend const char * ggml_backend_name(ggml_backend_t backend) { @@ -146,30 +173,42 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - backend->iface.set_tensor_async(backend, tensor, data, offset, size); + if (backend->iface.set_tensor_async == NULL) { + ggml_backend_tensor_set(tensor, data, offset, size); + } else { + backend->iface.set_tensor_async(backend, tensor, data, offset, size); + } } void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - backend->iface.get_tensor_async(backend, tensor, data, offset, size); + if (backend->iface.get_tensor_async == NULL) { + ggml_backend_tensor_get(tensor, data, offset, size); + } else { + backend->iface.get_tensor_async(backend, tensor, data, offset, size); + } } void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); + GGML_ASSERT(buf != NULL && "tensor buffer not set"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size); + tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size); } void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size); + tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size); } void ggml_backend_synchronize(ggml_backend_t backend) { @@ -190,19 +229,10 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { backend->iface.graph_plan_compute(backend, plan); - - // TODO: optional sync - ggml_backend_synchronize(backend); } bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - if (!backend->iface.graph_compute(backend, cgraph)) { - return false; - } - - // TODO: optional sync - ggml_backend_synchronize(backend); - return true; + return backend->iface.graph_compute(backend, cgraph); } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -227,28 +257,20 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml } void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) { - //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]); - //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]); GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); - // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src)); - if (src == dst) { return; } - // TODO: allow backends to support copy to/from same backend - - if (dst->buffer->iface.cpy_tensor_from != NULL) { - dst->buffer->iface.cpy_tensor_from(dst->buffer, src, dst); - } else if (src->buffer->iface.cpy_tensor_to != NULL) { - src->buffer->iface.cpy_tensor_to(src->buffer, src, dst); - } else { - // shouldn't be hit when copying from/to CPU - #ifndef NDEBUG - fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to " - "are implemented for %s and %s, falling back to get/set\n", src->name, dst->name); - #endif + if (ggml_backend_buffer_is_host(src->buffer)) { + ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); + } else if (ggml_backend_buffer_is_host(dst->buffer)) { + ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); + } else if (!ggml_backend_buffer_copy_tensor(src, dst)) { +#ifndef NDEBUG + fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)); +#endif size_t nbytes = ggml_nbytes(src); void * data = malloc(nbytes); ggml_backend_tensor_get(src, data, 0, nbytes); @@ -257,6 +279,31 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst } } +void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); + + if (src == dst) { + return; + } + + if 
(ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) { + if (backend->iface.cpy_tensor_async != NULL) { + if (backend->iface.cpy_tensor_async(backend, src, dst)) { + return; + } + } + } + + size_t nbytes = ggml_nbytes(src); + if (ggml_backend_buffer_is_host(src->buffer)) { + ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes); + } + else { + ggml_backend_tensor_copy(src, dst); + } +} + + // backend registry #define GGML_MAX_BACKENDS_REG 16 @@ -392,6 +439,12 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) { // backend CPU +static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) { + return "CPU"; + + GGML_UNUSED(buffer); +} + static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { return (void *)buffer->context; } @@ -412,14 +465,12 @@ static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, con GGML_UNUSED(buffer); } -static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); - - GGML_UNUSED(buffer); -} - -static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); +static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + return false; GGML_UNUSED(buffer); } @@ -429,30 +480,38 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t } static struct ggml_backend_buffer_i cpu_backend_buffer_i = { + /* .get_name = */ ggml_backend_cpu_buffer_name, /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, - /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, - /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, + /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor, /* .clear = */ ggml_backend_cpu_buffer_clear, + /* .reset = */ NULL, }; // for buffers from ptr, free is not called static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { + /* .get_name = */ ggml_backend_cpu_buffer_name, /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, - /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, - /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, + /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor, /* .clear = */ ggml_backend_cpu_buffer_clear, + /* .reset = */ NULL, }; static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 +static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU"; + + GGML_UNUSED(buft); +} + static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t 
buft, size_t size) { size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC? @@ -483,6 +542,7 @@ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -501,6 +561,18 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { #include +static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU_HBM"; + + GGML_UNUSED(buft); +} + +static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) { + return "CPU_HBM"; + + GGML_UNUSED(buf); +} + static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { hbw_free(buffer->context); } @@ -514,17 +586,18 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_ return NULL; } - // FIXME: this is a hack to avoid having to implement a new buffer type ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; + buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name; buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer; return buffer; } -ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() { +ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = { /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -568,7 +641,7 @@ struct ggml_backend_plan_cpu { struct ggml_cgraph cgraph; }; -static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); @@ -634,8 +707,7 @@ static struct ggml_backend_i cpu_backend_i = { /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type, /* .set_tensor_async = */ NULL, /* .get_tensor_async = */ NULL, - /* .cpy_tensor_from_async = */ NULL, - /* .cpy_tensor_to_async = */ NULL, + /* .cpy_tensor_async = */ NULL, /* .synchronize = */ NULL, /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create, /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free, @@ -661,7 +733,7 @@ ggml_backend_t ggml_backend_cpu_init(void) { } bool ggml_backend_is_cpu(ggml_backend_t backend) { - return backend->iface.get_name == ggml_backend_cpu_name; + return backend && backend->iface.get_name == ggml_backend_cpu_name; } void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { @@ -685,7 +757,7 @@ static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user // scheduler -#define GGML_MAX_BACKENDS 4 +#define 
GGML_MAX_BACKENDS 16 #define GGML_MAX_SPLITS 256 #define GGML_MAX_SPLIT_INPUTS 16 @@ -695,21 +767,29 @@ struct ggml_backend_sched_split { int i_end; struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS]; int n_inputs; + // graph view of this split struct ggml_cgraph graph; }; struct ggml_backend_sched { + bool is_reset; // true if the scheduler has been reset since the last graph split + int n_backends; ggml_backend_t backends[GGML_MAX_BACKENDS]; + ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS]; ggml_tallocr_t tallocs[GGML_MAX_BACKENDS]; ggml_gallocr_t galloc; + // hash keys of the nodes in the graph struct ggml_hash_set hash_set; - ggml_tallocr_t * node_talloc; // [hash_set.size] - struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS] + // hash values (arrays of [hash_set.size]) + ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend) + struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend + // copy of the graph with modified inputs struct ggml_cgraph * graph; + struct ggml_backend_sched_split splits[GGML_MAX_SPLITS]; int n_splits; @@ -750,14 +830,22 @@ static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) return INT_MAX; } -static ggml_backend_t get_buffer_backend(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) { +static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) { if (buffer == NULL) { return NULL; } + + // check if this is already allocate in a allocr buffer (from user manual allocations) + for (int i = 0; i < sched->n_backends; i++) { + if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) { + return sched->tallocs[i]; + } + } + // find highest prio backend that supports the buffer type for (int i = 0; i < sched->n_backends; i++) { if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) { - return sched->backends[i]; + return sched->tallocs[i]; } } GGML_ASSERT(false && "tensor buffer type not supported by any backend"); @@ -767,7 +855,6 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc if (allocr == NULL) { return NULL; } - // find highest prio backend that supports the buffer type for (int i = 0; i < sched->n_backends; i++) { if (sched->tallocs[i] == allocr) { return sched->backends[i]; @@ -777,7 +864,7 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc } #if 0 -static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove +static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__) #define GET_CAUSE(node) causes[hash_id(node)] #else @@ -786,45 +873,37 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_IN #endif // returns the backend that should be used for the node based on the current locations -static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) { - // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there - // ie. kv cache updates - // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend. 
+static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) { + // assign pre-allocated nodes to their backend // dst - ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer); - if (cur_backend != NULL) { + ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer); + if (cur_allocr != NULL) { SET_CAUSE(node, "1.dst"); - return cur_backend; + return cur_allocr; } - // view_src - if (node->view_src != NULL && get_buffer_backend(sched, node->view_src->buffer) != NULL) { - SET_CAUSE(node, "1.vsrc"); - return get_buffer_backend(sched, node->view_src->buffer); + if (node->view_src != NULL) { + cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer); + if (cur_allocr != NULL) { + SET_CAUSE(node, "1.vsrc"); + return cur_allocr; + } } - - // src - int cur_prio = INT_MAX; - size_t cur_size = 0; - + // assign nodes that use weights to the backend of the weights for (int i = 0; i < GGML_MAX_SRC; i++) { const struct ggml_tensor * src = node->src[i]; if (src == NULL) { break; } - ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer); - if (src_backend != NULL) { - int src_prio = sched_backend_prio(sched, src_backend); - size_t src_size = ggml_nbytes(src); - if (src_prio < cur_prio && src_size >= cur_size) { - cur_prio = src_prio; - cur_size = src_size; - cur_backend = src_backend; - SET_CAUSE(node, "1.src%d", i); - } + if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer); + // operations with weights are always run on the same backend as the weights + SET_CAUSE(node, "1.wgt%d", i); + return src_allocr; } } - return cur_backend; + + return NULL; } static char * fmt_size(size_t size) { @@ -857,7 +936,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra } ggml_tallocr_t node_allocr = node_allocr(node); ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME: - fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name, + fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node)); for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; @@ -866,7 +945,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra } ggml_tallocr_t src_allocr = node_allocr(src); ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL; - fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name, + fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name, fmt_size(ggml_nbytes(src)), src_backend ? 
ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); } fprintf(stderr, "\n"); @@ -882,15 +961,17 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co return dup; } + +//#define DEBUG_PASS1 +//#define DEBUG_PASS2 +//#define DEBUG_PASS3 +//#define DEBUG_PASS4 + // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend -// TODO: merge passes static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - // reset state - size_t hash_size = sched->hash_set.size; - memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); - memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); - memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); + // reset splits sched->n_splits = 0; + sched->is_reset = false; struct ggml_init_params params = { /* .mem_size = */ sizeof(sched->context_buffer), @@ -898,26 +979,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g /* .no_alloc = */ true }; - if (sched->ctx != NULL) { - ggml_free(sched->ctx); - } + ggml_free(sched->ctx); sched->ctx = ggml_init(params); + if (sched->ctx == NULL) { + fprintf(stderr, "%s: failed to initialize context\n", __func__); + GGML_ASSERT(false); + } - // pass 1: assign backends to ops with allocated inputs + // pass 1: assign backends to ops with pre-allocated inputs for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; if (node_allocr(leaf) != NULL) { // do not overwrite user assignments continue; } - ggml_backend_t leaf_backend = get_buffer_backend(sched, leaf->buffer); - if (leaf_backend == NULL && leaf->view_src != NULL) { - leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer); - } - if (leaf_backend != NULL) { - node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend); - } + node_allocr(leaf) = sched_allocr_from_cur(sched, leaf); } for (int i = 0; i < graph->n_nodes; i++) { @@ -926,50 +1003,102 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // do not overwrite user assignments continue; } - ggml_backend_t node_backend = sched_backend_from_cur(sched, node); - if (node_backend != NULL) { - node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend); + node_allocr(node) = sched_allocr_from_cur(sched, node); + // src + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + if (node_allocr(src) == NULL) { + node_allocr(src) = sched_allocr_from_cur(sched, src); + } } } - //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#ifdef DEBUG_PASS1 + fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif - // pass 2: assign backends to ops from current assignments - // TODO: - // - reuse sched_backend_from_cur - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr == NULL) { - int cur_prio = INT_MAX; - size_t cur_size = 0; - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - break; + // pass 2: expand current backend assignments + // assign the same backend to adjacent nodes + // expand gpu backends (i.e. 
non last prio) up and down, ignoring cpu (the lowest priority backend) + // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops + + // pass 2.1 expand gpu up + { + ggml_tallocr_t cur_allocr = NULL; + for (int i = graph->n_nodes - 1; i >= 0; i--) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + ggml_tallocr_t node_allocr = node_allocr(node); + if (node_allocr != NULL) { + if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { + // skip cpu (lowest prio backend) + cur_allocr = NULL; + } else { + cur_allocr = node_allocr; } - ggml_tallocr_t src_allocr = node_allocr(src); - if (src_allocr != NULL) { - int src_prio = sched_allocr_prio(sched, src_allocr); - size_t src_size = ggml_nbytes(src); - if (src_prio < cur_prio && src_size >= cur_size) { - cur_prio = src_prio; - cur_size = src_size; - node_allocr = src_allocr; - SET_CAUSE(node, "2.src%d", j); - } + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.1"); + } + } + } + + // pass 2.2 expand gpu down + { + ggml_tallocr_t cur_allocr = NULL; + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + ggml_tallocr_t node_allocr = node_allocr(node); + if (node_allocr != NULL) { + if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { + // skip cpu (lowest prio backend) + cur_allocr = NULL; + } else { + cur_allocr = node_allocr; } + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.2"); } + } + } + + // pass 2.3 expand rest up + { + ggml_tallocr_t cur_allocr = NULL; + for (int i = graph->n_nodes - 1; i >= 0; i--) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + ggml_tallocr_t node_allocr = node_allocr(node); if (node_allocr != NULL) { - node_allocr(node) = node_allocr; + cur_allocr = node_allocr; + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.3"); } } } - //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#ifdef DEBUG_PASS2 + fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif - // pass 3: assign backends to remaining src from dst (should only be leafs) + // pass 3: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - ggml_tallocr_t node_allocr = node_allocr(node); + ggml_tallocr_t cur_allocr = node_allocr(node); + if (node->view_src != NULL && cur_allocr == NULL) { + cur_allocr = node_allocr(node) = node_allocr(node->view_src); + SET_CAUSE(node, "3.vsrc"); + } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { @@ -977,81 +1106,105 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g } ggml_tallocr_t src_allocr = node_allocr(src); if (src_allocr == NULL) { - node_allocr(src) = node_allocr; + if (src->view_src != NULL) { + // views are always on the same backend as the source + node_allocr(src) = node_allocr(src->view_src); + SET_CAUSE(src, "3.vsrc"); + } else { + node_allocr(src) = cur_allocr; + SET_CAUSE(src, "3.cur"); + } } } } - //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#ifdef DEBUG_PASS3 + fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif // pass 4: split graph, find tensors that need to be copied - // TODO: - // - when switching from a less 
preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost - // find first backend - int cur_split = 0; - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - if (node->view_src == NULL) { - sched->splits[0].tallocr = node_allocr(node); - break; + { + int cur_split = 0; + // find the backend of the first split, skipping view ops + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + if (!ggml_is_view_op(node->op)) { + sched->splits[0].tallocr = node_allocr(node); + break; + } } - } - sched->splits[0].i_start = 0; - sched->splits[0].n_inputs = 0; - memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK - ggml_tallocr_t cur_allocr = sched->splits[0].tallocr; - size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr); - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; + sched->splits[0].i_start = 0; + sched->splits[0].n_inputs = 0; + memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK + ggml_tallocr_t cur_allocr = sched->splits[0].tallocr; + size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr); + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + + if (ggml_is_view_op(node->op)) { + continue; + } - if (ggml_is_view_op(node->op)) { - continue; - } + ggml_tallocr_t node_allocr = node_allocr(node); + + if (node_allocr != cur_allocr) { + sched->splits[cur_split].i_end = i; + cur_split++; + GGML_ASSERT(cur_split < GGML_MAX_SPLITS); + sched->splits[cur_split].tallocr = node_allocr; + sched->splits[cur_split].i_start = i; + sched->splits[cur_split].n_inputs = 0; + cur_allocr = node_allocr; + cur_backend_id = sched_allocr_prio(sched, cur_allocr); + } - ggml_tallocr_t node_allocr = node_allocr(node); + // find inputs that are not on the same backend + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + ggml_tallocr_t src_allocr = node_allocr(src); + GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now + if (src_allocr != node_allocr) { + // check if the input is already in the split + bool found = false; + for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) { + if (sched->splits[cur_split].inputs[k] == src) { + found = true; + break; + } + } - if (node_allocr != cur_allocr) { - sched->splits[cur_split].i_end = i; - cur_split++; - GGML_ASSERT(cur_split < GGML_MAX_SPLITS); - sched->splits[cur_split].tallocr = node_allocr; - sched->splits[cur_split].i_start = i; - sched->splits[cur_split].n_inputs = 0; - memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK - cur_allocr = node_allocr; - cur_backend_id = sched_allocr_prio(sched, cur_allocr); - } + if (!found) { + int n_inputs = sched->splits[cur_split].n_inputs++; + //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr))); + GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); + sched->splits[cur_split].inputs[n_inputs] = src; + } - // find inputs that are not on the same backend - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - break; - } - ggml_tallocr_t src_allocr = node_allocr(src); - if (src_allocr != node_allocr) { - int n_inputs = sched->splits[cur_split].n_inputs++; - GGML_ASSERT(n_inputs < 
GGML_MAX_SPLIT_INPUTS); - sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src; - - // create copies - size_t id = hash_id(src); - if (sched->node_copies[id][cur_backend_id] == NULL) { - struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); - sched->node_copies[id][cur_backend_id] = tensor_copy; - node_allocr(tensor_copy) = cur_allocr; - ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); - ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name); + // create a copy of the input in the split's backend + size_t id = hash_id(src); + if (sched->node_copies[id][cur_backend_id] == NULL) { + ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); + struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); + ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name); + + sched->node_copies[id][cur_backend_id] = tensor_copy; + node_allocr(tensor_copy) = cur_allocr; + SET_CAUSE(tensor_copy, "4.cpy"); + } + node->src[j] = sched->node_copies[id][cur_backend_id]; } - node->src[j] = sched->node_copies[id][cur_backend_id]; } } + sched->splits[cur_split].i_end = graph->n_nodes; + sched->n_splits = cur_split + 1; } - sched->splits[cur_split].i_end = graph->n_nodes; - sched->n_splits = cur_split + 1; - - //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout); +#ifdef DEBUG_PASS4 + fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif -#if 1 +#ifndef NDEBUG // sanity check: all sources should have the same backend as the node for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1059,6 +1212,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g if (node_allocr == NULL) { fprintf(stderr, "!!!!!!! %s has no backend\n", node->name); } + if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) { + fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n", + node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", + node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL"); + } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { @@ -1070,8 +1228,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL"); } + if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) { + fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n", + src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL", + src->view_src->name, node_allocr(src->view_src) ? 
ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL"); + } } } + fflush(stderr); #endif // create copies of the graph for each split @@ -1085,6 +1249,8 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g for (int j = 0; j < split->n_inputs; j++) { struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)]; + // add a dependency to the input source so that it is not freed before the copy is done + GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input); input_cpy->src[0] = input; graph_copy->nodes[graph_copy->n_nodes++] = input_cpy; } @@ -1119,24 +1285,16 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { uint64_t copy_start_us = ggml_time_us(); for (int j = 0; j < split->n_inputs; j++) { struct ggml_tensor * input = split->inputs[j]; - struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)]; - if (input->buffer == NULL) { - if (input->view_src == NULL) { - fprintf(stderr, "input %s has no buffer and no view_src\n", input->name); - exit(1); - } - // FIXME: may need to use the sched buffer instead - ggml_backend_view_init(input->view_src->buffer, input); - } - if (input_cpy->buffer == NULL) { - fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name); - exit(1); - } - //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend); - //GGML_ASSERT(input_cpy->buffer->backend == split_backend); - ggml_backend_tensor_copy(input, input_cpy); + struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id]; + + GGML_ASSERT(input->buffer != NULL); + GGML_ASSERT(input_cpy->buffer != NULL); + + // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change + // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times + ggml_backend_tensor_copy_async(split_backend, input, input_cpy); } - // ggml_backend_synchronize(split_backend); + //ggml_backend_synchronize(split_backend); // necessary to measure copy time int64_t copy_end_us = ggml_time_us(); copy_us[split_backend_id] += copy_end_us - copy_start_us; @@ -1148,7 +1306,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { uint64_t compute_start_us = ggml_time_us(); ggml_backend_graph_compute(split_backend, &split->graph); - // ggml_backend_synchronize(split_backend); + //ggml_backend_synchronize(split_backend); // necessary to measure compute time uint64_t compute_end_us = ggml_time_us(); compute_us[split_backend_id] += compute_end_us - compute_start_us; } @@ -1168,26 +1326,41 @@ static void sched_reset(ggml_backend_sched_t sched) { for (int i = 0; i < sched->n_backends; i++) { ggml_tallocr_reset(sched->tallocs[i]); } + // reset state for the next run + size_t hash_size = sched->hash_set.size; + memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); + memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); + memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); + + sched->is_reset = true; } -ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) { +ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) { + GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS); - struct ggml_backend_sched * sched = 
malloc(sizeof(struct ggml_backend_sched)); - memset(sched, 0, sizeof(struct ggml_backend_sched)); + struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1); + + // initialize hash table + sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); + sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1); + sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1); sched->n_backends = n_backends; for (int i = 0; i < n_backends; i++) { sched->backends[i] = backends[i]; + sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]); } sched->galloc = ggml_gallocr_new(); // init measure allocs for each backend for (int i = 0; i < n_backends; i++) { - sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]); + sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]); } + sched_reset(sched); + return sched; } @@ -1199,6 +1372,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { ggml_tallocr_free(sched->tallocs[i]); } ggml_gallocr_free(sched->galloc); + ggml_free(sched->ctx); free(sched->hash_set.keys); free(sched->node_talloc); free(sched->node_copies); @@ -1206,12 +1380,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { } void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { - // initialize hash tables - size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS; - sched->hash_set.size = hash_size; - sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size); - sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size); - sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size); + GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once sched_split_graph(sched, measure_graph); sched_alloc_splits(sched); @@ -1220,28 +1389,41 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr for (int i = 0; i < sched->n_backends; i++) { size_t size = ggml_tallocr_max_size(sched->tallocs[i]); ggml_tallocr_free(sched->tallocs[i]); - sched->tallocs[i] = ggml_tallocr_new_from_backend(sched->backends[i], size); + sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size); } sched_reset(sched); } void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); + GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); + + if (!sched->is_reset) { + sched_reset(sched); + } sched_split_graph(sched, graph); sched_alloc_splits(sched); sched_compute_splits(sched); +} + +void ggml_backend_sched_reset(ggml_backend_sched_t sched) { sched_reset(sched); } +int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) { + return sched->n_splits; +} + ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) { int backend_index = sched_backend_prio(sched, backend); + GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); return sched->tallocs[backend_index]; } ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) { int backend_index = sched_backend_prio(sched, backend); + GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); return 
ggml_tallocr_get_buffer(sched->tallocs[backend_index]); } @@ -1251,10 +1433,19 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml node_allocr(node) = sched->tallocs[backend_index]; } +ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { + ggml_tallocr_t allocr = node_allocr(node); + if (allocr == NULL) { + return NULL; + } + return get_allocr_backend(sched, allocr); +} + // utils + void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { GGML_ASSERT(tensor->buffer == NULL); - //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized + //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL); GGML_ASSERT(tensor->view_src->data != NULL); @@ -1320,6 +1511,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor struct ggml_tensor * dst = node_copies[id]; if (dst->view_src != NULL) { + graph_init_tensor(hash_set, node_copies, node_init, src->view_src); ggml_backend_view_init(dst->view_src->buffer, dst); } else { @@ -1353,6 +1545,21 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s struct ggml_context * ctx_allocated = ggml_init(params); struct ggml_context * ctx_unallocated = ggml_init(params); + if (ctx_allocated == NULL || ctx_unallocated == NULL) { + fprintf(stderr, "failed to allocate context for graph copy\n"); + free(hash_set.keys); + free(node_copies); + free(node_init); + ggml_free(ctx_allocated); + ggml_free(ctx_unallocated); + return (struct ggml_backend_graph_copy) { + /* .buffer = */ NULL, + /* .ctx_allocated = */ NULL, + /* .ctx_unallocated = */ NULL, + /* .graph = */ NULL, + }; + } + // dup nodes for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1361,6 +1568,20 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s // allocate nodes ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend); + if (buffer == NULL) { + fprintf(stderr, "failed to allocate buffer for graph copy\n"); + free(hash_set.keys); + free(node_copies); + free(node_init); + ggml_free(ctx_allocated); + ggml_free(ctx_unallocated); + return (struct ggml_backend_graph_copy) { + /* .buffer = */ NULL, + /* .ctx_allocated = */ NULL, + /* .ctx_unallocated = */ NULL, + /* .graph = */ NULL, + }; + } //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024); @@ -1397,8 +1618,12 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) { ggml_free(copy.ctx_unallocated); } -void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { +bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph); + if (copy.buffer == NULL) { + return false; + } + struct ggml_cgraph * g1 = graph; struct ggml_cgraph * g2 = copy.graph; @@ -1428,4 +1653,6 @@ void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t } 
ggml_backend_graph_copy_free(copy); + + return true; } diff --git a/ggml-backend.h b/ggml-backend.h index 85ff67b0ea843..4eb244af1d3e7 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -17,22 +17,31 @@ extern "C" { // // buffer type - GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size); - GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); - GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); - GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); - GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); + GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); + GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); + GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); + GGML_API size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); + GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); + GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); // buffer - GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); - GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); - GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); - GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer); + enum ggml_backend_buffer_usage { + GGML_BACKEND_BUFFER_USAGE_ANY = 0, + GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, + }; + + GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); + GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); + GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); + GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); // // Backend @@ -140,23 +149,24 @@ extern "C" { typedef struct ggml_backend_sched * ggml_backend_sched_t; // Initialize a backend scheduler - GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends); - - 
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); - + GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); + GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph - GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + // Get the number of splits of the last graph + GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend); - GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); + GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); + GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); - // Allocate a graph on the backend scheduler - GGML_API void ggml_backend_sched_graph_compute( - ggml_backend_sched_t sched, - struct ggml_cgraph * graph); + // Allocate and compute graph on the backend scheduler + GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); + // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs + GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); // // Utils @@ -176,7 +186,7 @@ extern "C" { typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); // Compare the output of two backends - GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); + GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); // Tensor initialization GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index a345b0c4a70ac..2db50437c0d65 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -8,8 +8,13 @@ #include #include #include +#include #include - +#include +#include +#include "ggml-cuda.h" +#include "ggml.h" +#include "ggml-backend-impl.h" #if defined(GGML_USE_HIPBLAS) #include @@ -77,6 +82,7 @@ #define cudaMemcpyKind hipMemcpyKind #define cudaMemset hipMemset #define cudaMemsetAsync hipMemsetAsync +#define cudaMemGetInfo hipMemGetInfo #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize #define cudaSetDevice hipSetDevice #define cudaStreamCreateWithFlags hipStreamCreateWithFlags @@ -112,10 +118,6 @@ #endif // defined(GGML_USE_HIPBLAS) -#include "ggml-cuda.h" -#include "ggml.h" -#include "ggml-backend-impl.h" - #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. 
for which __hmax and __hmax2 are known to work (may be higher than needed) #define CC_PASCAL 600 @@ -564,7 +566,7 @@ static void ggml_cuda_set_device(const int device) { static int g_device_count = -1; static int g_main_device = 0; -static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; +static std::array g_default_tensor_split = {}; struct cuda_device_capabilities { int cc; // compute capability @@ -575,10 +577,6 @@ struct cuda_device_capabilities { static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} }; -static void * g_scratch_buffer = nullptr; -static size_t g_scratch_size = 0; // disabled by default -static size_t g_scratch_offset = 0; - static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; [[noreturn]] @@ -7548,8 +7546,9 @@ void ggml_init_cublas() { CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); - g_tensor_split[id] = total_vram; + g_default_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; + #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; #else @@ -7558,7 +7557,7 @@ void ggml_init_cublas() { g_device_caps[id].smpb = prop.sharedMemPerBlock; } for (int id = 0; id < g_device_count; ++id) { - g_tensor_split[id] /= total_vram; + g_default_tensor_split[id] /= total_vram; } for (int id = 0; id < g_device_count; ++id) { @@ -7582,30 +7581,6 @@ void ggml_init_cublas() { } } -void ggml_cuda_set_tensor_split(const float * tensor_split) { - if (tensor_split == nullptr) { - return; - } - bool all_zero = true; - for (int i = 0; i < g_device_count; ++i) { - if (tensor_split[i] != 0.0f) { - all_zero = false; - break; - } - } - if (all_zero) { - return; - } - float split_sum = 0.0f; - for (int i = 0; i < g_device_count; ++i) { - g_tensor_split[i] = split_sum; - split_sum += tensor_split[i]; - } - for (int i = 0; i < g_device_count; ++i) { - g_tensor_split[i] /= split_sum; - } -} - void * ggml_cuda_host_malloc(size_t size) { if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { return nullptr; @@ -8057,11 +8032,11 @@ static void ggml_cuda_op_mul_mat_q( (void) src1_ddf_i; } -static int64_t get_row_rounding(ggml_type type) { +static int64_t get_row_rounding(ggml_type type, const std::array & tensor_split) { int64_t min_compute_capability = INT_MAX; int64_t max_compute_capability = INT_MIN; for (int id = 0; id < g_device_count; ++id) { - if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) { if (min_compute_capability > g_device_caps[id].cc) { min_compute_capability = g_device_caps[id].cc; } @@ -8122,6 +8097,21 @@ static int64_t get_row_rounding(ggml_type type) { #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } +static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array & tensor_split, int id) { + const int64_t nrows = ggml_nrows(tensor); + const int64_t rounding = get_row_rounding(tensor->type, tensor_split); + + *row_low = id == 0 ? 
0 : nrows*tensor_split[id]; + *row_low -= *row_low % rounding; + + if (id == g_device_count - 1) { + *row_high = nrows; + } else { + *row_high = nrows*tensor_split[id + 1]; + *row_high -= *row_high % rounding; + } +} + static void ggml_cuda_op_mul_mat_vec_q( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, @@ -8739,6 +8729,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) { peer_access_enabled = enable_peer_access; } +// FIXME: move this somewhere else +struct ggml_backend_cuda_split_buffer_type_context { + std::array tensor_split; +}; + static void ggml_cuda_op_mul_mat( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op, const bool convert_src1_to_q8_1) { @@ -8790,6 +8785,14 @@ static void ggml_cuda_op_mul_mat( GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne02 < ne12)); + std::array tensor_split; + if (split) { + // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check + // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...); + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; + tensor_split = buft_ctx->tensor_split; + } + struct dev_data { cuda_pool_alloc src0_dd_alloc; cuda_pool_alloc src1_ddf_alloc; @@ -8817,17 +8820,17 @@ static void ggml_cuda_op_mul_mat( // for multi GPU, get the row boundaries from tensor split // and round to mul_mat_q tile sizes if (split) { - const int64_t rounding = get_row_rounding(src0->type); + const int64_t rounding = get_row_rounding(src0->type, tensor_split); if (id != 0) { - dev[id].row_low = ne01*g_tensor_split[id]; + dev[id].row_low = ne01*tensor_split[id]; if (dev[id].row_low < ne01) { dev[id].row_low -= dev[id].row_low % rounding; } } if (id != g_device_count - 1) { - dev[id].row_high = ne01*g_tensor_split[id + 1]; + dev[id].row_high = ne01*tensor_split[id + 1]; if (dev[id].row_high < ne01) { dev[id].row_high -= dev[id].row_high % rounding; } @@ -9373,10 +9376,17 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; int64_t min_compute_capability = INT_MAX; - for (int id = 0; id < g_device_count; ++id) { - if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { - min_compute_capability = g_device_caps[id].cc; + + if (split) { + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; + auto & tensor_split = buft_ctx->tensor_split; + for (int id = 0; id < g_device_count; ++id) { + if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? 
tensor_split[id + 1] : 1.0f)) { + min_compute_capability = g_device_caps[id].cc; + } } + } else { + min_compute_capability = g_device_caps[g_main_device].cc; } #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) @@ -9415,7 +9425,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_cuda_mul_mat_vec_nc(src0, src1, dst); - } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { + } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // KQ + KQV multi-batch ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); } else if (src0->type == GGML_TYPE_F32) { @@ -9877,247 +9887,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); } -void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { - const int64_t nrows = ggml_nrows(tensor); - - const int64_t ne0 = tensor->ne[0]; - - const size_t nb1 = tensor->nb[1]; - - ggml_backend_type backend = tensor->backend; - ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; - memset(extra, 0, sizeof(*extra)); - - for (int id = 0; id < g_device_count; ++id) { - if (backend == GGML_BACKEND_GPU && id != g_main_device) { - continue; - } - - ggml_cuda_set_device(id); - - int64_t row_low, row_high; - if (backend == GGML_BACKEND_GPU) { - row_low = 0; - row_high = nrows; - } else if (backend == GGML_BACKEND_GPU_SPLIT) { - const int64_t rounding = get_row_rounding(tensor->type); - - row_low = id == 0 ? 
0 : nrows*g_tensor_split[id]; - row_low -= row_low % rounding; - - if (id == g_device_count - 1) { - row_high = nrows; - } else { - row_high = nrows*g_tensor_split[id + 1]; - row_high -= row_high % rounding; - } - } else { - GGML_ASSERT(false); - } - if (row_low == row_high) { - continue; - } - - int64_t nrows_split = row_high - row_low; - - const size_t offset_split = row_low*nb1; - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - - char * buf; - CUDA_CHECK(cudaMalloc(&buf, size)); - char * buf_host = (char *)data + offset_split; - - // set padding to 0 to avoid possible NaN values - if (size > original_size) { - CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); - } - - CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice)); - - extra->data_device[id] = buf; - - if (backend == GGML_BACKEND_GPU_SPLIT) { - for (int64_t is = 0; is < MAX_STREAMS; ++is) { - CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); - } - } - } - - tensor->extra = extra; -} - -void ggml_cuda_free_data(struct ggml_tensor * tensor) { - if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { - return; - } - - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - - for (int id = 0; id < g_device_count; ++id) { - ggml_cuda_set_device(id); - if (extra->data_device[id] != nullptr) { - CUDA_CHECK(cudaFree(extra->data_device[id])); - } - - for (int64_t is = 0; is < MAX_STREAMS; ++is) { - if (extra->events[id][is] != nullptr) { - CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); - } - } - } - - delete extra; -} - -static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; -static size_t g_temp_tensor_extra_index = 0; - -static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { - if (g_temp_tensor_extras == nullptr) { - g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES]; - } - - size_t alloc_index = g_temp_tensor_extra_index; - g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES; - ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; - memset(extra, 0, sizeof(*extra)); - - return extra; -} - -static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) { - if (scratch && g_scratch_size == 0) { - return; - } - - tensor->backend = GGML_BACKEND_GPU; - - // recursively assign CUDA buffers until a compute tensor is found - if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { - const ggml_op src0_op = tensor->src[0]->op; - if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { - ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc); - } - } - if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { - ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); - } - - if (scratch && no_alloc) { - return; - } - - ggml_tensor_extra_gpu * extra; - - const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || - tensor->op == GGML_OP_VIEW || - force_inplace; - const size_t size = 
ggml_nbytes(tensor); - - ggml_cuda_set_device(g_main_device); - if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; - char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; - size_t offset = 0; - if (tensor->op == GGML_OP_VIEW) { - memcpy(&offset, tensor->op_params, sizeof(size_t)); - } - extra = ggml_cuda_alloc_temp_tensor_extra(); - extra->data_device[g_main_device] = src0_ddc + offset; - } else if (tensor->op == GGML_OP_CPY) { - ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; - void * src1_ddv = src1_extra->data_device[g_main_device]; - extra = ggml_cuda_alloc_temp_tensor_extra(); - extra->data_device[g_main_device] = src1_ddv; - } else if (scratch) { - GGML_ASSERT(size <= g_scratch_size); - if (g_scratch_offset + size > g_scratch_size) { - g_scratch_offset = 0; - } - - char * data = (char *) g_scratch_buffer; - if (data == nullptr) { - CUDA_CHECK(cudaMalloc(&data, g_scratch_size)); - g_scratch_buffer = data; - } - extra = ggml_cuda_alloc_temp_tensor_extra(); - extra->data_device[g_main_device] = data + g_scratch_offset; - - g_scratch_offset += size; - - GGML_ASSERT(g_scratch_offset <= g_scratch_size); - } else { // allocate new buffers outside of scratch - void * data; - CUDA_CHECK(cudaMalloc(&data, size)); - CUDA_CHECK(cudaMemset(data, 0, size)); - extra = new ggml_tensor_extra_gpu; - memset(extra, 0, sizeof(*extra)); - extra->data_device[g_main_device] = data; - } - - tensor->extra = extra; -} - -void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) { - if (g_scratch_size == 0) { - return; - } - if (g_scratch_buffer == nullptr) { - ggml_cuda_set_device(g_main_device); - CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size)); - } - - ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); - - const bool inplace = tensor->view_src != nullptr; - - if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) { - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra; - char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; - size_t view_offset = 0; - if (tensor->op == GGML_OP_VIEW) { - memcpy(&view_offset, tensor->op_params, sizeof(size_t)); - } - extra->data_device[g_main_device] = src0_ddc + view_offset; - } else { - extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset; - } - - tensor->extra = extra; -} - -void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - GGML_ASSERT(ggml_is_contiguous(tensor)); - - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - ggml_cuda_set_device(g_main_device); - CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice)); -} - -void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, true, false, false); -} - -void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, true, false, true); -} - -void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, false, false, false); -} - -void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, false, true, 
false); -} - -void ggml_cuda_set_main_device(const int main_device) { +static void ggml_cuda_set_main_device(const int main_device) { if (main_device >= g_device_count) { fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", main_device, g_device_count, g_main_device); @@ -10126,28 +9896,10 @@ void ggml_cuda_set_main_device(const int main_device) { if (g_main_device != main_device && g_device_count > 1) { g_main_device = main_device; - cudaDeviceProp prop; - CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device)); - fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name); - } -} - -void ggml_cuda_set_scratch_size(const size_t scratch_size) { - // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously - // it still won't always work as expected, but it's better than nothing - if (scratch_size > g_scratch_size) { - ggml_cuda_free_scratch(); - } - g_scratch_size = std::max(g_scratch_size, scratch_size); -} - -void ggml_cuda_free_scratch() { - if (g_scratch_buffer == nullptr) { - return; + //cudaDeviceProp prop; + //CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device)); + //fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name); } - - CUDA_CHECK(cudaFree(g_scratch_buffer)); - g_scratch_buffer = nullptr; } bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { @@ -10328,21 +10080,31 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des #define UNUSED GGML_UNUSED +struct ggml_backend_cuda_context { + int device; + std::string name; +}; + // cuda buffer -struct ggml_backend_buffer_context_cuda { +struct ggml_backend_cuda_buffer_context { int device; void * dev_ptr = nullptr; ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; size_t temp_tensor_extra_index = 0; + std::string name; - ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {} + ggml_backend_cuda_buffer_context(int device, void * dev_ptr) : + device(device), dev_ptr(dev_ptr), + name(GGML_CUDA_NAME + std::to_string(device)) { + } - ~ggml_backend_buffer_context_cuda() { + ~ggml_backend_cuda_buffer_context() { delete[] temp_tensor_extras; } ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { + // TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset if (temp_tensor_extras == nullptr) { temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES]; } @@ -10356,19 +10118,28 @@ struct ggml_backend_buffer_context_cuda { } }; +static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; + return ctx->name.c_str(); +} + +static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name; +} + static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; CUDA_CHECK(cudaFree(ctx->dev_ptr)); delete ctx; } static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + 
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; return ctx->dev_ptr; } static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; if (tensor->view_src != NULL && tensor->view_offs == 0) { assert(tensor->view_src->buffer->buft == buffer->buft); @@ -10397,14 +10168,12 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0])); } } - - UNUSED(buffer); } static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); @@ -10415,49 +10184,82 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaDeviceSynchronize()); +} + +static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + if (ggml_backend_buffer_is_cuda(src->buffer)) { + ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context; + ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context; + + ggml_cuda_set_device(src_ctx->device); + CUDA_CHECK(cudaDeviceSynchronize()); + ggml_cuda_set_device(dst_ctx->device); + CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaDeviceSynchronize()); + + return true; + } + return false; } static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size)); + CUDA_CHECK(cudaDeviceSynchronize()); } -static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { +static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { + /* .get_name = */ ggml_backend_cuda_buffer_get_name, /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_buffer_get_base, /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, /* .set_tensor = */ 
ggml_backend_cuda_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor, - /* .cpy_tensor_from = */ NULL, - /* .cpy_tensor_to = */ NULL, + /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor, /* .clear = */ ggml_backend_cuda_buffer_clear, + /* .reset = */ NULL, }; // cuda buffer type +struct ggml_backend_cuda_buffer_type_context { + int device; + std::string name; +}; + +static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) { + ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + + return ctx->name.c_str(); +} + static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - int device = (int) (intptr_t) buft->context; + ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; - ggml_cuda_set_device(device); + ggml_cuda_set_device(buft_ctx->device); size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0 void * dev_ptr; - CUDA_CHECK(cudaMalloc(&dev_ptr, size)); + cudaError_t err = cudaMalloc(&dev_ptr, size); + if (err != cudaSuccess) { + fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err)); + return nullptr; + } - ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr); + ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr); - return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size); } static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { @@ -10466,7 +10268,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_ty UNUSED(buft); } -static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) { +static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { int64_t row_low = 0; int64_t row_high = ggml_nrows(tensor); int64_t nrows_split = row_high - row_low; @@ -10487,21 +10289,32 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t } static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_cuda(backend); + if (!ggml_backend_is_cuda(backend)) { + return false; + } - UNUSED(buft); + ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + return buft_ctx->device == cuda_ctx->device; } static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { + /* .get_name = */ ggml_backend_cuda_buffer_type_name, /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, - /* .is_host = */ nullptr, + /* .is_host = */ NULL, }; ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { - static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; + // FIXME: this is not thread 
safe + if (device >= ggml_backend_cuda_get_device_count()) { + return nullptr; + } + + static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; static bool ggml_backend_cuda_buffer_type_initialized = false; @@ -10509,7 +10322,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) { ggml_backend_cuda_buffer_types[i] = { /* .iface = */ ggml_backend_cuda_buffer_type_interface, - /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i, + /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)}, }; } ggml_backend_cuda_buffer_type_initialized = true; @@ -10518,8 +10331,306 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { return &ggml_backend_cuda_buffer_types[device]; } +// cuda split buffer + +struct ggml_backend_cuda_split_buffer_context { + ~ggml_backend_cuda_split_buffer_context() { + for (ggml_tensor_extra_gpu * extra : tensor_extras) { + for (int id = 0; id < g_device_count; ++id) { + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + if (extra->events[id][is] != nullptr) { + CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); + } + } + if (extra->data_device[id] != nullptr) { + CUDA_CHECK(cudaFree(extra->data_device[id])); + } + } + delete extra; + } + } + + std::vector tensor_extras; +}; + +static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) { + return GGML_CUDA_NAME "_Split"; + + UNUSED(buffer); +} + +// unused at the moment +//static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) { +// return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name; +//} + +static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; + delete ctx; +} + +static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { + // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced + return (void *)0x1000; + + UNUSED(buffer); +} + +static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported + + ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + + ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; + + ctx->tensor_extras.push_back(extra); + + for (int id = 0; id < g_device_count; ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + // FIXME: do not crash if cudaMalloc fails + // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first + ggml_cuda_set_device(id); + char * buf; + CUDA_CHECK(cudaMalloc(&buf, size)); + + // set padding to 0 
to avoid possible NaN values + if (size > original_size) { + CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); + } + + extra->data_device[id] = buf; + + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); + } + } + tensor->backend = GGML_BACKEND_GPU_SPLIT; + tensor->extra = extra; +} + +static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + // split tensors must always be set in their entirety at once + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; + + for (int id = 0; id < g_device_count; ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + const char * buf_host = (const char *)data + offset_split; + CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice)); + } +} + +static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + // split tensors must always be set in their entirety at once + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; + + for (int id = 0; id < g_device_count; ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + char * buf_host = (char *)data + offset_split; + CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost)); + } +} + +static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + UNUSED(buffer); + UNUSED(value); +} + +static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { + /* .get_name = */ ggml_backend_cuda_split_buffer_get_name, + /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer, + /* .get_base = */ ggml_backend_cuda_split_buffer_get_base, + /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor, + /* .set_tensor = */ 
ggml_backend_cuda_split_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor, + /* .cpy_tensor = */ NULL, + /* .clear = */ ggml_backend_cuda_split_buffer_clear, + /* .reset = */ NULL, +}; + +// cuda split buffer type + +static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_CUDA_NAME "_Split"; + + UNUSED(buft); +} + +static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point + // instead, we allocate them for each tensor separately in init_tensor + // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, + // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct. + ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context(); + + return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size); +} + +static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return 128; + + UNUSED(buft); +} + +static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; + + size_t total_size = 0; + + const int64_t ne0 = tensor->ne[0]; + + for (int id = 0; id < g_device_count; ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + total_size += ggml_nbytes_split(tensor, nrows_split); + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + } + + return total_size; +} + +static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { + return ggml_backend_is_cuda(backend); + + UNUSED(buft); +} + +static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return false; + + UNUSED(buft); +} + +static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = { + /* .get_name = */ ggml_backend_cuda_split_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment, + /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size, + /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, +}; + +ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { + // FIXME: this is not thread safe + static std::map, struct ggml_backend_buffer_type> buft_map; + + std::array tensor_split_arr = {}; + + bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; }); + if (all_zero) { + tensor_split_arr = g_default_tensor_split; + } else { + float split_sum = 0.0f; + for (int i = 0; i < g_device_count; ++i) { + tensor_split_arr[i] = 
split_sum; + split_sum += tensor_split[i]; + } + for (int i = 0; i < g_device_count; ++i) { + tensor_split_arr[i] /= split_sum; + } + } + + auto it = buft_map.find(tensor_split_arr); + if (it != buft_map.end()) { + return &it->second; + } + + struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_cuda_split_buffer_type_interface, + /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr}, + }; + + auto result = buft_map.emplace(tensor_split_arr, buft); + return &result.first->second; +} + // host buffer type +static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_CUDA_NAME "_Host"; + + UNUSED(buft); +} + +static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) { + return GGML_CUDA_NAME "_Host"; + + UNUSED(buffer); +} + static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_cuda_host_free(buffer->context); } @@ -10532,9 +10643,9 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); } - // FIXME: this is a hack to avoid having to implement a new buffer type ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; + buffer->iface.get_name = ggml_backend_cuda_host_buffer_name; buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; return buffer; @@ -10543,6 +10654,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { /* .iface = */ { + /* .get_name = */ ggml_backend_cuda_host_buffer_type_name, /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, @@ -10557,31 +10669,27 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { // backend -struct ggml_backend_context_cuda { - int device; -}; - static const char * ggml_backend_cuda_name(ggml_backend_t backend) { - return GGML_CUDA_NAME; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - UNUSED(backend); + return cuda_ctx->name.c_str(); } static void ggml_backend_cuda_free(ggml_backend_t backend) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; delete cuda_ctx; delete backend; } static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; return ggml_backend_cuda_buffer_type(cuda_ctx->device); } static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); @@ -10590,7 +10698,7 @@ static 
void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens } static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); @@ -10598,39 +10706,27 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0])); } -static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; - - CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0])); - - UNUSED(backend); -} - -static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { - GGML_ASSERT(!"not implemented"); +static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - return nullptr; + if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) { + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0])); + return true; + } - UNUSED(backend); - UNUSED(cgraph); + return false; } -static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - GGML_ASSERT(!"not implemented"); - - UNUSED(backend); - UNUSED(plan); -} +static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; -static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - GGML_ASSERT(!"not implemented"); + CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0])); UNUSED(backend); - UNUSED(plan); } static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_cuda_set_main_device(cuda_ctx->device); @@ -10640,53 +10736,31 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) + if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; + } - assert(node->backend == GGML_BACKEND_GPU); +#ifndef NDEBUG + assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT); assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); assert(node->extra != nullptr); for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { - assert(node->src[j]->backend == GGML_BACKEND_GPU); + 
assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT); assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); assert(node->src[j]->extra != nullptr); } } +#endif bool ok = ggml_cuda_compute_forward(¶ms, node); if (!ok) { fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } GGML_ASSERT(ok); - -#if 0 - if (node->type == GGML_TYPE_F32) { - cudaDeviceSynchronize(); - std::vector tmp(ggml_nelements(node), 0.0f); - cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost); - printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op), - ggml_type_name(node->src[0]->type), - node->src[1] ? ggml_type_name(node->src[1]->type) : "none", - node->src[0]->name, - node->src[1] ? node->src[1]->name : "none"); - double sum = 0.0; - double sq_sum = 0.0; - for (int i = 0; i < ggml_nelements(node); i++) { - printf("%f ", tmp[i]); - sum += tmp[i]; - sq_sum += tmp[i]*tmp[i]; - } - printf("\n"); - printf("sum: %f, ", sum); - printf("sq_sum: %f\n", sq_sum); - } -#endif } - UNUSED(backend); - return true; } @@ -10801,18 +10875,17 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten UNUSED(backend); } -static ggml_backend_i cuda_backend_i = { +static ggml_backend_i ggml_backend_cuda_interface = { /* .get_name = */ ggml_backend_cuda_name, /* .free = */ ggml_backend_cuda_free, /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type, /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, - /* .cpy_tensor_from_async = */ NULL, - /* .cpy_tensor_to_async = */ NULL, + /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async, /* .synchronize = */ ggml_backend_cuda_synchronize, - /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create, - /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free, - /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .supports_op = */ ggml_backend_cuda_supports_op, }; @@ -10828,12 +10901,13 @@ ggml_backend_t ggml_backend_cuda_init(int device) { // not strictly necessary, but it may reduce the overhead of the first graph_compute ggml_cuda_set_main_device(device); - ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda { - /* .device = */ device + ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context { + /* .device = */ device, + /* .name = */ GGML_CUDA_NAME + std::to_string(device), }; ggml_backend_t cuda_backend = new ggml_backend { - /* .interface = */ cuda_backend_i, + /* .interface = */ ggml_backend_cuda_interface, /* .context = */ ctx }; @@ -10841,9 +10915,24 @@ ggml_backend_t ggml_backend_cuda_init(int device) { } bool ggml_backend_is_cuda(ggml_backend_t backend) { - return backend->iface.get_name == ggml_backend_cuda_name; + return backend && backend->iface.get_name == ggml_backend_cuda_name; +} + +int ggml_backend_cuda_get_device_count() { + return ggml_cuda_get_device_count(); +} + +void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) { + ggml_cuda_get_device_description(device, description, description_size); +} + +void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) { + ggml_cuda_set_device(device); + 
+ CUDA_CHECK(cudaMemGetInfo(free, total)); } +// backend registry static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) { ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data); return cuda_backend; diff --git a/ggml-cuda.h b/ggml-cuda.h index cdb0c0c41618a..d19cbf3fdd04b 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -27,22 +27,6 @@ GGML_API void * ggml_cuda_host_malloc(size_t size); GGML_API void ggml_cuda_host_free(void * ptr); GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split); -GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor); -GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor); - -GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor); -GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); -GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor); - -GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor); -GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset); -GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor); - -GGML_API void ggml_cuda_set_main_device(int main_device); -GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q); -GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size); -GGML_API void ggml_cuda_free_scratch(void); GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); GGML_API int ggml_cuda_get_device_count(void); @@ -52,13 +36,17 @@ GGML_API void ggml_cuda_get_device_description(int device, char * description, GGML_API ggml_backend_t ggml_backend_cuda_init(int device); GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend); -GGML_API int ggml_backend_cuda_get_device(ggml_backend_t backend); GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); - -// pinned host buffer for use with CPU backend for faster copies between CPU and GPU +// split tensor buffer that splits matrices by rows across multiple devices +GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); +// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); +GGML_API int ggml_backend_cuda_get_device_count(void); +GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); +GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); + #ifdef __cplusplus } #endif diff --git a/ggml-impl.h b/ggml-impl.h index 2faced08059ed..2c58075ac7c56 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -228,6 +228,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_HASHTABLE_FULL ((size_t)-1) #define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2) +struct ggml_hash_set ggml_hash_set_new(size_t size); + bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key); // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted diff --git a/ggml-metal.m b/ggml-metal.m index 6e5594432b21a..c03624073fb61 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -2520,10 +2520,10 @@ static void ggml_backend_metal_free_device(void) { } } 
-static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { - struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; +static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) { + return "Metal"; - return ctx->all_data; + UNUSED(buffer); } static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -2541,6 +2541,12 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) free(ctx); } +static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + return ctx->all_data; +} + static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); @@ -2553,14 +2559,12 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c UNUSED(buffer); } -static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); - - UNUSED(buffer); -} - -static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); +static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + return false; UNUSED(buffer); } @@ -2572,18 +2576,25 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_ } static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { + /* .get_name = */ ggml_backend_metal_buffer_get_name, /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, /* .get_base = */ ggml_backend_metal_buffer_get_base, /* .init_tensor = */ NULL, /* .set_tensor = */ ggml_backend_metal_buffer_set_tensor, /* .get_tensor = */ ggml_backend_metal_buffer_get_tensor, - /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from, - /* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to, + /* .cpy_tensor = */ ggml_backend_metal_buffer_cpy_tensor, /* .clear = */ ggml_backend_metal_buffer_clear, + /* .reset = */ NULL, }; // default buffer type +static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "Metal"; + + UNUSED(buft); +} + static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); @@ -2656,6 +2667,7 @@ static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t bu ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = { /* .iface = */ { + /* .get_name = */ ggml_backend_metal_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -2679,6 +2691,14 @@ ggml_backend_buffer_t 
ggml_backend_metal_buffer_from_ptr(void * data, size_t siz ctx->n_buffers = 0; const size_t size_page = sysconf(_SC_PAGESIZE); + + // page-align the data ptr + { + const uintptr_t offs = (uintptr_t) data % size_page; + data = (void *) ((char *) data - offs); + size += offs; + } + size_t size_aligned = size; if ((size_aligned % size_page) != 0) { size_aligned += (size_page - (size_aligned % size_page)); @@ -2779,14 +2799,13 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct UNUSED(backend); } -static struct ggml_backend_i metal_backend_i = { +static struct ggml_backend_i ggml_backend_metal_i = { /* .get_name = */ ggml_backend_metal_name, /* .free = */ ggml_backend_metal_free, /* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type, /* .set_tensor_async = */ NULL, /* .get_tensor_async = */ NULL, - /* .cpy_tensor_from_async = */ NULL, - /* .cpy_tensor_to_async = */ NULL, + /* .cpy_tensor_async = */ NULL, /* .synchronize = */ NULL, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, @@ -2805,7 +2824,7 @@ ggml_backend_t ggml_backend_metal_init(void) { ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); *metal_backend = (struct ggml_backend) { - /* .interface = */ metal_backend_i, + /* .interface = */ ggml_backend_metal_i, /* .context = */ ctx, }; @@ -2813,7 +2832,7 @@ ggml_backend_t ggml_backend_metal_init(void) { } bool ggml_backend_is_metal(ggml_backend_t backend) { - return backend->iface.get_name == ggml_backend_metal_name; + return backend && backend->iface.get_name == ggml_backend_metal_name; } void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 496f9cdca542d..2bb93638f1c7c 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1,5 +1,6 @@ #include "ggml.h" #include "ggml-opencl.h" +#include "ggml-backend-impl.h" #include #include @@ -10,7 +11,7 @@ #include #include -#define CL_TARGET_OPENCL_VERSION 110 +#define CL_TARGET_OPENCL_VERSION 120 #include #if defined(_MSC_VER) @@ -929,6 +930,12 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co } void ggml_cl_init(void) { + static bool initialized = false; + if (initialized) { + return; + } + initialized = true; + cl_int err; struct cl_device; @@ -1483,8 +1490,8 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } else { d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); } - cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); - cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); + cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); + cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? 
(cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); size_t x_offset = 0; @@ -1501,7 +1508,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { // copy src1 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); + if (src1->backend == GGML_BACKEND_CPU) { + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); + } CL_CHECK(clFinish(queue)); @@ -1522,8 +1531,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); + if (dst->backend == GGML_BACKEND_CPU) { + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); + } } } } @@ -1532,8 +1543,12 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr if (src0->backend != GGML_BACKEND_GPU) { ggml_cl_pool_free(d_X, x_size); } - ggml_cl_pool_free(d_Y, y_size); - ggml_cl_pool_free(d_D, d_size); + if (src1->backend != GGML_BACKEND_GPU) { + ggml_cl_pool_free(d_Y, y_size); + } + if (dst->backend != GGML_BACKEND_GPU) { + ggml_cl_pool_free(d_D, d_size); + } } static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) { @@ -1598,6 +1613,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); } + // FIXME: convert on device + for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { // convert src1 to fp16 // TODO: use multiple threads @@ -1643,11 +1660,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host, then convert to float - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); - - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - - ggml_fp16_to_fp32_row(tmp, d, d_ne); + if (dst->backend == GGML_BACKEND_CPU) { + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + ggml_fp16_to_fp32_row(tmp, d, d_ne); + } else { + // FIXME: convert dst to fp32 on device + } } } } @@ -1801,7 +1820,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { +bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) { const int64_t ne10 = src1->ne[0]; const int64_t ne0 = dst->ne[0]; @@ -1895,3 +1914,291 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) { tensor->extra = dst; GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); } + +// ggml-backend + +// buffer + +struct ggml_backend_opencl_buffer_context { + ~ggml_backend_opencl_buffer_context() { + if (buffer) { + clReleaseMemObject(buffer); + } + for (auto * sub_buffer : sub_buffers) { + clReleaseMemObject(sub_buffer); + } + } + + cl_mem buffer; + std::vector sub_buffers; +}; + +static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000; + +static const char * 
ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) { + return "OpenCL"; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + delete ctx; +} + +static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { + return cl_ptr_base; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + if (tensor->view_src != NULL && tensor->view_offs == 0) { + tensor->extra = tensor->view_src->extra; + } else { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)}; + cl_int err; + cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + ctx->sub_buffers.push_back(sub_buffer); + tensor->extra = sub_buffer; + } + tensor->backend = GGML_BACKEND_GPU; +} + +static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + cl_mem tensor_buffer = (cl_mem) tensor->extra; + CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL)); + CL_CHECK(clFinish(queue)); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + cl_mem tensor_buffer = (cl_mem) tensor->extra; + CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL)); + CL_CHECK(clFinish(queue)); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL)); + CL_CHECK(clFinish(queue)); +} + +static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + for (auto * sub_buffer : ctx->sub_buffers) { + clReleaseMemObject(sub_buffer); + } + ctx->sub_buffers.clear(); +} + +static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = { + /* .get_name = */ ggml_backend_opencl_buffer_get_name, + /* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer, + /* .get_base = */ ggml_backend_opencl_buffer_get_base, + /* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor, + /* .cpy_tensor = */ NULL, + /* .clear = */ ggml_backend_opencl_buffer_clear, + /* .reset = */ ggml_backend_opencl_buffer_reset, +}; + +// buffer type + +static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) { + return "OpenCL"; + + GGML_UNUSED(buffer_type); +} + +static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) { + ggml_cl_init(); + + cl_int err; + cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err); + if (err != CL_SUCCESS) { + fprintf(stderr, "%s: failed to allocate %.2f 
MiB\n", __func__, size / 1024.0 / 1024.0); + return nullptr; + } + + ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}}; + + return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size); +} + +static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) { + // FIXME: not thread safe, device may not be initialized yet + static cl_uint alignment = -1; + if (alignment == (cl_uint)-1) { + ggml_cl_init(); + clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL); + } + return alignment; + + GGML_UNUSED(buffer_type); +} + +static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) { + //return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend + return ggml_backend_is_cpu(backend); + + GGML_UNUSED(buffer_type); +} + +static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = { + /* .get_name = */ ggml_backend_opencl_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment, + /* .get_alloc_size = */ NULL, + /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend, + /* .is_host = */ NULL, +}; + + +ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() { + static ggml_backend_buffer_type buffer_type = { + /* .iface = */ ggml_backend_opencl_buffer_type_interface, + /* .context = */ nullptr, + }; + + return &buffer_type; +} + +#if 0 +// host buffer type + +static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return "CL_Host"; + + GGML_UNUSED(buft); +} + +static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) { + return "CL_Host"; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_cl_host_free(buffer->context); +} + +static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * ptr = ggml_cl_host_malloc(size); + + if (ptr == nullptr) { + // fallback to cpu buffer + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.get_name = ggml_backend_opencl_host_buffer_name; + buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer; + + return buffer; +} + +ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_opencl_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_opencl_buffer_type_host; +} + +// backend + +static const char * ggml_backend_opencl_name(ggml_backend_t backend) { + return "OpenCL"; + + GGML_UNUSED(backend); +} + +static void ggml_backend_opencl_free(ggml_backend_t backend) { + 
GGML_UNUSED(backend); +} + +static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_opencl_buffer_type(); + + GGML_UNUSED(backend); +} + +static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { + for (int i = 0; i < graph->n_nodes; ++i) { + ggml_tensor * node = graph->nodes[i]; + switch (node->op) { + case GGML_OP_MUL_MAT: + ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0); + break; + case GGML_OP_MUL: + ggml_cl_mul(node->src[0], node->src[1], node); + break; + default: + GGML_ASSERT(false); + } + } + + return true; + + GGML_UNUSED(backend); +} + +static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + switch (op->op) { + case GGML_OP_MUL_MAT: + return ggml_cl_can_mul_mat(op->src[0], op->src[1], op); + case GGML_OP_MUL: + // return ggml_can_repeat_rows(op->src[1], op->src[0]); + return true; + default: + return false; + } + + GGML_UNUSED(backend); +} + +static ggml_backend_i opencl_backend_i = { + /* .get_name = */ ggml_backend_opencl_name, + /* .free = */ ggml_backend_opencl_free, + /* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_from_async = */ NULL, + /* .cpy_tensor_to_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_opencl_graph_compute, + /* .supports_op = */ ggml_backend_opencl_supports_op, +}; + +ggml_backend_t ggml_backend_opencl_init() { + ggml_backend_t backend = new ggml_backend { + /* .interface = */ opencl_backend_i, + /* .context = */ nullptr + }; + + return backend; +} + +bool ggml_backend_is_opencl(ggml_backend_t backend) { + return backend && backend->iface.get_name == ggml_backend_opencl_name; +} +#endif diff --git a/ggml-opencl.h b/ggml-opencl.h index 44d05bd64a3ad..919b00d63a04e 100644 --- a/ggml-opencl.h +++ b/ggml-opencl.h @@ -1,6 +1,7 @@ #pragma once #include "ggml.h" +#include "ggml-backend.h" #ifdef __cplusplus extern "C" { @@ -9,17 +10,26 @@ extern "C" { GGML_API void ggml_cl_init(void); GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst); GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); -GGML_API void * ggml_cl_host_malloc(size_t size); -GGML_API void ggml_cl_host_free(void * ptr); +// GGML_API void * ggml_cl_host_malloc(size_t size); +// GGML_API void ggml_cl_host_free(void * ptr); GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor); GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor); +// backend API + +// GGML_API ggml_backend_t ggml_backend_opencl_init(void); + +// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend); + +GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void); +// GGML_API ggml_backend_buffer_type_t 
ggml_backend_opencl_host_buffer_type(void); + #ifdef __cplusplus } #endif diff --git a/ggml.c b/ggml.c index f5caeba082ea9..6dbd7626c9e23 100644 --- a/ggml.c +++ b/ggml.c @@ -2354,6 +2354,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { } void ggml_free(struct ggml_context * ctx) { + if (ctx == NULL) { + return; + } + // make this function thread safe ggml_critical_section_start(); @@ -4362,6 +4366,23 @@ struct ggml_tensor * ggml_cpy( return ggml_cpy_impl(ctx, a, b); } +struct ggml_tensor * ggml_cast( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_type type) { + bool is_node = false; + + struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne); + ggml_format_name(result, "%s (copy)", a->name); + + result->op = GGML_OP_CPY; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = result; + + return result; +} + // ggml_cont static struct ggml_tensor * ggml_cont_impl( @@ -14871,7 +14892,7 @@ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tenso return i; } -static struct ggml_hash_set ggml_hash_set_new(size_t size) { +struct ggml_hash_set ggml_hash_set_new(size_t size) { size = ggml_hash_size(size); struct ggml_hash_set result; result.size = size; @@ -16620,7 +16641,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return GGML_EXIT_SUCCESS; } -struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { +struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) { if (n_threads <= 0) { n_threads = GGML_DEFAULT_N_THREADS; } @@ -16682,14 +16703,15 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } break; case GGML_OP_MUL_MAT_ID: { + cur = 0; const struct ggml_tensor * src0 = node->src[2]; const struct ggml_tensor * src1 = node->src[1]; const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type; if (src1->type != vec_dot_type) { - cur = ggml_row_size(vec_dot_type, ggml_nelements(src1)); + cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)); } const int n_as = ggml_get_op_params_i32(node, 1); - cur = GGML_PAD(cur, sizeof(int64_t)); // align + cur += GGML_PAD(cur, sizeof(int64_t)); // align cur += n_as * sizeof(int64_t); // matrix_row_counts cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows } break; diff --git a/ggml.h b/ggml.h index 4c2ff6c661ea3..b18ba78120ca6 100644 --- a/ggml.h +++ b/ggml.h @@ -1165,6 +1165,11 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_cast( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_type type); + // make contiguous GGML_API struct ggml_tensor * ggml_cont( struct ggml_context * ctx, @@ -1842,8 +1847,8 @@ extern "C" { // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data - GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the 
context has enough memory for the work data diff --git a/llama.cpp b/llama.cpp index ce413f605163c..fe1d8947c73a0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,5 +1,4 @@ #define LLAMA_API_INTERNAL -//#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading #include "llama.h" #include "unicode.h" @@ -152,10 +151,6 @@ static bool is_float_close(float a, float b, float abs_tol) { return std::fabs(b - a) <= abs_tol; } -#ifdef GGML_USE_CPU_HBM -#include -#endif - static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { @@ -1190,12 +1185,6 @@ struct llama_mlock { #endif }; -typedef void (*offload_func_t)(struct ggml_tensor * tensor); - -static void ggml_offload_nop(struct ggml_tensor * tensor) { - (void) tensor; -} - static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); @@ -1211,19 +1200,14 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_ return std::string(result.data(), result.size()); } -static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) { +static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) { ggml_backend_buffer_type_t buft = nullptr; -#ifdef GGML_USE_METAL - if (n_gpu_layers > 0) { - buft = ggml_backend_metal_buffer_type(); - } -#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (n_gpu_layers > 0) { - buft = ggml_backend_cuda_buffer_type(0); +#if defined(GGML_USE_CUBLAS) + // host buffers should only be used when data is expected to be copied to/from the GPU + if (host_buffer) { + buft = ggml_backend_cuda_host_buffer_type(); } -#elif defined(GGML_USE_CUBLAS) - buft = ggml_backend_cuda_host_buffer_type(); #elif defined(GGML_USE_CPU_HBM) buft = ggml_backend_cpu_hbm_buffer_type(); #endif @@ -1231,10 +1215,45 @@ static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) { if (buft == nullptr) { buft = ggml_backend_cpu_buffer_type(); } + return buft; + + GGML_UNUSED(host_buffer); +} + +static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { + ggml_backend_buffer_type_t buft = nullptr; + +#ifdef GGML_USE_METAL + buft = ggml_backend_metal_buffer_type(); +#elif defined(GGML_USE_CUBLAS) + buft = ggml_backend_cuda_buffer_type(gpu); +#elif defined(GGML_USE_CLBLAST) + buft = ggml_backend_opencl_buffer_type(); +#endif + + if (buft == nullptr) { + buft = llama_default_buffer_type_cpu(true); + } + return buft; + + GGML_UNUSED(gpu); +} + +static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) { + ggml_backend_buffer_type_t buft = nullptr; + +#ifdef GGML_USE_CUBLAS + if (ggml_backend_cuda_get_device_count() > 1) { + buft = ggml_backend_cuda_split_buffer_type(tensor_split); + } +#endif + if (buft == nullptr) { + buft = llama_default_buffer_type_offload(fallback_gpu); + } return buft; - GGML_UNUSED(n_gpu_layers); + GGML_UNUSED(tensor_split); } // @@ -1445,24 +1464,24 @@ struct llama_kv_cache { std::vector k_l; // per layer std::vector v_l; - struct ggml_context * ctx = NULL; + std::vector ctxs; + std::vector bufs; - ggml_backend_buffer_t buf = NULL; + size_t total_size() const { + size_t size = 0; + for (ggml_backend_buffer_t buf : bufs) { + size += ggml_backend_buffer_get_size(buf); + } + return size; + } 
~llama_kv_cache() { -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (ggml_cublas_loaded()) { - for (size_t i = 0; i < k_l.size(); ++i) { - ggml_cuda_free_data(k_l[i]); - ggml_cuda_free_data(v_l[i]); - } - } -#endif - if (ctx) { + for (struct ggml_context * ctx : ctxs) { ggml_free(ctx); } - - ggml_backend_buffer_free(buf); + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } } }; @@ -1539,16 +1558,32 @@ struct llama_model { std::vector layers; + llama_split_mode split_mode; + int main_gpu; int n_gpu_layers; // gguf metadata std::unordered_map gguf_kv; - // context - struct ggml_context * ctx = NULL; + // layer -> buffer type mapping + struct layer_buft { + layer_buft() : buft_matrix(nullptr), buft(nullptr) {} + layer_buft(ggml_backend_buffer_type_t matrix) : buft_matrix(matrix), buft(matrix) {} + layer_buft(ggml_backend_buffer_type_t matrix, ggml_backend_buffer_type_t other) : buft_matrix(matrix), buft(other) {} + + ggml_backend_buffer_type_t buft_matrix; // matrices only - used by split buffers and backends that support only matrix multiplication + ggml_backend_buffer_type_t buft; // everything else + }; + + layer_buft buft_input; + layer_buft buft_output; + std::vector buft_layer; + + // contexts where the model tensors metadata is stored + std::vector ctxs; - // the model memory buffer - ggml_backend_buffer_t buf = NULL; + // the model memory buffers for the tensor data + std::vector bufs; // model memory mapped file std::unique_ptr mapping; @@ -1564,39 +1599,32 @@ struct llama_model { int64_t t_start_us = 0; ~llama_model() { -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (ggml_cublas_loaded()) { - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - ggml_cuda_free_data(tensors_by_name[i].second); - } - ggml_cuda_free_scratch(); - } -#endif - -#if defined(GGML_USE_CLBLAST) - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - ggml_cl_free_data(tensors_by_name[i].second); - } -#endif - if (ctx) { + for (struct ggml_context * ctx : ctxs) { ggml_free(ctx); } - - ggml_backend_buffer_free(buf); + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } } }; struct llama_context { llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {} ~llama_context() { - ggml_allocr_free(alloc); - ggml_backend_buffer_free(buf_alloc); - ggml_backend_free(backend); + ggml_backend_sched_free(sched); + + for (ggml_backend_t backend : backends) { + ggml_backend_free(backend); + } } llama_cparams cparams; - ggml_backend_t backend = nullptr; + std::vector backends; +#ifdef GGML_USE_METAL + ggml_backend_t backend_metal = nullptr; +#endif + ggml_backend_t backend_cpu = nullptr; const llama_model & model; @@ -1630,8 +1658,9 @@ struct llama_context { // memory buffers used to evaluate the model std::vector buf_compute_meta; - ggml_backend_buffer_t buf_alloc = NULL; - ggml_allocr * alloc = NULL; + ggml_backend_sched_t sched = nullptr; + // allocator for the input tensors + ggml_tallocr * alloc = nullptr; // temporary buffer for copying data to/from the backend std::vector> buf_copy; @@ -1646,16 +1675,17 @@ struct llama_context { // static bool llama_kv_cache_init( - const struct llama_hparams & hparams, struct llama_kv_cache & cache, + const llama_model & model, ggml_type ktype, ggml_type vtype, uint32_t n_ctx, - int n_gpu_layers, bool offload) { + const struct llama_hparams & hparams = model.hparams; + const uint32_t n_embd_k_gqa = 
hparams.n_embd_k_gqa(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const uint32_t n_layer = hparams.n_layer; + const int64_t n_layer = hparams.n_layer; cache.has_shift = false; @@ -1666,62 +1696,65 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(n_ctx); - struct ggml_init_params params; - params.mem_size = 2u*n_layer*ggml_tensor_overhead(); - params.mem_buffer = NULL; - params.no_alloc = true; - - cache.ctx = ggml_init(params); +#ifdef GGML_USE_CLBLAST + offload = false; +#endif - size_t vram_kv_cache = 0; + // count used buffer types + std::map buft_layer_count; + if (offload) { + for (int64_t i = 0; i < n_layer; ++i) { + buft_layer_count[model.buft_layer[i].buft]++; + } + } else { + buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer; + } - if (!cache.ctx) { - LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__); - return false; + // create a context for each buffer type + std::map ctx_map; + for (auto & it : buft_layer_count) { + int n_layers = it.second; + struct ggml_init_params params = { + /*.mem_size =*/ 2u*n_layers*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ggml_context * ctx = ggml_init(params); + if (!ctx) { + LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__); + return false; + } + ctx_map[it.first] = ctx; + cache.ctxs.push_back(ctx); } cache.k_l.reserve(n_layer); cache.v_l.reserve(n_layer); - const int i_gpu_start = (int) n_layer - n_gpu_layers; - for (int i = 0; i < (int) n_layer; i++) { - ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx); - ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx); + struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); + ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx); + ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); cache.v_l.push_back(v); -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (i >= i_gpu_start) { - if (offload) { - ggml_cuda_assign_buffers_no_scratch(k); - ggml_cuda_assign_buffers_no_scratch(v); - vram_kv_cache += ggml_nbytes(k); - vram_kv_cache += ggml_nbytes(v); - // HACK: mark tensor as allocated - k->data = v->data = (void *)(uintptr_t)1; - } - } -#endif // GGML_USE_CUBLAS } - // allocate tensors - cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers)); - - // buf may be NULL with full offload - if (cache.buf) { - // initialize the buffer to avoid NaNs in the padding - ggml_backend_buffer_clear(cache.buf, 0); - } - - if (vram_kv_cache > 0) { - LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); + // allocate tensors and initialize the buffers to avoid NaNs in the padding + for (auto it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__); + return false; + } + ggml_backend_buffer_clear(buf, 0); + LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); + cache.bufs.push_back(buf); } - GGML_UNUSED(i_gpu_start); - GGML_UNUSED(offload); - return true; } 
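
[editor's note, not part of the patch] The rewritten llama_kv_cache_init above follows the general ggml-backend allocation pattern used throughout this series: create a metadata-only context (no_alloc = true), add tensors to it, then allocate all of them at once in a single backend buffer of the desired type and clear it to avoid NaNs in the padding. The following is a minimal standalone sketch of that pattern, assuming only the public ggml calls that appear in the hunk itself (ggml_init, ggml_new_tensor_1d, ggml_backend_alloc_ctx_tensors_from_buft, ggml_backend_buffer_clear); the CPU buffer type, layer count and tensor sizes are placeholders chosen for illustration, not values from the patch.

// kv_alloc_sketch.cpp -- illustrative only, assumes the ggml backend API used in the diff
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <cstdio>
#include <vector>

int main() {
    const int     n_layer = 4;      // placeholder layer count
    const int64_t n_elems = 1024;   // placeholder per-layer KV size

    // metadata-only context: tensors are created without data (no_alloc = true)
    struct ggml_init_params params = {
        /*.mem_size   =*/ 2u*n_layer*ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    std::vector<ggml_tensor *> k_l, v_l;
    for (int i = 0; i < n_layer; ++i) {
        k_l.push_back(ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elems));
        v_l.push_back(ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elems));
    }

    // allocate every tensor of the context in one buffer of the chosen buffer type
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    ggml_backend_buffer_t      buf  = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
    if (buf == NULL) {
        fprintf(stderr, "failed to allocate KV buffer\n");
        return 1;
    }
    ggml_backend_buffer_clear(buf, 0); // initialize the buffer to avoid NaNs in the padding

    printf("%s KV buffer size = %8.2f MiB\n",
           ggml_backend_buffer_name(buf),
           ggml_backend_buffer_get_size(buf)/1024.0/1024.0);

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    return 0;
}

In the actual patch this is done once per buffer type (one context per offload target), which is why the cache stores vectors of contexts and buffers instead of a single ctx/buf pair.
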
@@ -2354,9 +2387,8 @@ struct llama_model_loader { return get_tensor_meta(get_tensor_name(i)); } - struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) { + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) { struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); - tensor->backend = backend; // TODO: ggml_set_backend ggml_set_name(tensor, ggml_get_name(meta)); n_created++; @@ -2364,7 +2396,7 @@ struct llama_model_loader { return tensor; } - struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend_type backend, bool required = true) { + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, bool required = true) { struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); if (cur == NULL) { @@ -2374,12 +2406,6 @@ struct llama_model_loader { throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); } - if (backend == GGML_BACKEND_GPU_SPLIT) { - if (ne.size() == 1) { - throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str())); - } - } - { bool is_ok = true; for (size_t i = 0; i < ne.size(); ++i) { @@ -2397,7 +2423,7 @@ struct llama_model_loader { } } - return create_tensor_for(ctx, cur, backend); + return create_tensor_for(ctx, cur); } void done_getting_tensors() const { @@ -2416,25 +2442,35 @@ struct llama_model_loader { return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); } - void init_mapping(bool prefetch = true) { - /* - // prefetch only CPU tensors + void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) { + // prefetch the whole file - all the data is needed anyway if (use_mmap) { - size_t size_pref = 0; // prefetch + mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa())); + } - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - if (cur->backend == GGML_BACKEND_CPU) { - size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur); - size_pref = std::max(size_pref, tensor_end); - } + // compute the total size of all tensors for progress reporting + for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { + struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); + size_data += ggml_nbytes(cur); + } + + if (use_mmap && mapping) { + if (lmlock) { + lmlock->init(mapping->addr); } - mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa())); + mmap_used_first = mapping->size; } - */ - // prefetch the whole file - all the data is needed anyway - if (use_mmap) { - mapping.reset(new llama_mmap(&file, prefetch ? 
-1 : 0, ggml_is_numa())); + } + + void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const { + GGML_ASSERT(mapping); + + *first = mapping->size; + *last = 0; + for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { + const size_t offs = file_offset(ggml_get_name(tensor)); + *first = std::min(*first, offs); + *last = std::max(*last, offs + ggml_nbytes(tensor)); } } @@ -2443,8 +2479,11 @@ struct llama_model_loader { const size_t offs = file_offset(ggml_get_name(cur)); if (use_mmap && mapping) { - GGML_ASSERT(cur->data == nullptr); - cur->data = (uint8_t *)mapping->addr + offs; + if (cur->data == nullptr) { + cur->data = (uint8_t *)mapping->addr + offs; + } else { + memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur)); + } } else { GGML_ASSERT(cur->data != nullptr); file.seek(offs, SEEK_SET); @@ -2452,37 +2491,23 @@ struct llama_model_loader { } } - // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { - size_t size_data = 0; - - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - size_data += ggml_nbytes(cur); - } - - if (use_mmap && buf_mmap) { - if (lmlock) { - lmlock->init(mapping->addr); - } - } + size_t size_done = 0; + size_t size_data = 0; + size_t mmap_used_first = -1; + size_t mmap_used_last = 0; -#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST) - const bool legacy_offload = true; -#else - const bool legacy_offload = false; -#endif + // Returns false if cancelled by progress_callback + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) { + GGML_ASSERT(size_data != 0 && "call init_mapping() first"); std::vector> read_buf; - size_t size_done = 0; - - size_t mmap_first = -1; - size_t mmap_last = 0; - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - GGML_ASSERT(cur); // unused tensors should have been caught by load_data already + if (!cur) { + // some tensors may be allocated in a different context + continue; + } if (progress_callback) { if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { @@ -2492,67 +2517,48 @@ struct llama_model_loader { const size_t offs = file_offset(ggml_get_name(cur)); - if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) { - if (use_mmap && mapping) { - if (buf_mmap) { - ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); - if (lmlock) { - lmlock->grow_to(offs + ggml_nbytes(cur)); - } - mmap_first = std::min(mmap_first, offs); - mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur)); - } else { - ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur)); + if (use_mmap && mapping) { + if (buf_mmap && cur->data == nullptr) { + ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); + if (lmlock) { + lmlock->grow_to(offs + ggml_nbytes(cur)); } + mmap_used_first = std::min(mmap_used_first, offs); + mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur)); } else { - if 
(ggml_backend_buffer_is_host(cur->buffer)) { - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, ggml_nbytes(cur)); - } else { - read_buf.resize(ggml_nbytes(cur)); - file.seek(offs, SEEK_SET); - file.read_raw(read_buf.data(), ggml_nbytes(cur)); - ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); - } + ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur)); } } else { - // HACK: mark tensor as allocated - cur->data = (void *)(uintptr_t)1; - void * data; - if (use_mmap && mapping) { - data = (uint8_t *) mapping->addr + offs; + if (ggml_backend_buffer_is_host(cur->buffer)) { + file.seek(offs, SEEK_SET); + file.read_raw(cur->data, ggml_nbytes(cur)); } else { read_buf.resize(ggml_nbytes(cur)); file.seek(offs, SEEK_SET); file.read_raw(read_buf.data(), ggml_nbytes(cur)); - data = read_buf.data(); + ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); } - -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - ggml_cuda_transform_tensor(data, cur); -#elif defined(GGML_USE_CLBLAST) - GGML_ASSERT(cur->backend == GGML_BACKEND_GPU); - ggml_cl_transform_tensor(data, cur); -#else - GGML_ASSERT(!"GPU tensor without a GPU backend"); - GGML_UNUSED(data); -#endif } size_done += ggml_nbytes(cur); } - // unmap offloaded tensors and metadata - if (use_mmap && mapping) { - mapping->unmap_fragment(0, mmap_first); - mapping->unmap_fragment(mmap_last, mapping->size); + // check if this is the last call and do final cleanup + if (size_done >= size_data) { + // unmap offloaded tensors and metadata + if (use_mmap && mapping) { + mapping->unmap_fragment(0, mmap_used_first); + if (mmap_used_last != 0) { + mapping->unmap_fragment(mmap_used_last, mapping->size); + } + } + if (progress_callback) { + // Even though the model is done loading, we still honor + // cancellation since we need to free allocations. + return progress_callback(1.0f, progress_callback_user_data); + } } - if (progress_callback) { - // Even though the model is done loading, we still honor - // cancellation since we need to free allocations. 
- return progress_callback(1.0f, progress_callback_user_data); - } return true; } }; @@ -3181,6 +3187,7 @@ static bool llm_load_tensors( llama_model_loader & ml, llama_model & model, int n_gpu_layers, + enum llama_split_mode split_mode, int main_gpu, const float * tensor_split, bool use_mlock, @@ -3188,702 +3195,563 @@ static bool llm_load_tensors( void * progress_callback_user_data) { model.t_start_us = ggml_time_us(); - auto & ctx = model.ctx; auto & hparams = model.hparams; + model.split_mode = split_mode; + model.main_gpu = main_gpu; model.n_gpu_layers = n_gpu_layers; - size_t ctx_size = ggml_tensor_overhead() * ml.n_tensors; + const int64_t n_layer = hparams.n_layer; + const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0); + + // there is very little benefit to offloading the input layer, so always keep it on the CPU + model.buft_input = llama_default_buffer_type_cpu(true); - LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0); + model.buft_layer.resize(n_layer); - // create the ggml context + // assign cpu layers + for (int64_t i = 0; i < i_gpu_start; ++i) { + model.buft_layer[i] = llama_default_buffer_type_cpu(true); + } + +#ifdef GGML_USE_CUBLAS + if (split_mode == LLAMA_SPLIT_LAYER) { + // calculate the split points + int device_count = ggml_backend_cuda_get_device_count(); + bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); + float splits[GGML_CUDA_MAX_DEVICES]; + if (all_zero) { + // default split, by free memory + for (int i = 0; i < device_count; ++i) { + size_t total; + size_t free; + ggml_backend_cuda_get_device_memory(i, &total, &free); + splits[i] = free; + } + } else { + std::copy(tensor_split, tensor_split + device_count, splits); + } + + // sum and normalize the splits to get the split points + float split_sum = 0.0f; + for (int i = 0; i < device_count; ++i) { + split_sum += splits[i]; + splits[i] = split_sum; + } + for (int i = 0; i < device_count; ++i) { + splits[i] /= split_sum; + } + + // assign the repeating layers to the devices according to the splits + int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1); + for (int64_t i = i_gpu_start; i < n_layer; ++i) { + int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits; + model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu); + } + // assign the output layer + if (n_gpu_layers > n_layer) { + int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits; + model.buft_output = llama_default_buffer_type_offload(layer_gpu); + } else { + model.buft_output = llama_default_buffer_type_cpu(true); + } + } else +#endif { + ggml_backend_buffer_type_t split_buft; + if (split_mode == LLAMA_SPLIT_ROW) { + split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); + } else { + // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported + split_buft = llama_default_buffer_type_offload(main_gpu); + } + // assign the repeating layers + for (int64_t i = i_gpu_start; i < n_layer; ++i) { + model.buft_layer[i] = { + split_buft, + llama_default_buffer_type_offload(main_gpu) + }; + } + // assign the output layer + if (n_gpu_layers > n_layer) { + model.buft_output = { + split_buft, + llama_default_buffer_type_offload(main_gpu) + }; + } else { + model.buft_output = llama_default_buffer_type_cpu(true); + } + } + + // count used buffer 
types + std::map buft_layer_count; + buft_layer_count[model.buft_input.buft]++; + buft_layer_count[model.buft_input.buft_matrix]++; + buft_layer_count[model.buft_output.buft]++; + buft_layer_count[model.buft_output.buft_matrix]++; + for (int64_t i = 0; i < n_layer; ++i) { + buft_layer_count[model.buft_layer[i].buft]++; + buft_layer_count[model.buft_layer[i].buft_matrix]++; + } + + // create one context per buffer type + size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors; + std::map ctx_map; + for (auto & it : buft_layer_count) { struct ggml_init_params params = { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - throw std::runtime_error(format("ggml_init() failed")); + ggml_context * ctx = ggml_init(params); + if (!ctx) { + throw std::runtime_error(format("failed to create context")); } + ctx_map[it.first] = ctx; + model.ctxs.push_back(ctx); } - (void) main_gpu; - - enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU; - enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU; - -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (ggml_cublas_loaded()) { - LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); - - llama_backend_offload = GGML_BACKEND_GPU; - llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT; - } -#elif defined(GGML_USE_CLBLAST) - LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); - llama_backend_offload = GGML_BACKEND_GPU; - llama_backend_offload_split = GGML_BACKEND_GPU; -#endif + LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0); // create tensors for the weights { const int64_t n_embd = hparams.n_embd; const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const int64_t n_layer = hparams.n_layer; + const int64_t n_embd_gqa = n_embd_v_gqa; const int64_t n_vocab = hparams.n_vocab; + const int64_t n_ff = hparams.n_ff; + + GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); + + ggml_context * ctx_input = ctx_map.at(model.buft_input.buft); + ggml_context * ctx_output = ctx_map.at(model.buft_output.buft); + ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix); + auto ctx_for_layer = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); }; + auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); }; + + model.layers.resize(n_layer); const auto tn = LLM_TN(model.arch); switch (model.arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, 
tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); // optional bias tensors - layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false); - layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false); - layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false); + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false); + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false); if (layer.ffn_gate_inp == nullptr) { GGML_ASSERT(hparams.n_expert == 0); GGML_ASSERT(hparams.n_expert_used == 0); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, 
n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } else { GGML_ASSERT(hparams.n_expert > 0); GGML_ASSERT(hparams.n_expert_used > 0); // MoE branch for (uint32_t x = 0; x < hparams.n_expert; ++x) { - layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); - layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split); - layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); + layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}); + layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}); + layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}); } } } } break; case LLM_ARCH_BAICHUAN: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_FALCON: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < 
n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { - layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend); - layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend); + layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}); + layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}); } - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_STARCODER: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = 
ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; case LLM_ARCH_PERSIMMON: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; 
- - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); - const int i_gpu_start = n_layer - n_gpu_layers; - model.layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); - layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); - layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend); - layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, 
tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); + + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}); + layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}); + + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}); + layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}); } } break; case LLM_ARCH_BLOOM: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); - model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); + model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; case LLM_ARCH_MPT: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = 
ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); // AWQ ScaleActivation layer - layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false); + layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false); } } break; case LLM_ARCH_STABLELM: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = 
n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - /* - llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ] - */ - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_QWEN: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = 
ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - } + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - const uint32_t n_ff = hparams.n_ff / 2; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}); } } break; case LLM_ARCH_PHI2: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - model.output_b = ml.create_tensor(ctx, 
tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + model.output_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; case LLM_ARCH_PLAMO: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if 
(n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_GPT2: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); // output { - ggml_backend_type backend_norm; - 
ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = 
ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; default: @@ -3893,78 +3761,51 @@ static bool llm_load_tensors( ml.done_getting_tensors(); - ml.init_mapping(); + ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr); - // allocate tensors - size_t vram_weights = 0; - size_t buf_size = 0; + // create the backend buffers + std::vector> ctx_bufs; - ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers); + for (auto & it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = nullptr; - for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { - // GGML_BACKEND_GPU tensors are for CUDA and OpenCL only, which are handled separately without ggml-backend - if (t->backend == GGML_BACKEND_CPU) { - buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft)); - } else { - vram_weights += ggml_nbytes(t); + // only the mmap region containing the tensors in the model is mapped to the backend buffer + // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers + // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size + if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) { + size_t first, last; + ml.get_mapping_range(&first, &last, ctx); + buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first); } - } - - // create backend buffer - ggml_backend_buffer_t buf_mmap = nullptr; - #ifdef GGML_USE_METAL - if (n_gpu_layers > 0) { - if (ml.use_mmap) { + else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) { const size_t max_size = ggml_get_max_tensor_size(ctx); - model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size); - buf_mmap = model.buf; - } else { - model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); + size_t first, last; + ml.get_mapping_range(&first, &last, ctx); + buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size); } - } -#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) - // for testing only - if (n_gpu_layers > 0) { - model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0)); - } #endif - - if (model.buf == nullptr) { - // CPU backend, and indirectly CUDA and OpenCL - if (ml.use_mmap) { - model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size); - buf_mmap = model.buf; - } else { - // allocate only CPU tensors - model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size); - ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf); - for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { - if (t->backend == GGML_BACKEND_CPU) { - ggml_tallocr_alloc(alloc, t); - } + else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (buf != 
nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) { + model.mlock_buf.init (ggml_backend_buffer_get_base(buf)); + model.mlock_buf.grow_to(ggml_backend_buffer_get_size(buf)); } - ggml_tallocr_free(alloc); } - } - - if (use_mlock && ggml_backend_buffer_is_host(model.buf)) { - model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf)); - model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf)); + if (buf == nullptr) { + throw std::runtime_error("failed to allocate buffer"); + } + // indicate that this buffer contains weights + // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight + ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + model.bufs.push_back(buf); + ctx_bufs.emplace_back(ctx, buf); } // print memory requirements { - size_t sys_mem_required = ctx_size + buf_size; - - if (sys_mem_required > 0) { - LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0); - } - if (vram_weights > 0) { - LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); - } - -#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); @@ -3976,23 +3817,26 @@ static bool llm_load_tensors( const int max_offloadable_layers = hparams.n_layer + 1; LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); -#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) - } -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - ggml_cuda_set_tensor_split(tensor_split); -#else - GGML_UNUSED(tensor_split); -#endif // GGML_USE_CUBLAS + for (ggml_backend_buffer_t buf : model.bufs) { + LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); + } + } // populate tensors_by_name - for (int i = 0; i < ml.n_tensors; ++i) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i)); - model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); + for (ggml_context * ctx : model.ctxs) { + for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { + model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); + } } - if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) { - return false; + // load tensor data + for (auto & it : ctx_bufs) { + ggml_context * ctx = it.first; + ggml_backend_buffer_t buf = it.second; + if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? 
&model.mlock_mmap : NULL)) { + return false; + } } model.mapping = std::move(ml.mapping); @@ -4026,13 +3870,13 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons } if (!llm_load_tensors( - ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, + ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data )) { return -2; } } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); + LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); return -1; } @@ -4476,8 +4320,6 @@ struct llm_build_context { do_rope_shift (worst_case || kv_self.has_shift), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { - GGML_ASSERT(!!kv_self.ctx); - // all initializations should be done in init() } @@ -4557,6 +4399,12 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(gf, Qcur); + ggml_build_forward_expand(gf, Kcur); + ggml_build_forward_expand(gf, Vcur); + Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, @@ -6077,199 +5925,13 @@ struct llm_build_context { } }; -// -// tensor offloading helpers -// -// TODO: will be removed with backend v2 - -enum llm_offload_func_e { - OFFLOAD_FUNC_NOP, - OFFLOAD_FUNC, - OFFLOAD_FUNC_FRC, // force offload - OFFLOAD_FUNC_KQV, - OFFLOAD_FUNC_NR, - OFFLOAD_FUNC_EMB, // embeddings - OFFLOAD_FUNC_OUT, -}; - -// TODO: will be removed with backend v2 -struct llm_offload_trie { - struct node { - ~node() { - for (int i = 0; i < 256; ++i) { - if (children[i]) { - delete children[i]; - } - } - } - - node * children[256] = { nullptr }; - llm_offload_func_e func = OFFLOAD_FUNC_NOP; - }; - - llm_offload_trie() { - root = new node; - } - - llm_offload_trie(const std::unordered_map & map) { - root = new node; - - for (const auto & kv : map) { - add(kv.first, kv.second); - } - } - - ~llm_offload_trie() { - delete root; - } - - void add(const char * name, llm_offload_func_e func) { - node * cur = root; - - for (int i = 0; ; ++i) { - const uint8_t c = name[i]; - - if (!c) { - break; - } - - if (!cur->children[c]) { - cur->children[c] = new node; - } - - cur = cur->children[c]; - } - - cur->func = func; - } - - llm_offload_func_e find(const char * name) const { - const node * cur = root; - - for (int i = 0; ; ++i) { - const uint8_t c = name[i]; - - if (!c) { - break; - } - - if (!cur->children[c]) { - return OFFLOAD_FUNC_NOP; - } - - cur = cur->children[c]; - } - - return cur->func; - } - - node * root = nullptr; -}; - -// TODO: will be removed with backend v2 -static const std::unordered_map k_offload_map = { - //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel - //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel - { "pos_embd", OFFLOAD_FUNC_NR }, - - { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. 
rope) - { "KQ_mask", OFFLOAD_FUNC_FRC }, - { "K_shift", OFFLOAD_FUNC_FRC }, - - { "K_shifted", OFFLOAD_FUNC }, - - { "inp_norm", OFFLOAD_FUNC_NR }, - { "inp_norm_w", OFFLOAD_FUNC_NR }, - { "inp_norm_wb", OFFLOAD_FUNC_NR }, - - { "norm", OFFLOAD_FUNC }, - { "norm_w", OFFLOAD_FUNC }, - { "norm_wb", OFFLOAD_FUNC }, - - { "attn_norm", OFFLOAD_FUNC }, - { "attn_norm_2", OFFLOAD_FUNC }, - - { "wqkv", OFFLOAD_FUNC_KQV }, - { "bqkv", OFFLOAD_FUNC_KQV }, - { "wqkv_clamped", OFFLOAD_FUNC_KQV }, - - { "tmpk", OFFLOAD_FUNC_KQV }, - { "tmpq", OFFLOAD_FUNC_KQV }, - { "tmpv", OFFLOAD_FUNC_KQV }, - { "Kcur", OFFLOAD_FUNC_KQV }, - { "Qcur", OFFLOAD_FUNC_KQV }, - { "Vcur", OFFLOAD_FUNC_KQV }, - - { "krot", OFFLOAD_FUNC_KQV }, - { "qrot", OFFLOAD_FUNC_KQV }, - { "kpass", OFFLOAD_FUNC_KQV }, - { "qpass", OFFLOAD_FUNC_KQV }, - { "krotated", OFFLOAD_FUNC_KQV }, - { "qrotated", OFFLOAD_FUNC_KQV }, - - { "q", OFFLOAD_FUNC_KQV }, - { "k", OFFLOAD_FUNC_KQV }, - { "kq", OFFLOAD_FUNC_KQV }, - { "kq_scaled", OFFLOAD_FUNC_KQV }, - { "kq_scaled_alibi", OFFLOAD_FUNC_KQV }, - { "kq_masked", OFFLOAD_FUNC_KQV }, - { "kq_soft_max", OFFLOAD_FUNC_KQV }, - { "kq_soft_max_ext", OFFLOAD_FUNC_KQV }, - { "v", OFFLOAD_FUNC_KQV }, - { "kqv", OFFLOAD_FUNC_KQV }, - { "kqv_merged", OFFLOAD_FUNC_KQV }, - { "kqv_merged_cont", OFFLOAD_FUNC_KQV }, - { "kqv_wo", OFFLOAD_FUNC_KQV }, - { "kqv_out", OFFLOAD_FUNC_KQV }, - - { "ffn_inp", OFFLOAD_FUNC }, - { "ffn_norm", OFFLOAD_FUNC }, - - { "ffn_up", OFFLOAD_FUNC }, - { "ffn_up_b", OFFLOAD_FUNC }, - { "ffn_gate", OFFLOAD_FUNC }, - { "ffn_gate_b", OFFLOAD_FUNC }, - { "ffn_gate_par", OFFLOAD_FUNC }, - { "ffn_act", OFFLOAD_FUNC }, - { "ffn_down", OFFLOAD_FUNC }, - { "ffn_down_b", OFFLOAD_FUNC }, - { "ffn_out", OFFLOAD_FUNC }, - - { "ffn_silu", OFFLOAD_FUNC }, - { "ffn_gelu", OFFLOAD_FUNC }, - { "ffn_relu", OFFLOAD_FUNC }, - { "ffn_sqr(relu)", OFFLOAD_FUNC }, - - { "ffn_moe_logits", OFFLOAD_FUNC }, - { "ffn_moe_probs", OFFLOAD_FUNC }, - { "ffn_moe_argsort", OFFLOAD_FUNC }, - { "ffn_moe_weights", OFFLOAD_FUNC }, - { "ffn_moe_weights_sum", OFFLOAD_FUNC }, - { "ffn_moe_weights_norm", OFFLOAD_FUNC }, - { "ffn_moe_weighted", OFFLOAD_FUNC }, - { "ffn_moe_up", OFFLOAD_FUNC }, - { "ffn_moe_gate", OFFLOAD_FUNC }, - { "ffn_moe_silu", OFFLOAD_FUNC }, - { "ffn_moe_gate_par", OFFLOAD_FUNC }, - { "ffn_moe_down", OFFLOAD_FUNC }, - { "ffn_moe_out", OFFLOAD_FUNC }, - - { "l_out", OFFLOAD_FUNC }, - - { "result_norm", OFFLOAD_FUNC_EMB }, - { "result_output_no_bias", OFFLOAD_FUNC_EMB }, - { "result_output", OFFLOAD_FUNC_OUT }, -}; - -static llm_offload_trie k_offload_func_trie(k_offload_map); - static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch) { const auto & model = lctx.model; // check if we should build the worst-case graph (for memory measurement) - const bool worst_case = ggml_allocr_is_measure(lctx.alloc); + const bool worst_case = ggml_tallocr_is_measure(lctx.alloc); // keep track of the input that has already been allocated bool alloc_inp_tokens = false; @@ -6278,16 +5940,8 @@ static struct ggml_cgraph * llama_build_graph( bool alloc_inp_KQ_mask = false; bool alloc_inp_K_shift = false; -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - const bool do_offload = true; -#else - const bool do_offload = true; // TODO: set to false after finishing refactoring -#endif - - int n_non_view = 0; // number of non-view tensors that have been processed by the callback - // this callback allows us to apply custom logic to each tensor (e.g. 
ggml-alloc, offloading, etc.) - // TODO: will be removed with backend v2 + // TODO: improve handling of input and output tensors, then replace this with ggml_set_name llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { if (il >= 0) { ggml_format_name(cur, "%s-%d", name, il); @@ -6298,12 +5952,11 @@ static struct ggml_cgraph * llama_build_graph( // // allocate input tensors and set input data // - // TODO: will be removed with backend v2 if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) { + if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) { const int64_t n_tokens = cur->ne[0]; ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur)); @@ -6312,10 +5965,10 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_tokens = true; } - if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) { + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc) && batch.embd) { + if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) { const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; @@ -6326,9 +5979,9 @@ static struct ggml_cgraph * llama_build_graph( } if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) { + if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) { const int64_t n_tokens = cur->ne[0]; static_assert(std::is_same::value, "llama_pos must be int32_t"); @@ -6339,9 +5992,9 @@ static struct ggml_cgraph * llama_build_graph( } if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_tallocr_is_measure(lctx.alloc)) { const int64_t n_kv = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; @@ -6379,9 +6032,9 @@ static struct ggml_cgraph * llama_build_graph( } if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_tallocr_is_measure(lctx.alloc)) { const int64_t n_ctx = cur->ne[0]; int32_t * data; @@ -6403,136 +6056,6 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_K_shift = true; } - - // view tensors are not processed further - if (cur->view_src != nullptr) { - return; - } - - if (cur->op != GGML_OP_NONE) { - n_non_view++; - } - - // - // offload layers - // - // TODO: will be removed with backend v2 - -//#define LLAMA_OFFLOAD_DEBUG - - if (!do_offload) { - return; - } - - const int n_layer = model.hparams.n_layer; - - const int n_gpu_layers = model.n_gpu_layers; - const int i_gpu_start = n_layer - n_gpu_layers; - - // should we offload the final norm? 
yes if we are not computing embeddings - const bool offload_emb = lctx.embedding.empty(); - - static const std::unordered_map> k_offload_func_name = { - { OFFLOAD_FUNC_NOP, "CPU" }, - { OFFLOAD_FUNC_OUT, "CPU" }, -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - { OFFLOAD_FUNC, "GPU (CUDA)" }, - { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" }, - { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" }, - { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, - { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, -#else - { OFFLOAD_FUNC, "CPU" }, - { OFFLOAD_FUNC_FRC, "CPU" }, - { OFFLOAD_FUNC_KQV, "CPU" }, - { OFFLOAD_FUNC_NR, "CPU" }, - { OFFLOAD_FUNC_EMB, "CPU" }, -#endif // GGML_USE_CUBLAS - }; - - // check the global map for what offload function to use for this tensor - llm_offload_func_e func_e = k_offload_func_trie.find(name); - - if (func_e == OFFLOAD_FUNC_NOP) { -#ifdef LLAMA_OFFLOAD_DEBUG - // if a tensor hasn't been offloaded, we warn the user - if (worst_case) { - LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, - cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); - } -#endif - - return; - } - - // count the number of layers and respect the provided n_gpu_layers - switch (func_e) { - case OFFLOAD_FUNC_NOP: - case OFFLOAD_FUNC_OUT: - break; - case OFFLOAD_FUNC: - if (n_gpu_layers < n_layer) { - if (il < i_gpu_start) { - func_e = OFFLOAD_FUNC_NOP; - } - } - break; - case OFFLOAD_FUNC_FRC: - if (!lctx.cparams.offload_kqv) { - func_e = OFFLOAD_FUNC_NOP; - } break; - case OFFLOAD_FUNC_KQV: - if (!lctx.cparams.offload_kqv) { - func_e = OFFLOAD_FUNC_NOP; - } else { - if (n_gpu_layers < n_layer) { - if (il < i_gpu_start) { - func_e = OFFLOAD_FUNC_NOP; - } - } - } - break; - case OFFLOAD_FUNC_NR: - if (n_gpu_layers <= n_layer + 0) { - func_e = OFFLOAD_FUNC_NOP; - } - break; - case OFFLOAD_FUNC_EMB: - if (!offload_emb || n_gpu_layers < n_layer) { - func_e = OFFLOAD_FUNC_NOP; - } - break; - default: GGML_ASSERT(false); - } - - offload_func_t func = ggml_offload_nop; - - // this is needed for compatibility with Metal for example -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc; -#else - static offload_func_t ggml_offload_gpu = ggml_offload_nop; -#endif - - switch (func_e) { - case OFFLOAD_FUNC_NOP: - case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break; - case OFFLOAD_FUNC: - case OFFLOAD_FUNC_KQV: - case OFFLOAD_FUNC_FRC: - case OFFLOAD_FUNC_NR: - case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break; - default: GGML_ASSERT(false); - } - - // apply offload function to the tensor - func(cur); - -#ifdef LLAMA_OFFLOAD_DEBUG - if (worst_case) { - LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str()); - } -#endif }; struct ggml_cgraph * result = NULL; @@ -6600,27 +6123,6 @@ static struct ggml_cgraph * llama_build_graph( llm.free(); - if (worst_case) { - int n_non_view_total = 0; - - for (int i = 0; i < result->n_nodes; ++i) { - if (result->nodes[i]->view_src == nullptr) { - n_non_view_total++; - } - } - - LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total); - - if (n_non_view != n_non_view_total) { - LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); - LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__); - LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__); - LLAMA_LOG_WARN("%s: 
build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__); - LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__); - LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); - } - } - return result; } @@ -6666,8 +6168,6 @@ static int llama_decode_internal( auto & kv_self = lctx.kv_self; - GGML_ASSERT(!!kv_self.ctx); - const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; @@ -6721,12 +6221,10 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - ggml_allocr_reset(lctx.alloc); + ggml_backend_sched_reset(lctx.sched); ggml_cgraph * gf = llama_build_graph(lctx, batch); - ggml_allocr_alloc_graph(lctx.alloc, gf); - // the output is always the last tensor in the graph struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; GGML_ASSERT(strcmp(res->name, "result_output") == 0); @@ -6738,30 +6236,6 @@ static int llama_decode_internal( GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); } -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc); - for (int i = 0; i < gf->n_leafs; i++) { - ggml_tensor * node = gf->leafs[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base); - ggml_cuda_copy_to_device(node); - } - } - - for (int i = 0; i < gf->n_nodes; i++) { - ggml_tensor * node = gf->nodes[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base); - } - } - - // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed - if (!lctx.embedding.empty()) { - embeddings->backend = GGML_BACKEND_CPU; - } - res->backend = GGML_BACKEND_CPU; -#endif - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); // for big prompts, if BLAS is enabled, it is better to use only one thread @@ -6784,15 +6258,17 @@ static int llama_decode_internal( #endif #ifdef GGML_USE_METAL - if (ggml_backend_is_metal(lctx.backend)) { - ggml_backend_metal_set_n_cb(lctx.backend, n_threads); + if (ggml_backend_is_metal(lctx.backend_metal)) { + ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); } #endif - if (ggml_backend_is_cpu(lctx.backend)) { - ggml_backend_cpu_set_n_threads(lctx.backend, n_threads); + if (lctx.backend_cpu != nullptr) { + ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); } - ggml_backend_graph_compute(lctx.backend, gf); + ggml_backend_sched_graph_compute(lctx.sched, gf); + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); #ifdef GGML_USE_MPI ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); @@ -6840,30 +6316,33 @@ static int llama_decode_internal( logits_out.clear(); #endif + ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res); + GGML_ASSERT(res_backend != nullptr); if (batch.logits) { logits_out.resize(n_vocab * n_tokens); for (uint32_t i = 0; i < n_tokens; i++) { if (batch.logits[i] == 0) { continue; } - ggml_backend_tensor_get(res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float)); + ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), 
(n_vocab*i)*sizeof(float), n_vocab*sizeof(float)); #ifndef NDEBUG logits_valid[i] = true; #endif } } else if (lctx.logits_all) { logits_out.resize(n_vocab * n_tokens); - ggml_backend_tensor_get(res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float)); + ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float)); #ifndef NDEBUG std::fill(logits_valid.begin(), logits_valid.end(), true); #endif } else { logits_out.resize(n_vocab); - ggml_backend_tensor_get(res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float)); + ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float)); #ifndef NDEBUG logits_valid[0] = true; #endif } + ggml_backend_synchronize(res_backend); } // extract embeddings @@ -6871,7 +6350,9 @@ static int llama_decode_internal( auto & embedding_out = lctx.embedding; embedding_out.resize(n_embd); - ggml_backend_tensor_get(embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float)); + ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings); + ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float)); + ggml_backend_synchronize(embeddings_backend); } // measure the performance only for the single-token evals @@ -9347,48 +8828,23 @@ static int llama_apply_lora_from_file_internal( LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); - // create a name -> tensor map of the model to accelerate lookups - // find the max tensor size to estimate the required temporary buffer size - size_t max_tensor_size = 0; - std::unordered_map model_tensors; - for (const auto & kv : model.tensors_by_name) { - model_tensors.insert(kv); - size_t f32_size = ggml_nelements(kv.second) * sizeof(float); - max_tensor_size = std::max(max_tensor_size, f32_size); - } - - // create a temporary ggml context to store the lora tensors - // TODO: use ggml-alloc - size_t lora_ctx_size = max_tensor_size * 3; - LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0); - std::vector lora_buf(lora_ctx_size); - - struct ggml_init_params params; - params.mem_size = lora_buf.size(); - params.mem_buffer = lora_buf.data(); - params.no_alloc = false; - - using unique_context = std::unique_ptr; - - unique_context lora_ctx(nullptr, ggml_free); - lora_ctx.reset(ggml_init(params)); - std::unordered_map lora_tensors; - // load base model std::unique_ptr ml; - - if (path_base_model) { + if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr)); - ml->init_mapping(false); // no prefetching + ml->init_mapping(/*prefetch*/ false); // no prefetching } - // read tensors and apply - bool warned = false; - int n_tensors = 0; - - std::vector work_buffer; + struct tensor_meta { + std::string name; + ggml_type type; + int32_t ne[2]; + size_t offset; + }; + std::map tensor_meta_map; + // load all tensor meta while (true) { if (fin.tell() == fin.size) { // eof @@ -9401,7 +8857,7 @@ static int llama_apply_lora_from_file_internal( fin.read_raw(&n_dims, sizeof(n_dims)); fin.read_raw(&name_len, sizeof(name_len)); - fin.read_raw(&ftype, sizeof(ftype)); + fin.read_raw(&ftype, sizeof(ftype)); if (n_dims != 1 && 
n_dims != 2) { LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); @@ -9415,31 +8871,23 @@ static int llama_apply_lora_from_file_internal( std::string name; { - GGML_ASSERT(name_len <= 1024); - char buf[1024]; + GGML_ASSERT(name_len < GGML_MAX_NAME); + char buf[GGML_MAX_NAME]; fin.read_raw(buf, name_len); name = std::string(buf, name_len); } - // check for lora suffix and get the type of tensor - const std::string lora_suffix = ".lora"; - size_t pos = name.rfind(lora_suffix); - if (pos == std::string::npos) { - LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); - return 1; + // check for lora suffix + std::string lora_suffix; + if (name.length() > 6) { + lora_suffix = name.substr(name.length() - 6); } - - std::string lora_type = name.substr(pos + lora_suffix.length()); - std::string base_name = name; - base_name.erase(pos); - // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str()); - - if (model_tensors.find(base_name) == model_tensors.end()) { - LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); + if (lora_suffix != ".loraA" && lora_suffix != ".loraB") { + LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); return 1; } - // create ggml tensor + // tensor type ggml_type wtype; switch (ftype) { case 0: wtype = GGML_TYPE_F32; break; @@ -9451,122 +8899,177 @@ static int llama_apply_lora_from_file_internal( return false; } } - ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]); - ggml_set_name(lora_tensor, name.c_str()); - // load tensor data + // data offset size_t offset = fin.tell(); - size_t tensor_data_size = ggml_nbytes(lora_tensor); offset = (offset + 31) & -32; - fin.seek(offset, SEEK_SET); - fin.read_raw(lora_tensor->data, tensor_data_size); - lora_tensors[name] = lora_tensor; + // skip tensor data + fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET); + + tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset }); + } - // check if we have both A and B tensors and apply - if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() && - lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) { + bool warned = false; + int n_tensors = 0; - ggml_tensor * dest_t = model_tensors[base_name]; + // apply + ggml_backend_t backend_cpu = ggml_backend_cpu_init(); + if (backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__); + return 1; + } + ggml_backend_cpu_set_n_threads(backend_cpu, n_threads); - offload_func_t offload_func = ggml_offload_nop; - offload_func_t offload_func_force_inplace = ggml_offload_nop; + std::vector> read_buf; + for (const auto & it : model.tensors_by_name) { + const std::string & base_name = it.first; + ggml_tensor * model_t = it.second; -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { - if (dest_t->type != GGML_TYPE_F16) { - throw std::runtime_error(format( - "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. 
dest_t->type: %d", __func__, dest_t->type)); - } - offload_func = ggml_cuda_assign_buffers; - offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace; - } -#endif // GGML_USE_CUBLAS + if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() || + tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) { + continue; + } - ggml_tensor * base_t; - if (ml) { - struct gguf_context * ctx_gguf = ml->ctx_gguf; + tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA"); + tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB"); - // load from base model - if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) { - LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); - return 1; - } + ggml_init_params lora_init_params = { + /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), + /* .mem_buffer */ nullptr, + /* .no_alloc */ true, + }; + ggml_context * lora_ctx = ggml_init(lora_init_params); + if (lora_ctx == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__); + ggml_backend_free(backend_cpu); + return 1; + } - base_t = ml->get_tensor_meta(base_name.c_str()); - ml->load_data_for(base_t); - } else { - base_t = dest_t; - } + // create tensors + ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]); + ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]); + ggml_set_name(loraA, metaA.name.c_str()); + ggml_set_name(loraB, metaB.name.c_str()); - if (ggml_is_quantized(base_t->type)) { - if (!warned) { - LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " - "use a f16 or f32 base model with --lora-base\n", __func__); - warned = true; - } + ggml_tensor * base_t; + if (ml) { + if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) { + LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); + return 1; } + base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str())); + } else { + base_t = ggml_dup_tensor(lora_ctx, model_t); + } + ggml_set_name(base_t, base_name.c_str()); - ggml_tensor * loraA = lora_tensors[base_name + ".loraA"]; - GGML_ASSERT(loraA->type == GGML_TYPE_F32); - ggml_set_name(loraA, "loraA"); + // allocate in backend buffer + ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); + if (lora_buf == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__); + return 1; + } - ggml_tensor * loraB = lora_tensors[base_name + ".loraB"]; - GGML_ASSERT(loraB->type == GGML_TYPE_F32); - ggml_set_name(loraB, "loraB"); + // load tensor data + auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) { + read_buf.resize(ggml_nbytes(tensor)); + fin.seek(tensor_meta.offset, SEEK_SET); + fin.read_raw(read_buf.data(), ggml_nbytes(tensor)); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size()); + }; + load_tensor(metaA, loraA); + load_tensor(metaB, loraB); - if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { - LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" - " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); - return 1; - } + // load base model tensor data + if (ml) { + ml->load_data_for(base_t); + } else { + ggml_backend_tensor_copy(model_t, base_t); + } 
+ + if (ggml_is_quantized(base_t->type) && !warned) { + LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; + } + + if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { + LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" + " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); + ggml_free(lora_ctx); + ggml_backend_buffer_free(lora_buf); + ggml_backend_free(backend_cpu); + return 1; + } + auto build_lora_graph = [&]() { // w = w + BA*s - ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB); - offload_func(BA); + ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); ggml_set_name(BA, "BA"); if (scaling != 1.0f) { - BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling); - offload_func(BA); + BA = ggml_scale(lora_ctx, BA, scaling); ggml_set_name(BA, "BA_scaled"); } ggml_tensor * r; - if (base_t == dest_t) { - r = ggml_add_inplace(lora_ctx.get(), dest_t, BA); - offload_func_force_inplace(r); - ggml_set_name(r, "r_add_inplace"); - } - else { - r = ggml_add(lora_ctx.get(), base_t, BA); - offload_func(r); - ggml_set_name(r, "r_add"); + r = ggml_add_inplace(lora_ctx, base_t, BA); + ggml_set_name(r, "r_add"); - r = ggml_cpy(lora_ctx.get(), r, dest_t); - offload_func(r); - ggml_set_name(r, "r_cpy"); + if (base_t->type != model_t->type) { + // convert the result to the model type + r = ggml_cast(lora_ctx, r, model_t->type); + ggml_set_name(r, "r_cast"); } - struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get()); - ggml_build_forward_expand(gf, r); + return r; + }; + + ggml_cgraph * gf = ggml_new_graph(lora_ctx); + ggml_tensor * r = build_lora_graph(); + ggml_build_forward_expand(gf, r); - ggml_graph_compute_helper(work_buffer, gf, n_threads); + ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); + if (graph_buf == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__); + ggml_free(lora_ctx); + ggml_backend_buffer_free(lora_buf); + ggml_backend_free(backend_cpu); + return 1; + } - // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other - GGML_ASSERT(lora_tensors.size() == 2); + ggml_backend_graph_compute(backend_cpu, gf); - // we won't need these tensors again, reset the context to save memory - lora_ctx.reset(ggml_init(params)); - lora_tensors.clear(); + ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); - n_tensors++; - if (n_tensors % 4 == 0) { - LLAMA_LOG_INFO("."); - } +#if 0 + // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU + //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE); + + // sched compute + ggml_build_forward_expand(gf, build_graph()); + ggml_backend_sched_init_measure(sched, gf); + + // create the graph again, since the previous one was destroyed by the measure + ggml_graph_clear(gf); + ggml_build_forward_expand(gf, build_graph()); + ggml_backend_sched_graph_compute(sched, gf); + ggml_backend_sched_free(sched); +#endif + + ggml_backend_buffer_free(lora_buf); + ggml_backend_buffer_free(graph_buf); + ggml_free(lora_ctx); + + n_tensors++; + if (n_tensors % 4 == 0) { + LLAMA_LOG_INFO("."); } } + ggml_backend_free(backend_cpu); + const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; 
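For reference, the per-tensor flow introduced in this hunk condenses to the sketch below. It is not part of the patch: the helper name is hypothetical, the adapter tensors (loraA, loraB), the f16/f32 copy of the base weight (base_t) and the no_alloc context are assumed to have been created and loaded by the caller exactly as shown above, and error handling is trimmed. Only ggml calls that already appear in this hunk are used.

#include "ggml.h"
#include "ggml-backend.h"

// sketch: merge one LoRA delta on the CPU backend, then write the result
// back into the model weight, wherever that weight is allocated
static int apply_lora_tensor_cpu(
        ggml_backend_t backend_cpu,  // from ggml_backend_cpu_init()
        ggml_context * lora_ctx,     // no_alloc context holding loraA/loraB/base_t
        ggml_tensor  * loraA,        // adapter A, data already loaded
        ggml_tensor  * loraB,        // adapter B, data already loaded
        ggml_tensor  * base_t,       // base weight copy, data already loaded
        ggml_tensor  * model_t,      // destination weight, possibly in a GPU buffer
        float          scaling) {
    // w = w + BA*s, same graph as build_lora_graph() above
    ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
    if (scaling != 1.0f) {
        BA = ggml_scale(lora_ctx, BA, scaling);
    }
    ggml_tensor * r = ggml_add_inplace(lora_ctx, base_t, BA);
    if (base_t->type != model_t->type) {
        r = ggml_cast(lora_ctx, r, model_t->type); // convert back to the model type
    }

    ggml_cgraph * gf = ggml_new_graph(lora_ctx);
    ggml_build_forward_expand(gf, r);

    // allocate the intermediate graph tensors in a host buffer and run on the CPU
    ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
    if (graph_buf == nullptr) {
        return 1;
    }
    ggml_backend_graph_compute(backend_cpu, gf);

    // push the merged weight into the model tensor; ggml_backend_tensor_set
    // works regardless of which backend buffer holds model_t
    ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));

    ggml_backend_buffer_free(graph_buf);
    return 0;
}

Computing the merge on the CPU backend is what allows the old ggml_cuda_assign_buffers offload path to be dropped here; the #if 0 block above records the intended follow-up of routing the same graph through ggml_backend_sched to reduce host/device copies.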
LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); @@ -9579,6 +9082,7 @@ static int llama_apply_lora_from_file_internal( struct llama_model_params llama_model_default_params() { struct llama_model_params result = { /*.n_gpu_layers =*/ 0, + /*.split_mode =*/ LLAMA_SPLIT_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, @@ -9590,7 +9094,8 @@ struct llama_model_params llama_model_default_params() { }; #ifdef GGML_USE_METAL - result.n_gpu_layers = 1; + // note: we usually have plenty of VRAM, so by default offload all layers to the GPU + result.n_gpu_layers = 999; #endif return result; @@ -9780,41 +9285,53 @@ struct llama_context * llama_new_context_with_model( GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); - // reserve memory for context buffers if (!hparams.vocab_only) { - // initialize backend + // initialize backends #ifdef GGML_USE_METAL if (model->n_gpu_layers > 0) { - ctx->backend = ggml_backend_metal_init(); - if (ctx->backend == nullptr) { + ctx->backend_metal = ggml_backend_metal_init(); + if (ctx->backend_metal == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__); + llama_free(ctx); + return nullptr; } + ctx->backends.push_back(ctx->backend_metal); } -#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) - // for testing only +#elif defined(GGML_USE_CUBLAS) if (model->n_gpu_layers > 0) { - ctx->backend = ggml_backend_cuda_init(0); - if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__); + // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) { + ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } else { + // LLAMA_SPLIT_LAYER requires a backend for each GPU + for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { + ggml_backend_t backend = ggml_backend_cuda_init(device); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } } } #endif - - if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) { - ctx->backend = ggml_backend_cpu_init(); - if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); - } - } - - if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__); - delete ctx; + ctx->backend_cpu = ggml_backend_cpu_init(); + if (ctx->backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); + llama_free(ctx); return nullptr; } + ctx->backends.push_back(ctx->backend_cpu); - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, - cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) { + if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, + cparams.n_ctx, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -9850,11 +9367,22 @@ struct llama_context * llama_new_context_with_model( } { - // the compute 
buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data + // buffer types used for the compute buffer of each backend + std::vector backend_buft; + for (auto * backend : ctx->backends) { + if (ggml_backend_is_cpu(backend)) { + // use host buffers for the CPU backend compute buffer + backend_buft.push_back(llama_default_buffer_type_cpu(true)); + } else { + backend_buft.push_back(ggml_backend_get_default_buffer_type(backend)); + } + } + + // buffer used to store the computation graph and the tensor meta data ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead()); - // create measure allocator - ctx->alloc = ggml_allocr_new_measure_from_backend(ctx->backend); + ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES); + ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu); // build worst-case graph int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); @@ -9862,50 +9390,19 @@ struct llama_context * llama_new_context_with_model( llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); - // measure memory requirements for the graph - size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf); - - LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute_meta.size() + alloc_size) / 1024.0 / 1024.0); - - // create allocator again with exact memory requirements - ggml_allocr_free(ctx->alloc); - - ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size); - ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc); -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (model->n_gpu_layers > 0) { - // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets - ggml_cuda_set_scratch_size(alloc_size + 64); - LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0); - - // calculate total VRAM usage - auto add_tensor = [](const ggml_tensor * t, size_t & size) { - if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) { - size += ggml_nbytes(t); - } - }; - size_t model_vram_size = 0; - for (const auto & kv : model->tensors_by_name) { - add_tensor(kv.second, model_vram_size); - } - - size_t kv_vram_size = 0; - for (auto & k : ctx->kv_self.k_l) { - add_tensor(k, kv_vram_size); - } - for (auto & v : ctx->kv_self.v_l) { - add_tensor(v, kv_vram_size); - } - - size_t ctx_vram_size = alloc_size + kv_vram_size; - size_t total_vram_size = model_vram_size + ctx_vram_size; + // initialize scheduler with the worst-case graph + ggml_backend_sched_init_measure(ctx->sched, gf); + // note: the number of splits during measure is higher than during inference due to the kv shift + int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); + LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits); + ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu); - LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__, - total_vram_size / 1024.0 / 1024.0, - model_vram_size / 1024.0 / 1024.0, - ctx_vram_size / 1024.0 / 1024.0); + for (ggml_backend_t backend : ctx->backends) { + 
ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend); + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buffer_name(buf), + ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); } -#endif } } @@ -10002,9 +9499,8 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3 } int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { - return snprintf(buf, buf_size, "%s %s%s %s", + return snprintf(buf, buf_size, "%s %s %s", llama_model_arch_name(model->arch).c_str(), - model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str()); } @@ -10026,7 +9522,14 @@ uint64_t llama_model_n_params(const struct llama_model * model) { } struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) { - return ggml_get_tensor(model->ctx, name); + auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(), + [name](const std::pair & it) { + return it.first == name; + }); + if (it == model->tensors_by_name.end()) { + return nullptr; + } + return it->second; } uint32_t llama_model_quantize( @@ -10211,7 +9714,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) { const size_t s_embedding = ctx->embedding.size() * sizeof(float); const size_t s_kv_size = sizeof(size_t); const size_t s_kv_ntok = sizeof(int); - const size_t s_kv = ggml_backend_buffer_get_size(ctx->kv_self.buf); + const size_t s_kv = ctx->kv_self.total_size(); const size_t s_total = ( + s_rng_size @@ -10340,7 +9843,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const auto n_embd_v_gqa = hparams.n_embd_v_gqa(); const auto n_ctx = cparams.n_ctx; - const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf); + const size_t kv_buf_size = kv_self.total_size(); const uint32_t kv_head = kv_self.head; const uint32_t kv_size = kv_self.size; const uint32_t kv_used = kv_self.used; @@ -10353,46 +9856,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat if (kv_buf_size) { const size_t elt_size = ggml_element_size(kv_self.k_l[0]); - ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); - ggml_cgraph * gf = ggml_new_graph(cpy_ctx); - - std::vector kout2d(n_layer); - std::vector vout2d(n_layer); - - for (int il = 0; il < (int) n_layer; ++il) { - kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head); - vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa); - - ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il], - n_embd_k_gqa, kv_head, - elt_size*n_embd_k_gqa, 0); - - ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il], - kv_head, n_embd_v_gqa, - elt_size*n_ctx, 0); - - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il])); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il])); - } - - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend); - - ggml_backend_graph_compute(ctx->backend, gf); - std::vector tmp_buf; for (int il = 0; il < (int) n_layer; ++il) { - tmp_buf.resize(ggml_nbytes(kout2d[il])); - ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size()); + tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head); + ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, 
tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); - tmp_buf.resize(ggml_nbytes(vout2d[il])); - ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size()); - data_ctx->write(tmp_buf.data(), tmp_buf.size()); + // v is not contiguous, copy row by row + tmp_buf.resize(elt_size*kv_head); + for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { + ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size()); + data_ctx->write(tmp_buf.data(), tmp_buf.size()); + } } - - ggml_free(cpy_ctx); - - ggml_backend_buffer_free(buf); } for (uint32_t i = 0; i < kv_size; ++i) { @@ -10491,48 +9967,22 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used); if (kv_buf_size) { - GGML_ASSERT(ggml_backend_buffer_get_size(kv_self.buf) == kv_buf_size); + GGML_ASSERT(kv_self.total_size() == kv_buf_size); const size_t elt_size = ggml_element_size(kv_self.k_l[0]); - ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); - ggml_cgraph * gf = ggml_new_graph(cpy_ctx); - - std::vector kin2d(n_layer); - std::vector vin2d(n_layer); - - for (int il = 0; il < n_layer; ++il) { - kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head); - vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa); - - ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il], - n_embd_k_gqa, kv_head, - elt_size*n_embd_k_gqa, 0); - - ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il], - kv_head, n_embd_v_gqa, - elt_size*n_ctx, 0); - - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d)); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d)); - } - - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend); - - // load data into the tensors - for (int il = 0; il < n_layer; ++il) { - ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il])); - inp += ggml_nbytes(kin2d[il]); - - ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il])); - inp += ggml_nbytes(vin2d[il]); + for (int il = 0; il < (int) n_layer; ++il) { + size_t k_size = elt_size*n_embd_k_gqa*kv_head; + ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); + inp += k_size; + + // v is not contiguous, copy row by row + size_t v_row_size = elt_size*kv_head; + for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { + ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size); + inp += v_row_size; + } } - - ggml_backend_graph_compute(ctx->backend, gf); - - ggml_free(cpy_ctx); - - ggml_backend_buffer_free(buf); } ctx->kv_self.head = kv_head; diff --git a/llama.h b/llama.h index 43d41b8f642b5..689e12d7ce092 100644 --- a/llama.h +++ b/llama.h @@ -118,6 +118,12 @@ extern "C" { LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, }; + enum llama_split_mode { + LLAMA_SPLIT_NONE = 0, // single GPU + LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs + LLAMA_SPLIT_ROW = 2, // split rows across GPUs + }; + typedef struct llama_token_data { llama_token id; // token id float logit; // log-odds of the token @@ -180,8 +186,16 @@ extern "C" { struct llama_model_params { int32_t n_gpu_layers; // number of layers to store in VRAM - int32_t main_gpu; // the GPU that is used for scratch and small tensors - const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) + enum llama_split_mode split_mode; // how to split 
the model across multiple GPUs + + // main_gpu interpretation depends on split_mode: + // LLAMA_SPLIT_NONE: the GPU that is used for the entire model + // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results + // LLAMA_SPLIT_LAYER: ignored + int32_t main_gpu; + + // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES + const float * tensor_split; // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. // If the provided progress_callback returns true, model loading continues. diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 7a60d77431e30..d9b8b106a6033 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -376,6 +376,11 @@ struct test_case { // allocate ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1); + if (buf == NULL) { + printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1)); + ggml_free(ctx); + return false; + } // build graph ggml_build_forward_expand(gf, out); @@ -463,19 +468,23 @@ struct test_case { GGML_UNUSED(index); }; - ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud); + const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud); - if (ud.ok) { - printf("\033[1;32mOK\033[0m\n"); - } else { - printf("\033[1;31mFAIL\033[0m\n"); + if (!cmp_ok) { + printf("compare failed "); } ggml_backend_buffer_free(buf); ggml_free(ctx); - return ud.ok; + if (ud.ok && cmp_ok) { + printf("\033[1;32mOK\033[0m\n"); + return true; + } + + printf("\033[1;31mFAIL\033[0m\n"); + return false; } bool eval_perf(ggml_backend_t backend, const char * op_name) { @@ -519,6 +528,11 @@ struct test_case { // allocate ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend); + if (buf == NULL) { + printf("failed to allocate tensors\n"); + ggml_free(ctx); + return false; + } // randomize tensors initialize_tensors(ctx); From 3fe81781e3bf98b8e44946240a19f3a6ad08a11a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 12 Jan 2024 20:38:54 +0100 Subject: [PATCH 035/131] CUDA: faster q8_0 -> f16 dequantization (#4895) --- ggml-cuda.cu | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 2db50437c0d65..bd3814c72b407 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -523,6 +523,8 @@ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16 #define CUDA_ACC_BLOCK_SIZE 256 #define CUDA_IM2COL_BLOCK_SIZE 256 +#define CUDA_Q8_0_NE_ALIGN 2048 + // dmmv = dequantize_mul_mat_vec #ifndef GGML_CUDA_DMMV_X #define GGML_CUDA_DMMV_X 32 @@ -2327,6 +2329,45 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res y[i] = x[i]; } +template +static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) { +#if __CUDA_ARCH__ >= CC_PASCAL + constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE; + + const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x; + const int * x0 = ((int *) vx) + blockIdx.x * nint; + half2 * y2 = (half2 *) (y + i0); + + __shared__ int vals[nint]; + +#pragma unroll + for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) { + if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) { + break; + } + + const int ix = ix0 + threadIdx.x; + vals[ix] = x0[ix]; + } + +#pragma unroll + for (int iy = 0; iy < 
CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) { + if (need_check && i0 + iy + 2*threadIdx.x >= k) { + return; + } + + const half * b0 = ((const half *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0); + const half d = *b0; + const char2 qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)]; + + y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d)); + } +#else + (void) vx; (void) y; (void) k; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_PASCAL +} + // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q @@ -6181,6 +6222,17 @@ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restri dequantize_block<<>>(vx, y, k); } +static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN; + if (k % CUDA_Q8_0_NE_ALIGN == 0) { + const bool need_check = false; + dequantize_block_q8_0_f16<<>>(vx, y, k); + } else { + const bool need_check = true; + dequantize_block_q8_0_f16<<>>(vx, y, k); + } +} + template static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; @@ -6246,6 +6298,7 @@ static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict_ } static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { + int id; switch (type) { case GGML_TYPE_Q4_0: return dequantize_block_cuda; @@ -6256,6 +6309,10 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { case GGML_TYPE_Q5_1: return dequantize_block_cuda; case GGML_TYPE_Q8_0: + CUDA_CHECK(cudaGetDevice(&id)); + if (g_device_caps[id].cc >= CC_PASCAL) { + return dequantize_block_q8_0_f16_cuda; + } return dequantize_block_cuda; case GGML_TYPE_Q2_K: return dequantize_row_q2_K_cuda; From 52ee4540c0f2e11d52c839db6eb51d014ce060e1 Mon Sep 17 00:00:00 2001 From: Maximilian Winter Date: Fri, 12 Jan 2024 20:46:45 +0100 Subject: [PATCH 036/131] examples : add pydantic models to GBNF grammar generator (#4883) * Create pydantic-models-to-grammar.py * Added some comments for usage * Refactored Grammar Generator Added example and usage instruction. * Update pydantic_models_to_grammar.py * Update pydantic-models-to-grammar-examples.py * Renamed module and imported it. * Update pydantic-models-to-grammar.py * Renamed file and fixed grammar generator issue. --- .../pydantic-models-to-grammar-examples.py | 136 ++ examples/pydantic_models_to_grammar.py | 1151 +++++++++++++++++ 2 files changed, 1287 insertions(+) create mode 100644 examples/pydantic-models-to-grammar-examples.py create mode 100644 examples/pydantic_models_to_grammar.py diff --git a/examples/pydantic-models-to-grammar-examples.py b/examples/pydantic-models-to-grammar-examples.py new file mode 100644 index 0000000000000..a8a4919cff243 --- /dev/null +++ b/examples/pydantic-models-to-grammar-examples.py @@ -0,0 +1,136 @@ +# Function calling example using pydantic models. + +import json +from enum import Enum +from typing import Union, Optional + +import requests +from pydantic import BaseModel, Field + +import importlib +from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation + +# Function to get completion on the llama.cpp server with grammar. 
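+# It assumes a llama.cpp server is already running locally at http://127.0.0.1:8080
+# (e.g. started separately with the server example) and that the loaded model can
+# follow the ChatML-style prompts used further below.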
+def create_completion(prompt, grammar): + headers = {"Content-Type": "application/json"} + data = {"prompt": prompt, "grammar": grammar} + + response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data) + data = response.json() + + print(data["content"]) + return data["content"] + + +# A function for the agent to send a message to the user. +class SendMessageToUser(BaseModel): + """ + Send a message to the User. + """ + chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.") + message: str = Field(..., description="Message you want to send to the user.") + + def run(self): + print(self.message) + + +# Enum for the calculator function. +class MathOperation(Enum): + ADD = "add" + SUBTRACT = "subtract" + MULTIPLY = "multiply" + DIVIDE = "divide" + + +# Very simple calculator tool for the agent. +class Calculator(BaseModel): + """ + Perform a math operation on two numbers. + """ + number_one: Union[int, float] = Field(..., description="First number.") + operation: MathOperation = Field(..., description="Math operation to perform.") + number_two: Union[int, float] = Field(..., description="Second number.") + + def run(self): + if self.operation == MathOperation.ADD: + return self.number_one + self.number_two + elif self.operation == MathOperation.SUBTRACT: + return self.number_one - self.number_two + elif self.operation == MathOperation.MULTIPLY: + return self.number_one * self.number_two + elif self.operation == MathOperation.DIVIDE: + return self.number_one / self.number_two + else: + raise ValueError("Unknown operation.") + + +# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM. +# pydantic_model_list is the list of pydanitc models +# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated +# outer_object_content is the name of outer object content. +# model_prefix is the optional prefix for models in the documentation. (Default="Output Model") +# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields") +gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( + pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function", + outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters") + +print(gbnf_grammar) +print(documentation) + +system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation + +user_message = "What is 42 * 42?" 
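+# The grammar above restricts generations to a JSON object of the form
+# {"function": "<model name>", "function_parameters": {...}}. As a small illustrative
+# helper (the name is arbitrary), such output can be shape-checked before it is
+# dispatched to one of the tools:
+def looks_like_function_call(text):
+    try:
+        parsed = json.loads(text)
+    except json.JSONDecodeError:
+        return False
+    return isinstance(parsed, dict) and "function" in parsed and "function_parameters" in parsed
+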
+prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant" + +text = create_completion(prompt=prompt, grammar=gbnf_grammar) +# This should output something like this: +# { +# "function": "calculator", +# "function_parameters": { +# "number_one": 42, +# "operation": "multiply", +# "number_two": 42 +# } +# } +function_dictionary = json.loads(text) +if function_dictionary["function"] == "calculator": + function_parameters = {**function_dictionary["function_parameters"]} + + print(Calculator(**function_parameters).run()) + # This should output: 1764 + + +# A example structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text. +class Category(Enum): + """ + The category of the book. + """ + Fiction = "Fiction" + NonFiction = "Non-Fiction" + + +class Book(BaseModel): + """ + Represents an entry about a book. + """ + title: str = Field(..., description="Title of the book.") + author: str = Field(..., description="Author of the book.") + published_year: Optional[int] = Field(..., description="Publishing year of the book.") + keywords: list[str] = Field(..., description="A list of keywords.") + category: Category = Field(..., description="Category of the book.") + summary: str = Field(..., description="Summary of the book.") + + +# We need no additional parameters other than our list of pydantic models. +gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book]) + +system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation + +text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands.""" +prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" + +text = create_completion(prompt=prompt, grammar=gbnf_grammar) + +json_data = json.loads(text) + +print(Book(**json_data)) diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py new file mode 100644 index 0000000000000..41b98fdc1fcb4 --- /dev/null +++ b/examples/pydantic_models_to_grammar.py @@ -0,0 +1,1151 @@ +import inspect +import json +from copy import copy +from inspect import isclass, getdoc +from types import NoneType + +from pydantic import BaseModel, create_model, Field +from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias +from enum import Enum +from typing import get_type_hints, Callable +import re + + +class PydanticDataType(Enum): + """ + Defines the data types supported by the grammar_generator. + + Attributes: + STRING (str): Represents a string data type. + BOOLEAN (str): Represents a boolean data type. + INTEGER (str): Represents an integer data type. + FLOAT (str): Represents a float data type. + OBJECT (str): Represents an object data type. + ARRAY (str): Represents an array data type. + ENUM (str): Represents an enum data type. + CUSTOM_CLASS (str): Represents a custom class data type. 
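+        TRIPLE_QUOTED_STRING (str): Represents a triple-quoted string data type.
+        MARKDOWN_STRING (str): Represents a markdown string data type.
+        ANY (str): Represents an arbitrary ("any") data type.
+        NULL (str): Represents a null data type.
+        CUSTOM_DICT (str): Represents a custom dictionary data type.
+        SET (str): Represents a set data type.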
+ """ + STRING = "string" + TRIPLE_QUOTED_STRING = "triple_quoted_string" + MARKDOWN_STRING = "markdown_string" + BOOLEAN = "boolean" + INTEGER = "integer" + FLOAT = "float" + OBJECT = "object" + ARRAY = "array" + ENUM = "enum" + ANY = "any" + NULL = "null" + CUSTOM_CLASS = "custom-class" + CUSTOM_DICT = "custom-dict" + SET = "set" + + +def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str: + if isclass(pydantic_type) and issubclass(pydantic_type, str): + return PydanticDataType.STRING.value + elif isclass(pydantic_type) and issubclass(pydantic_type, bool): + return PydanticDataType.BOOLEAN.value + elif isclass(pydantic_type) and issubclass(pydantic_type, int): + return PydanticDataType.INTEGER.value + elif isclass(pydantic_type) and issubclass(pydantic_type, float): + return PydanticDataType.FLOAT.value + elif isclass(pydantic_type) and issubclass(pydantic_type, Enum): + return PydanticDataType.ENUM.value + + elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel): + return format_model_and_field_name(pydantic_type.__name__) + elif get_origin(pydantic_type) == list: + element_type = get_args(pydantic_type)[0] + return f"{map_pydantic_type_to_gbnf(element_type)}-list" + elif get_origin(pydantic_type) == set: + element_type = get_args(pydantic_type)[0] + return f"{map_pydantic_type_to_gbnf(element_type)}-set" + elif get_origin(pydantic_type) == Union: + union_types = get_args(pydantic_type) + union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types] + return f"union-{'-or-'.join(union_rules)}" + elif get_origin(pydantic_type) == Optional: + element_type = get_args(pydantic_type)[0] + return f"optional-{map_pydantic_type_to_gbnf(element_type)}" + elif isclass(pydantic_type): + return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}" + elif get_origin(pydantic_type) == dict: + key_type, value_type = get_args(pydantic_type) + return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}" + else: + return "unknown" + + +def format_model_and_field_name(model_name: str) -> str: + parts = re.findall('[A-Z][^A-Z]*', model_name) + if not parts: # Check if the list is empty + return model_name.lower().replace("_", "-") + return '-'.join(part.lower().replace("_", "-") for part in parts) + + +def generate_list_rule(element_type): + """ + Generate a GBNF rule for a list of a given element type. + + :param element_type: The type of the elements in the list (e.g., 'string'). + :return: A string representing the GBNF rule for a list of the given type. 
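+
+    Example: for element_type=str this produces
+        string-list ::= "[" string ("," string)* "]"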
+ """ + rule_name = f"{map_pydantic_type_to_gbnf(element_type)}-list" + element_rule = map_pydantic_type_to_gbnf(element_type) + list_rule = fr'{rule_name} ::= "[" {element_rule} ("," {element_rule})* "]"' + return list_rule + + +def get_members_structure(cls, rule_name): + if issubclass(cls, Enum): + # Handle Enum types + members = [f'\"\\\"{member.value}\\\"\"' for name, member in cls.__members__.items()] + return f"{cls.__name__.lower()} ::= " + " | ".join(members) + if cls.__annotations__ and cls.__annotations__ != {}: + result = f'{rule_name} ::= "{{"' + type_list_rules = [] + # Modify this comprehension + members = [f' \"\\\"{name}\\\"\" ":" {map_pydantic_type_to_gbnf(param_type)}' + for name, param_type in cls.__annotations__.items() + if name != 'self'] + + result += '"," '.join(members) + result += ' "}"' + return result, type_list_rules + elif rule_name == "custom-class-any": + result = f'{rule_name} ::= ' + result += 'value' + type_list_rules = [] + return result, type_list_rules + else: + init_signature = inspect.signature(cls.__init__) + parameters = init_signature.parameters + result = f'{rule_name} ::= "{{"' + type_list_rules = [] + # Modify this comprehension too + members = [f' \"\\\"{name}\\\"\" ":" {map_pydantic_type_to_gbnf(param.annotation)}' + for name, param in parameters.items() + if name != 'self' and param.annotation != inspect.Parameter.empty] + + result += '", "'.join(members) + result += ' "}"' + return result, type_list_rules + + +def regex_to_gbnf(regex_pattern: str) -> str: + """ + Translate a basic regex pattern to a GBNF rule. + Note: This function handles only a subset of simple regex patterns. + """ + gbnf_rule = regex_pattern + + # Translate common regex components to GBNF + gbnf_rule = gbnf_rule.replace('\\d', '[0-9]') + gbnf_rule = gbnf_rule.replace('\\s', '[ \t\n]') + + # Handle quantifiers and other regex syntax that is similar in GBNF + # (e.g., '*', '+', '?', character classes) + + return gbnf_rule + + +def generate_gbnf_integer_rules(max_digit=None, min_digit=None): + """ + + Generate GBNF Integer Rules + + Generates GBNF (Generalized Backus-Naur Form) rules for integers based on the given maximum and minimum digits. + + Parameters: + max_digit (int): The maximum number of digits for the integer. Default is None. + min_digit (int): The minimum number of digits for the integer. Default is None. + + Returns: + integer_rule (str): The identifier for the integer rule generated. + additional_rules (list): A list of additional rules generated based on the given maximum and minimum digits. + + """ + additional_rules = [] + + # Define the rule identifier based on max_digit and min_digit + integer_rule = "integer-part" + if max_digit is not None: + integer_rule += f"-max{max_digit}" + if min_digit is not None: + integer_rule += f"-min{min_digit}" + + # Handling Integer Rules + if max_digit is not None or min_digit is not None: + # Start with an empty rule part + integer_rule_part = '' + + # Add mandatory digits as per min_digit + if min_digit is not None: + integer_rule_part += '[0-9] ' * min_digit + + # Add optional digits up to max_digit + if max_digit is not None: + optional_digits = max_digit - (min_digit if min_digit is not None else 0) + integer_rule_part += ''.join(['[0-9]? 
' for _ in range(optional_digits)]) + + # Trim the rule part and append it to additional rules + integer_rule_part = integer_rule_part.strip() + if integer_rule_part: + additional_rules.append(f'{integer_rule} ::= {integer_rule_part}') + + return integer_rule, additional_rules + + +def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None, min_precision=None): + """ + Generate GBNF float rules based on the given constraints. + + :param max_digit: Maximum number of digits in the integer part (default: None) + :param min_digit: Minimum number of digits in the integer part (default: None) + :param max_precision: Maximum number of digits in the fractional part (default: None) + :param min_precision: Minimum number of digits in the fractional part (default: None) + :return: A tuple containing the float rule and additional rules as a list + + Example Usage: + max_digit = 3 + min_digit = 1 + max_precision = 2 + min_precision = 1 + generate_gbnf_float_rules(max_digit, min_digit, max_precision, min_precision) + + Output: + ('float-3-1-2-1', ['integer-part-max3-min1 ::= [0-9] [0-9] [0-9]?', 'fractional-part-max2-min1 ::= [0-9] [0-9]?', 'float-3-1-2-1 ::= integer-part-max3-min1 "." fractional-part-max2-min + *1']) + + Note: + GBNF stands for Generalized Backus-Naur Form, which is a notation technique to specify the syntax of programming languages or other formal grammars. + """ + additional_rules = [] + + # Define the integer part rule + integer_part_rule = "integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + ( + f"-min{min_digit}" if min_digit is not None else "") + + # Define the fractional part rule based on precision constraints + fractional_part_rule = "fractional-part" + fractional_rule_part = '' + if max_precision is not None or min_precision is not None: + fractional_part_rule += (f"-max{max_precision}" if max_precision is not None else "") + ( + f"-min{min_precision}" if min_precision is not None else "") + # Minimum number of digits + fractional_rule_part = '[0-9]' * (min_precision if min_precision is not None else 1) + # Optional additional digits + fractional_rule_part += ''.join([' [0-9]?'] * ( + (max_precision - (min_precision if min_precision is not None else 1)) if max_precision is not None else 0)) + additional_rules.append(f'{fractional_part_rule} ::= {fractional_rule_part}') + + # Define the float rule + float_rule = f"float-{max_digit if max_digit is not None else 'X'}-{min_digit if min_digit is not None else 'X'}-{max_precision if max_precision is not None else 'X'}-{min_precision if min_precision is not None else 'X'}" + additional_rules.append(f'{float_rule} ::= {integer_part_rule} "." {fractional_part_rule}') + + # Generating the integer part rule definition, if necessary + if max_digit is not None or min_digit is not None: + integer_rule_part = '[0-9]' + if min_digit is not None and min_digit > 1: + integer_rule_part += ' [0-9]' * (min_digit - 1) + if max_digit is not None: + integer_rule_part += ''.join([' [0-9]?'] * (max_digit - (min_digit if min_digit is not None else 1))) + additional_rules.append(f'{integer_part_rule} ::= {integer_rule_part.strip()}') + + return float_rule, additional_rules + + +def generate_gbnf_rule_for_type(model_name, field_name, + field_type, is_optional, processed_models, created_rules, + field_info=None) -> \ + Tuple[str, list]: + """ + Generate GBNF rule for a given field type. + + :param model_name: Name of the model. + + :param field_name: Name of the field. + :param field_type: Type of the field. 
+ :param is_optional: Whether the field is optional. + :param processed_models: List of processed models. + :param created_rules: List of created rules. + :param field_info: Additional information about the field (optional). + + :return: Tuple containing the GBNF type and a list of additional rules. + :rtype: Tuple[str, list] + """ + rules = [] + + field_name = format_model_and_field_name(field_name) + gbnf_type = map_pydantic_type_to_gbnf(field_type) + + if isclass(field_type) and issubclass(field_type, BaseModel): + nested_model_name = format_model_and_field_name(field_type.__name__) + nested_model_rules = generate_gbnf_grammar(field_type, processed_models, created_rules) + rules.extend(nested_model_rules) + gbnf_type, rules = nested_model_name, rules + elif isclass(field_type) and issubclass(field_type, Enum): + enum_values = [f'\"\\\"{e.value}\\\"\"' for e in field_type] # Adding escaped quotes + enum_rule = f"{model_name}-{field_name} ::= {' | '.join(enum_values)}" + rules.append(enum_rule) + gbnf_type, rules = model_name + "-" + field_name, rules + elif get_origin(field_type) == list or field_type == list: # Array + element_type = get_args(field_type)[0] + element_rule_name, additional_rules = generate_gbnf_rule_for_type(model_name, + f"{field_name}-element", + element_type, is_optional, processed_models, + created_rules) + rules.extend(additional_rules) + array_rule = f"""{model_name}-{field_name} ::= "[" ws {element_rule_name} ("," ws {element_rule_name})* "]" """ + rules.append(array_rule) + gbnf_type, rules = model_name + "-" + field_name, rules + + elif get_origin(field_type) == set or field_type == set: # Array + element_type = get_args(field_type)[0] + element_rule_name, additional_rules = generate_gbnf_rule_for_type(model_name, + f"{field_name}-element", + element_type, is_optional, processed_models, + created_rules) + rules.extend(additional_rules) + array_rule = f"""{model_name}-{field_name} ::= "[" ws {element_rule_name} ("," ws {element_rule_name})* "]" """ + rules.append(array_rule) + gbnf_type, rules = model_name + "-" + field_name, rules + + elif gbnf_type.startswith("custom-class-"): + nested_model_rules, field_types = get_members_structure(field_type, gbnf_type) + rules.append(nested_model_rules) + elif gbnf_type.startswith("custom-dict-"): + key_type, value_type = get_args(field_type) + + additional_key_type, additional_key_rules = generate_gbnf_rule_for_type(model_name, + f"{field_name}-key-type", + key_type, is_optional, processed_models, + created_rules) + additional_value_type, additional_value_rules = generate_gbnf_rule_for_type(model_name, + f"{field_name}-value-type", + value_type, is_optional, + processed_models, created_rules) + gbnf_type = fr'{gbnf_type} ::= "{{" ( {additional_key_type} ":" {additional_value_type} ("," {additional_key_type} ":" {additional_value_type})* )? 
"}}" ' + + rules.extend(additional_key_rules) + rules.extend(additional_value_rules) + elif gbnf_type.startswith("union-"): + union_types = get_args(field_type) + union_rules = [] + + for union_type in union_types: + if isinstance(union_type, _GenericAlias): + union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(model_name, + field_name, union_type, + False, + processed_models, created_rules) + union_rules.append(union_gbnf_type) + rules.extend(union_rules_list) + + + elif not issubclass(union_type, NoneType): + union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(model_name, + field_name, union_type, + False, + processed_models, created_rules) + union_rules.append(union_gbnf_type) + rules.extend(union_rules_list) + + # Defining the union grammar rule separately + if len(union_rules) == 1: + union_grammar_rule = f"{model_name}-{field_name}-optional ::= {' | '.join(union_rules)} | null" + else: + union_grammar_rule = f"{model_name}-{field_name}-union ::= {' | '.join(union_rules)}" + rules.append(union_grammar_rule) + if len(union_rules) == 1: + gbnf_type = f"{model_name}-{field_name}-optional" + else: + gbnf_type = f"{model_name}-{field_name}-union" + elif isclass(field_type) and issubclass(field_type, str): + if field_info and hasattr(field_info, 'json_schema_extra') and field_info.json_schema_extra is not None: + + triple_quoted_string = field_info.json_schema_extra.get('triple_quoted_string', False) + markdown_string = field_info.json_schema_extra.get('markdown_string', False) + + gbnf_type = PydanticDataType.TRIPLE_QUOTED_STRING.value if triple_quoted_string else PydanticDataType.STRING.value + gbnf_type = PydanticDataType.MARKDOWN_STRING.value if markdown_string else gbnf_type + + elif field_info and hasattr(field_info, 'pattern'): + # Convert regex pattern to grammar rule + regex_pattern = field_info.regex.pattern + gbnf_type = f"pattern-{field_name} ::= {regex_to_gbnf(regex_pattern)}" + else: + gbnf_type = PydanticDataType.STRING.value + + elif isclass(field_type) and issubclass(field_type, float) and field_info and hasattr(field_info, + 'json_schema_extra') and field_info.json_schema_extra is not None: + # Retrieve precision attributes for floats + max_precision = field_info.json_schema_extra.get('max_precision') if field_info and hasattr(field_info, + 'json_schema_extra') else None + min_precision = field_info.json_schema_extra.get('min_precision') if field_info and hasattr(field_info, + 'json_schema_extra') else None + max_digits = field_info.json_schema_extra.get('max_digit') if field_info and hasattr(field_info, + 'json_schema_extra') else None + min_digits = field_info.json_schema_extra.get('min_digit') if field_info and hasattr(field_info, + 'json_schema_extra') else None + + # Generate GBNF rule for float with given attributes + gbnf_type, rules = generate_gbnf_float_rules(max_digit=max_digits, min_digit=min_digits, + max_precision=max_precision, + min_precision=min_precision) + + elif isclass(field_type) and issubclass(field_type, int) and field_info and hasattr(field_info, + 'json_schema_extra') and field_info.json_schema_extra is not None: + # Retrieve digit attributes for integers + max_digits = field_info.json_schema_extra.get('max_digit') if field_info and hasattr(field_info, + 'json_schema_extra') else None + min_digits = field_info.json_schema_extra.get('min_digit') if field_info and hasattr(field_info, + 'json_schema_extra') else None + + # Generate GBNF rule for integer with given attributes + gbnf_type, rules = 
generate_gbnf_integer_rules(max_digit=max_digits, min_digit=min_digits) + else: + gbnf_type, rules = gbnf_type, [] + + if gbnf_type not in created_rules: + return gbnf_type, rules + else: + if gbnf_type in created_rules: + return gbnf_type, rules + + +def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created_rules: dict) -> (list, bool, bool): + """ + + Generate GBnF Grammar + + Generates a GBnF grammar for a given model. + + :param model: A Pydantic model class to generate the grammar for. Must be a subclass of BaseModel. + :param processed_models: A set of already processed models to prevent infinite recursion. + :param created_rules: A dict containing already created rules to prevent duplicates. + :return: A list of GBnF grammar rules in string format. And two booleans indicating if an extra markdown or triple quoted string is in the grammar. + Example Usage: + ``` + model = MyModel + processed_models = set() + created_rules = dict() + + gbnf_grammar = generate_gbnf_grammar(model, processed_models, created_rules) + ``` + """ + if model in processed_models: + return [] + + processed_models.add(model) + model_name = format_model_and_field_name(model.__name__) + + if not issubclass(model, BaseModel): + # For non-Pydantic classes, generate model_fields from __annotations__ or __init__ + if hasattr(model, '__annotations__') and model.__annotations__: + model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()} + else: + init_signature = inspect.signature(model.__init__) + parameters = init_signature.parameters + model_fields = {name: (param.annotation, param.default) for name, param in parameters.items() + if name != 'self'} + else: + # For Pydantic models, use model_fields and check for ellipsis (required fields) + model_fields = model.__annotations__ + + model_rule_parts = [] + nested_rules = [] + has_markdown_code_block = False + has_triple_quoted_string = False + look_for_markdown_code_block = False + look_for_triple_quoted_string = False + for field_name, field_info in model_fields.items(): + if not issubclass(model, BaseModel): + field_type, default_value = field_info + # Check if the field is optional (not required) + is_optional = (default_value is not inspect.Parameter.empty) and (default_value is not Ellipsis) + else: + field_type = field_info + field_info = model.model_fields[field_name] + is_optional = field_info.is_required is False and get_origin(field_type) is Optional + rule_name, additional_rules = generate_gbnf_rule_for_type(model_name, + format_model_and_field_name(field_name), + field_type, is_optional, + processed_models, created_rules, field_info) + look_for_markdown_code_block = True if rule_name == "markdown_string" else False + look_for_triple_quoted_string = True if rule_name == "triple_quoted_string" else False + if not look_for_markdown_code_block and not look_for_triple_quoted_string: + if rule_name not in created_rules: + created_rules[rule_name] = additional_rules + model_rule_parts.append(f' ws \"\\\"{field_name}\\\"\" ": " {rule_name}') # Adding escaped quotes + nested_rules.extend(additional_rules) + else: + has_triple_quoted_string = look_for_markdown_code_block + has_markdown_code_block = look_for_triple_quoted_string + + fields_joined = r' "," "\n" '.join(model_rule_parts) + model_rule = fr'{model_name} ::= "{{" "\n" {fields_joined} "\n" ws "}}"' + + if look_for_markdown_code_block or look_for_triple_quoted_string: + model_rule += ' ws "}"' + + if has_triple_quoted_string: + model_rule += '"\\n" 
triple-quoted-string' + if has_markdown_code_block: + model_rule += '"\\n" markdown-code-block' + all_rules = [model_rule] + nested_rules + + return all_rules, has_markdown_code_block, has_triple_quoted_string + + +def generate_gbnf_grammar_from_pydantic_models(models: List[Type[BaseModel]], outer_object_name: str = None, + outer_object_content: str = None, list_of_outputs: bool = False) -> str: + """ + Generate GBNF Grammar from Pydantic Models. + + This method takes a list of Pydantic models and uses them to generate a GBNF grammar string. The generated grammar string can be used for parsing and validating data using the generated + * grammar. + + Parameters: + models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from. + outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. + outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. + list_of_outputs (str, optional): Allows a list of output objects + Returns: + str: The generated GBNF grammar string. + + Examples: + models = [UserModel, PostModel] + grammar = generate_gbnf_grammar_from_pydantic(models) + print(grammar) + # Output: + # root ::= UserModel | PostModel + # ... + """ + processed_models = set() + all_rules = [] + created_rules = {} + if outer_object_name is None: + + for model in models: + model_rules, _, _ = generate_gbnf_grammar(model, + processed_models, created_rules) + all_rules.extend(model_rules) + + if list_of_outputs: + root_rule = r'root ::= ws "[" grammar-models ("," grammar-models)* "]"' + "\n" + else: + root_rule = r'root ::= ws grammar-models' + "\n" + root_rule += "grammar-models ::= " + " | ".join( + [format_model_and_field_name(model.__name__) for model in models]) + all_rules.insert(0, root_rule) + return "\n".join(all_rules) + elif outer_object_name is not None: + if list_of_outputs: + root_rule = fr'root ::= ws "[" {format_model_and_field_name(outer_object_name)} ("," {format_model_and_field_name(outer_object_name)})* "]"' + "\n" + else: + root_rule = f"root ::= {format_model_and_field_name(outer_object_name)}\n" + + model_rule = fr'{format_model_and_field_name(outer_object_name)} ::= ws "{{" ws "\"{outer_object_name}\"" ": " grammar-models' + + fields_joined = " | ".join( + [fr'{format_model_and_field_name(model.__name__)}-grammar-model' for model in models]) + + grammar_model_rules = f'\ngrammar-models ::= {fields_joined}' + mod_rules = [] + for model in models: + mod_rule = fr'{format_model_and_field_name(model.__name__)}-grammar-model ::= ws' + mod_rule += fr'"\"{format_model_and_field_name(model.__name__)}\"" "," ws "\"{outer_object_content}\"" ws ":" ws {format_model_and_field_name(model.__name__)}' + '\n' + mod_rules.append(mod_rule) + grammar_model_rules += "\n" + "\n".join(mod_rules) + look_for_markdown_code_block = False + look_for_triple_quoted_string = False + for model in models: + model_rules, markdown_block, triple_quoted_string = generate_gbnf_grammar(model, + processed_models, created_rules) + all_rules.extend(model_rules) + if markdown_block: + look_for_markdown_code_block = True + + if triple_quoted_string: + look_for_triple_quoted_string = True + + if not look_for_markdown_code_block and not look_for_triple_quoted_string: + model_rule += ' ws "}"' + all_rules.insert(0, root_rule + model_rule + grammar_model_rules) + return "\n".join(all_rules) + + +def get_primitive_grammar(grammar): + """ + 
Returns the needed GBNF primitive grammar for a given GBNF grammar string. + + Args: + grammar (str): The string containing the GBNF grammar. + + Returns: + str: GBNF primitive grammar string. + """ + type_list = [] + if "string-list" in grammar: + type_list.append(str) + if "boolean-list" in grammar: + type_list.append(bool) + if "integer-list" in grammar: + type_list.append(int) + if "float-list" in grammar: + type_list.append(float) + additional_grammar = [generate_list_rule(t) for t in type_list] + primitive_grammar = r""" +boolean ::= "true" | "false" +null ::= "null" +string ::= "\"" ( + [^"\\] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) + )* "\"" ws +ws ::= ([ \t\n] ws)? +float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws + +integer ::= [0-9]+""" + + any_block = "" + if "custom-class-any" in grammar: + any_block = ''' +value ::= object | array | string | number | boolean | null + +object ::= + "{" ws ( + string ":" ws value + ("," ws string ":" ws value)* + )? "}" ws + +array ::= + "[" ws ( + value + ("," ws value)* + )? "]" ws + +number ::= integer | float''' + + markdown_code_block_grammar = "" + if "markdown-code-block" in grammar: + markdown_code_block_grammar = r''' +markdown-code-block ::= opening-triple-ticks markdown-code-block-content closing-triple-ticks +markdown-code-block-content ::= ( [^`] | "`" [^`] | "`" "`" [^`] )* +opening-triple-ticks ::= "```" "python" "\n" | "```" "c" "\n" | "```" "cpp" "\n" | "```" "txt" "\n" | "```" "text" "\n" | "```" "json" "\n" | "```" "javascript" "\n" | "```" "css" "\n" | "```" "html" "\n" | "```" "markdown" "\n" +closing-triple-ticks ::= "```" "\n"''' + + if "triple-quoted-string" in grammar: + markdown_code_block_grammar = r""" +triple-quoted-string ::= triple-quotes triple-quoted-string-content triple-quotes +triple-quoted-string-content ::= ( [^'] | "'" [^'] | "'" "'" [^'] )* +triple-quotes ::= "'''" """ + return "\n" + '\n'.join(additional_grammar) + any_block + primitive_grammar + markdown_code_block_grammar + + +def generate_field_markdown(field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1) -> str: + indent = ' ' * depth + field_markdown = f"{indent}- **{field_name}** (`{field_type.__name__}`): " + + # Extracting field description from Pydantic Field using __model_fields__ + field_info = model.model_fields.get(field_name) + field_description = field_info.description if field_info and field_info.description else "No description available." + + field_markdown += field_description + '\n' + + # Handling nested BaseModel fields + if isclass(field_type) and issubclass(field_type, BaseModel): + field_markdown += f"{indent} - Details:\n" + for name, type_ in field_type.__annotations__.items(): + field_markdown += generate_field_markdown(name, type_, field_type, depth + 2) + + return field_markdown + + +def generate_markdown_report(pydantic_models: List[Type[BaseModel]]) -> str: + markdown = "" + for model in pydantic_models: + markdown += f"### {format_model_and_field_name(model.__name__)}\n" + + # Check if the model's docstring is different from BaseModel's docstring + class_doc = getdoc(model) + base_class_doc = getdoc(BaseModel) + class_description = class_doc if class_doc and class_doc != base_class_doc else "No specific description available." 
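+        # Comparing against BaseModel's own docstring keeps pydantic's inherited default
+        # docstring from being reported as a model-specific description.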
+ + markdown += f"{class_description}\n\n" + markdown += "#### Fields\n" + + if isclass(model) and issubclass(model, BaseModel): + for name, field_type in model.__annotations__.items(): + markdown += generate_field_markdown(format_model_and_field_name(name), field_type, model) + markdown += "\n" + + return markdown + + +def format_json_example(example: dict, depth: int) -> str: + """ + Format a JSON example into a readable string with indentation. + + Args: + example (dict): JSON example to be formatted. + depth (int): Indentation depth. + + Returns: + str: Formatted JSON example string. + """ + indent = ' ' * depth + formatted_example = '{\n' + for key, value in example.items(): + value_text = f"'{value}'" if isinstance(value, str) else value + formatted_example += f"{indent}{key}: {value_text},\n" + formatted_example = formatted_example.rstrip(',\n') + '\n' + indent + '}' + return formatted_example + + +def generate_text_documentation(pydantic_models: List[Type[BaseModel]], model_prefix="Model", + fields_prefix="Fields", documentation_with_field_description=True) -> str: + """ + Generate text documentation for a list of Pydantic models. + + Args: + pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes. + model_prefix (str): Prefix for the model section. + fields_prefix (str): Prefix for the fields section. + documentation_with_field_description (bool): Include field descriptions in the documentation. + + Returns: + str: Generated text documentation. + """ + documentation = "" + pyd_models = [(model, True) for model in pydantic_models] + for model, add_prefix in pyd_models: + if add_prefix: + documentation += f"{model_prefix}: {format_model_and_field_name(model.__name__)}\n" + else: + documentation += f"Model: {format_model_and_field_name(model.__name__)}\n" + + # Handling multi-line model description with proper indentation + + class_doc = getdoc(model) + base_class_doc = getdoc(BaseModel) + class_description = class_doc if class_doc and class_doc != base_class_doc else "" + if class_description != "": + documentation += " Description: " + documentation += "\n" + format_multiline_description(class_description, 2) + "\n" + + if add_prefix: + # Indenting the fields section + documentation += f" {fields_prefix}:\n" + else: + documentation += f" Fields:\n" + if isclass(model) and issubclass(model, BaseModel): + for name, field_type in model.__annotations__.items(): + # if name == "markdown_code_block": + # continue + if get_origin(field_type) == list: + element_type = get_args(field_type)[0] + if isclass(element_type) and issubclass(element_type, BaseModel): + pyd_models.append((element_type, False)) + if get_origin(field_type) == Union: + element_types = get_args(field_type) + for element_type in element_types: + if isclass(element_type) and issubclass(element_type, BaseModel): + pyd_models.append((element_type, False)) + documentation += generate_field_text(name, field_type, model, + documentation_with_field_description=documentation_with_field_description) + documentation += "\n" + + if hasattr(model, 'Config') and hasattr(model.Config, + 'json_schema_extra') and 'example' in model.Config.json_schema_extra: + documentation += f" Expected Example Output for {format_model_and_field_name(model.__name__)}:\n" + json_example = json.dumps(model.Config.json_schema_extra['example']) + documentation += format_multiline_description(json_example, 2) + "\n" + + return documentation + + +def generate_field_text(field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1, 
+ documentation_with_field_description=True) -> str: + """ + Generate text documentation for a Pydantic model field. + + Args: + field_name (str): Name of the field. + field_type (Type[Any]): Type of the field. + model (Type[BaseModel]): Pydantic model class. + depth (int): Indentation depth in the documentation. + documentation_with_field_description (bool): Include field descriptions in the documentation. + + Returns: + str: Generated text documentation for the field. + """ + indent = ' ' * depth + + field_info = model.model_fields.get(field_name) + field_description = field_info.description if field_info and field_info.description else "" + + if get_origin(field_type) == list: + element_type = get_args(field_type)[0] + field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)} of {format_model_and_field_name(element_type.__name__)})" + if field_description != "": + field_text += ":\n" + else: + field_text += "\n" + elif get_origin(field_type) == Union: + element_types = get_args(field_type) + types = [] + for element_type in element_types: + types.append(format_model_and_field_name(element_type.__name__)) + field_text = f"{indent}{field_name} ({' or '.join(types)})" + if field_description != "": + field_text += ":\n" + else: + field_text += "\n" + else: + field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)})" + if field_description != "": + field_text += ":\n" + else: + field_text += "\n" + + if not documentation_with_field_description: + return field_text + + if field_description != "": + field_text += f"{indent} Description: " + field_description + "\n" + + # Check for and include field-specific examples if available + if hasattr(model, 'Config') and hasattr(model.Config, + 'json_schema_extra') and 'example' in model.Config.json_schema_extra: + field_example = model.Config.json_schema_extra['example'].get(field_name) + if field_example is not None: + example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example + field_text += f"{indent} Example: {example_text}\n" + + if isclass(field_type) and issubclass(field_type, BaseModel): + field_text += f"{indent} Details:\n" + for name, type_ in field_type.__annotations__.items(): + field_text += generate_field_text(name, type_, field_type, depth + 2) + + return field_text + + +def format_multiline_description(description: str, indent_level: int) -> str: + """ + Format a multiline description with proper indentation. + + Args: + description (str): Multiline description. + indent_level (int): Indentation level. + + Returns: + str: Formatted multiline description. + """ + indent = ' ' * indent_level + return indent + description.replace('\n', '\n' + indent) + + +def save_gbnf_grammar_and_documentation(grammar, documentation, grammar_file_path="./grammar.gbnf", + documentation_file_path="./grammar_documentation.md"): + """ + Save GBNF grammar and documentation to specified files. + + Args: + grammar (str): GBNF grammar string. + documentation (str): Documentation string. + grammar_file_path (str): File path to save the GBNF grammar. + documentation_file_path (str): File path to save the documentation. 
+ + Returns: + None + """ + try: + with open(grammar_file_path, 'w') as file: + file.write(grammar + get_primitive_grammar(grammar)) + print(f"Grammar successfully saved to {grammar_file_path}") + except IOError as e: + print(f"An error occurred while saving the grammar file: {e}") + + try: + with open(documentation_file_path, 'w') as file: + file.write(documentation) + print(f"Documentation successfully saved to {documentation_file_path}") + except IOError as e: + print(f"An error occurred while saving the documentation file: {e}") + + +def remove_empty_lines(string): + """ + Remove empty lines from a string. + + Args: + string (str): Input string. + + Returns: + str: String with empty lines removed. + """ + lines = string.splitlines() + non_empty_lines = [line for line in lines if line.strip() != ""] + string_no_empty_lines = "\n".join(non_empty_lines) + return string_no_empty_lines + + +def generate_and_save_gbnf_grammar_and_documentation(pydantic_model_list, + grammar_file_path="./generated_grammar.gbnf", + documentation_file_path="./generated_grammar_documentation.md", + outer_object_name: str = None, + outer_object_content: str = None, + model_prefix: str = "Output Model", + fields_prefix: str = "Output Fields", + list_of_outputs: bool = False, + documentation_with_field_description=True): + """ + Generate GBNF grammar and documentation, and save them to specified files. + + Args: + pydantic_model_list: List of Pydantic model classes. + grammar_file_path (str): File path to save the generated GBNF grammar. + documentation_file_path (str): File path to save the generated documentation. + outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. + outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. + model_prefix (str): Prefix for the model section in the documentation. + fields_prefix (str): Prefix for the fields section in the documentation. + list_of_outputs (bool): Whether the output is a list of items. + documentation_with_field_description (bool): Include field descriptions in the documentation. + + Returns: + None + """ + documentation = generate_text_documentation(pydantic_model_list, model_prefix, fields_prefix, + documentation_with_field_description=documentation_with_field_description) + grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, + outer_object_content, list_of_outputs) + grammar = remove_empty_lines(grammar) + save_gbnf_grammar_and_documentation(grammar, documentation, grammar_file_path, documentation_file_path) + + +def generate_gbnf_grammar_and_documentation(pydantic_model_list, outer_object_name: str = None, + outer_object_content: str = None, + model_prefix: str = "Output Model", + fields_prefix: str = "Output Fields", list_of_outputs: bool = False, + documentation_with_field_description=True): + """ + Generate GBNF grammar and documentation for a list of Pydantic models. + + Args: + pydantic_model_list: List of Pydantic model classes. + outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. + outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. + model_prefix (str): Prefix for the model section in the documentation. 
+ fields_prefix (str): Prefix for the fields section in the documentation. + list_of_outputs (bool): Whether the output is a list of items. + documentation_with_field_description (bool): Include field descriptions in the documentation. + + Returns: + tuple: GBNF grammar string, documentation string. + """ + documentation = generate_text_documentation(copy(pydantic_model_list), model_prefix, fields_prefix, + documentation_with_field_description=documentation_with_field_description) + grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, + outer_object_content, list_of_outputs) + grammar = remove_empty_lines(grammar + get_primitive_grammar(grammar)) + return grammar, documentation + + +def generate_gbnf_grammar_and_documentation_from_dictionaries(dictionaries: List[dict], + outer_object_name: str = None, + outer_object_content: str = None, + model_prefix: str = "Output Model", + fields_prefix: str = "Output Fields", + list_of_outputs: bool = False, + documentation_with_field_description=True): + """ + Generate GBNF grammar and documentation from a list of dictionaries. + + Args: + dictionaries (List[dict]): List of dictionaries representing Pydantic models. + outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. + outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. + model_prefix (str): Prefix for the model section in the documentation. + fields_prefix (str): Prefix for the fields section in the documentation. + list_of_outputs (bool): Whether the output is a list of items. + documentation_with_field_description (bool): Include field descriptions in the documentation. + + Returns: + tuple: GBNF grammar string, documentation string. + """ + pydantic_model_list = create_dynamic_models_from_dictionaries(dictionaries) + documentation = generate_text_documentation(copy(pydantic_model_list), model_prefix, fields_prefix, + documentation_with_field_description=documentation_with_field_description) + grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, + outer_object_content, list_of_outputs) + grammar = remove_empty_lines(grammar + get_primitive_grammar(grammar)) + return grammar, documentation + + +def create_dynamic_model_from_function(func: Callable): + """ + Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method. + + Args: + func (Callable): A function with type hints from which to create the model. + + Returns: + A dynamic Pydantic model class with the provided function as a 'run' method. + """ + # Extracting type hints from the provided function + type_hints = get_type_hints(func) + type_hints.pop('return', None) + + # Handling default values and annotations + dynamic_fields = {} + defaults = getattr(func, '__defaults__', ()) or () + defaults_index = len(type_hints) - len(defaults) + + for index, (name, typ) in enumerate(type_hints.items()): + if index >= defaults_index: + default_value = defaults[index - defaults_index] + dynamic_fields[name] = (typ, default_value) + else: + dynamic_fields[name] = (typ, ...) 
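+            # Illustration: for `def add(a: int, b: int = 1) -> int` this loop yields
+            # {"a": (int, ...), "b": (int, 1)}; Ellipsis marks a required field.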
+ + # Creating the dynamic model + dynamicModel = create_model(f'{func.__name__}', **dynamic_fields) + + dynamicModel.__doc__ = getdoc(func) + + # Wrapping the original function to handle instance 'self' + def run_method_wrapper(self): + func_args = {name: getattr(self, name) for name in type_hints} + return func(**func_args) + + # Adding the wrapped function as a 'run' method + setattr(dynamicModel, 'run', run_method_wrapper) + + return dynamicModel + + +def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable): + """ + Add a 'run' method to a dynamic Pydantic model, using the provided function. + + Args: + - model (Type[BaseModel]): Dynamic Pydantic model class. + - func (Callable): Function to be added as a 'run' method to the model. + + Returns: + - Type[BaseModel]: Pydantic model class with the added 'run' method. + """ + + def run_method_wrapper(self): + func_args = {name: getattr(self, name) for name in model.model_fields} + return func(**func_args) + + # Adding the wrapped function as a 'run' method + setattr(model, 'run', run_method_wrapper) + + return model + + +def create_dynamic_models_from_dictionaries(dictionaries: List[dict]): + """ + Create a list of dynamic Pydantic model classes from a list of dictionaries. + + Args: + - dictionaries (List[dict]): List of dictionaries representing model structures. + + Returns: + - List[Type[BaseModel]]: List of generated dynamic Pydantic model classes. + """ + dynamic_models = [] + for func in dictionaries: + model_name = format_model_and_field_name(func.get("name", "")) + dyn_model = convert_dictionary_to_to_pydantic_model(func, model_name) + dynamic_models.append(dyn_model) + return dynamic_models + + +def map_grammar_names_to_pydantic_model_class(pydantic_model_list): + output = {} + for model in pydantic_model_list: + output[format_model_and_field_name(model.__name__)] = model + + return output + + +from enum import Enum + + +def json_schema_to_python_types(schema): + type_map = { + 'any': Any, + 'string': str, + 'number': float, + 'integer': int, + 'boolean': bool, + 'array': list, + } + return type_map[schema] + + +def list_to_enum(enum_name, values): + return Enum(enum_name, {value: value for value in values}) + + +def convert_dictionary_to_to_pydantic_model(dictionary: dict, model_name: str = 'CustomModel') -> Type[BaseModel]: + """ + Convert a dictionary to a Pydantic model class. + + Args: + - dictionary (dict): Dictionary representing the model structure. + - model_name (str): Name of the generated Pydantic model. + + Returns: + - Type[BaseModel]: Generated Pydantic model class. + """ + fields = {} + + if "properties" in dictionary: + for field_name, field_data in dictionary.get("properties", {}).items(): + if field_data == 'object': + submodel = convert_dictionary_to_to_pydantic_model(dictionary, f'{model_name}_{field_name}') + fields[field_name] = (submodel, ...) + else: + field_type = field_data.get('type', 'str') + + if field_data.get("enum", []): + fields[field_name] = (list_to_enum(field_name, field_data.get("enum", [])), ...) + if field_type == "array": + items = field_data.get("items", {}) + if items != {}: + array = {"properties": items} + array_type = convert_dictionary_to_to_pydantic_model(array, f'{model_name}_{field_name}_items') + fields[field_name] = (List[array_type], ...) + else: + fields[field_name] = (list, ...) + elif field_type == 'object': + submodel = convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}_{field_name}') + fields[field_name] = (submodel, ...) 
+ else: + field_type = json_schema_to_python_types(field_type) + fields[field_name] = (field_type, ...) + if "function" in dictionary: + + for field_name, field_data in dictionary.get("function", {}).items(): + if field_name == "name": + model_name = field_data + elif field_name == "description": + fields["__doc__"] = field_data + elif field_name == "parameters": + return convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}') + if "parameters" in dictionary: + field_data = {"function": dictionary} + return convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}') + + custom_model = create_model(model_name, **fields) + return custom_model + + + From fa5c1fb44a2724292da545d6b7cf2a1ac0e0b989 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 12 Jan 2024 20:38:34 +0100 Subject: [PATCH 037/131] backend_sched : fix assignments ggml-ci --- ggml-backend.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ggml-backend.c b/ggml-backend.c index 4c2d8b0b26f18..505dbba476253 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1087,6 +1087,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g } } } + + // pass 2.4 expand rest down + { + ggml_tallocr_t cur_allocr = NULL; + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + ggml_tallocr_t node_allocr = node_allocr(node); + if (node_allocr != NULL) { + cur_allocr = node_allocr; + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.4"); + } + } + } #ifdef DEBUG_PASS2 fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); #endif @@ -1146,6 +1164,8 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g ggml_tallocr_t node_allocr = node_allocr(node); + GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now + if (node_allocr != cur_allocr) { sched->splits[cur_split].i_end = i; cur_split++; From f238461236f4e0e18cac1a554af23c7deadc9b01 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 12 Jan 2024 14:02:30 +0200 Subject: [PATCH 038/131] ggml : fix 32-bit ARM compat for IQ2_XS (whisper/1758) * ggml : fix 32-bit ARM compat * ggml : fix fix * ggml : fix fix fix --- ggml-quants.c | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index a24b4b2441e02..601d155d73696 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -272,10 +272,13 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 // vaddvq_s16 // vpaddq_s16 +// vpaddq_s32 // vaddvq_s32 // vaddvq_f32 // vmaxvq_f32 // vcvtnq_s32_f32 +// vzip1_u8 +// vzip2_u8 inline static int32_t vaddvq_s16(int16x8_t v) { return @@ -291,6 +294,12 @@ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { return vcombine_s16(a0, b0); } +inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { + int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); + int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); + return vcombine_s32(a0, b0); +} + inline static int32_t vaddvq_s32(int32x4_t v) { return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); } @@ -316,6 +325,28 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { return res; } +inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { + uint8x8_t res; + + res[0] = a[0]; res[1] = b[0]; + res[2] = a[1]; res[3] = b[1]; + res[4] = a[2]; res[5] 
= b[2]; + res[6] = a[3]; res[7] = b[3]; + + return res; +} + +inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { + uint8x8_t res; + + res[0] = a[4]; res[1] = b[4]; + res[2] = a[5]; res[3] = b[5]; + res[4] = a[6]; res[5] = b[6]; + res[6] = a[7]; res[7] = b[7]; + + return res; +} + // vld1q_s16_x2 // vld1q_u8_x2 // vld1q_u8_x4 @@ -7554,9 +7585,9 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - int8x16x4_t q2u; - int8x16x4_t q2s; - int8x16x4_t q8b; + ggml_int8x16x4_t q2u; + ggml_int8x16x4_t q2s; + ggml_int8x16x4_t q8b; int32x4x4_t scales32; @@ -7578,7 +7609,7 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2))); int32x4_t sumi = vdupq_n_s32(0); for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { - q8b = vld1q_s8_x4(q8); q8 += 64; + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511)))); q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511)))); q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511)))); From de473f5f8e19ba5e659cdf5af65fb9251dce16c5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 12 Jan 2024 22:02:43 +0200 Subject: [PATCH 039/131] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 3e2c579d575cd..edcdb530a270b 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -979cc23b345006504cfc1f67c0fdf627805e3319 +400c07f00508e6f60fb25405444b5669c365b0a9 From 15ebe59210e7fd9817ff67f51fa1a5ee2d004294 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 13 Jan 2024 13:44:37 +0200 Subject: [PATCH 040/131] convert : update phi-2 to latest HF repo (#4903) * convert : update phi-2 to latest HF repo ggml-ci * py : try to fix flake stuff --- convert-hf-to-gguf.py | 39 +++++++++++++++++++++---------- gguf-py/gguf/constants.py | 3 +++ gguf-py/gguf/tensor_mapping.py | 2 ++ llama.cpp | 42 ++++++++++++++++++++++++++-------- 4 files changed, 65 insertions(+), 21 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index a1c79fd478c22..b133f3b49f719 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -23,6 +23,15 @@ import gguf +# check for any of the given keys in the dictionary and return the value of the first key found +def get_key_opts(d, keys): + for k in keys: + if k in d: + return d[k] + print(f"Could not find any of {keys}") + sys.exit() + + ###### MODEL DEFINITIONS ###### class SentencePieceTokenTypes(IntEnum): @@ -257,10 +266,11 @@ def _set_vocab_gpt2(self): toktypes.append(gguf.TokenType.USER_DEFINED) elif reverse_vocab[i] in added_vocab: tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) + if hasattr(tokenizer, "added_tokens_decoder"): + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) else: tokens.append(reverse_vocab[i]) toktypes.append(gguf.TokenType.NORMAL) @@ -1068,17 +1078,22 @@ def write_tensors(self): class Phi2Model(Model): def 
set_gguf_parameters(self): - block_count = self.hparams["n_layer"] + block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"]) + + rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"]) + n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"]) + n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"]) self.gguf_writer.add_name("Phi2") - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"])) + + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(4 * n_embd) self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(self.hparams["n_head"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"]) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"])) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_add_bos_token(False) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f0a1c51f8dbe8..972b4e9a73766 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -389,6 +389,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.OUTPUT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_DOWN, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 24a0890378496..e5b146106b4ad 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -191,6 +191,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.w1", # qwen "h.{bid}.mlp.c_fc", # gpt2 "transformer.h.{bid}.mlp.fc1", # phi2 + "model.layers.{bid}.mlp.fc1", # phi2 "model.layers.layers.{bid}.mlp.up_proj", # plamo ), @@ -232,6 +233,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon "h.{bid}.mlp.c_proj", # gpt2 "transformer.h.{bid}.mlp.fc2", # phi2 + "model.layers.{bid}.mlp.fc2", # phi2 "model.layers.layers.{bid}.mlp.down_proj", # plamo ), diff --git a/llama.cpp b/llama.cpp index fe1d8947c73a0..1d2eb569f01ff 100644 --- a/llama.cpp +++ b/llama.cpp @@ -574,6 +574,9 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, @@ -3676,8 +3679,19 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); - layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, 
"bias", i), {n_embd + 2*n_embd_gqa}); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false); + + if (layer.wqkv == nullptr) { + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); + + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}); + + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); + } layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); @@ -5637,15 +5651,25 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); - cb(cur, "wqkv", il); + struct ggml_tensor * Qcur = nullptr; + struct ggml_tensor * Kcur = nullptr; + struct ggml_tensor * Vcur = nullptr; - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); + if (model.layers[il].wqkv) { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); + cb(cur, "wqkv", il); - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + } else { + Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); From ee8243adaa9a9f51ff449213383874e49efe368f Mon Sep 17 00:00:00 2001 From: makomk Date: Sat, 13 Jan 2024 14:16:11 +0000 Subject: [PATCH 041/131] server : fix crash with multimodal models without BOS token (#4904) --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c1ab8f9dc477c..7b33aea1f4fd5 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1835,7 +1835,7 @@ struct llama_server_context slot.cache_tokens = prompt_tokens; - if (slot.n_past == slot.num_prompt_tokens) + if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0) { // we have to evaluate at least 1 token to generate logits. 
LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id); From 356327feb3f66980ab687040495d722696d98970 Mon Sep 17 00:00:00 2001 From: Ziad Ben Hadj-Alouane Date: Sat, 13 Jan 2024 09:20:46 -0500 Subject: [PATCH 042/131] server : fix deadlock that occurs in multi-prompt scenarios (#4905) * * fix deadlock * * dont ruint all whitespace --- examples/server/server.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7b33aea1f4fd5..79eacf828346f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1350,14 +1350,17 @@ struct llama_server_context res.result_json["model"] = slot.oaicompat_model; } + queue_results.push_back(res); + condition_results.notify_all(); + + // done with results, unlock + lock.unlock(); + // parent multitask, if any, needs to be updated if (slot.multitask_id != -1) { update_multi_task(slot.multitask_id, slot.task_id, res); } - - queue_results.push_back(res); - condition_results.notify_all(); } void send_embedding(llama_client_slot &slot) @@ -1603,6 +1606,7 @@ struct llama_server_context } // remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue + std::vector agg_results; auto queue_iterator = queue_multitasks.begin(); while (queue_iterator != queue_multitasks.end()) { @@ -1623,8 +1627,9 @@ struct llama_server_context } aggregate_result.result_json = json{ "results", result_jsons }; - std::lock_guard lock(mutex_results); - queue_results.push_back(aggregate_result); + + agg_results.push_back(aggregate_result); + condition_results.notify_all(); queue_iterator = queue_multitasks.erase(queue_iterator); @@ -1634,6 +1639,13 @@ struct llama_server_context ++queue_iterator; } } + + // done with tasks, unlock + lock.unlock(); + + // copy aggregate results of complete multi-tasks to the results queue + std::lock_guard lock_results(mutex_results); + queue_results.insert(queue_results.end(), agg_results.begin(), agg_results.end()); } bool update_slots() { From 7dc78764e2ff86512e6e31cb0fcb8087df4b4708 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 13 Jan 2024 15:52:53 +0100 Subject: [PATCH 043/131] compare-llama-bench: tweak output format (#4910) --- scripts/compare-llama-bench.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index bc1714487525e..70737f976c5b5 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -10,15 +10,15 @@ try: import git from tabulate import tabulate -except ImportError: +except ImportError as e: print("ERROR: the following Python libraries are required: GitPython, tabulate.") - sys.exit(1) + raise e # Properties by which to differentiate results per commit: KEY_PROPERTIES = [ - "cuda", "opencl", "metal", "gpu_blas", "blas", "cpu_info", "gpu_info", "model_filename", - "model_type", "model_size", "model_n_params", "n_batch", "n_threads", "type_k", "type_v", - "n_gpu_layers", "main_gpu", "no_kv_offload", "mul_mat_q", "tensor_split", "n_prompt", "n_gen" + "cpu_info", "gpu_info", "n_gpu_layers", "main_gpu", "cuda", "opencl", "metal", "gpu_blas", + "blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads", + "type_k", "type_v", "no_kv_offload", "mul_mat_q", "tensor_split", "n_prompt", "n_gen" ] # Properties that are boolean and are 
converted to Yes/No for the table: @@ -37,6 +37,7 @@ DEFAULT_SHOW = ["model_type"] # Always show these properties by default. DEFAULT_HIDE = ["model_filename"] # Always hide these properties by default. GPU_NAME_STRIP = ["NVIDIA GeForce ", "Tesla ", "AMD Radeon "] # Strip prefixes for smaller tables. +MODEL_SUFFIX_REPLACE = {" - Small": "_S", " - Medium": "_M", " - Large": "_L"} DESCRIPTION = """Creates tables from llama-bench data written to an SQLite database. Example usage (Linux): @@ -308,8 +309,13 @@ def get_rows(properties): if gpu_blas and "gpu_info" not in properties_different: show.append("gpu_info") - show += DEFAULT_SHOW show += properties_different + + index_default = 0 + for prop in ["cpu_info", "gpu_info", "n_gpu_layers", "main_gpu"]: + if prop in show: + index_default += 1 + show = show[:index_default] + DEFAULT_SHOW + show[index_default:] for prop in DEFAULT_HIDE: try: show.remove(prop) @@ -334,6 +340,12 @@ def get_rows(properties): for row_table in table: row_table[ip] = "Yes" if int(row_table[ip]) == 1 else "No" +if "model_type" in show: + ip = show.index("model_type") + for (old, new) in MODEL_SUFFIX_REPLACE.items(): + for row_table in table: + row_table[ip] = row_table[ip].replace(old, new) + if "model_size" in show: ip = show.index("model_size") for row_table in table: @@ -341,10 +353,16 @@ def get_rows(properties): if "gpu_info" in show: ip = show.index("gpu_info") - for gns in GPU_NAME_STRIP: - for row_table in table: + for row_table in table: + for gns in GPU_NAME_STRIP: row_table[ip] = row_table[ip].replace(gns, "") + gpu_names = row_table[ip].split("/") + num_gpus = len(gpu_names) + all_names_the_same = len(set(gpu_names)) == 1 + if len(gpu_names) >= 2 and all_names_the_same: + row_table[ip] = f"{num_gpus}x {gpu_names[0]}" + headers = [PRETTY_NAMES[p] for p in show] headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"] From b38b5e93ae31019e87f692b69d27124eae6aac02 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 13 Jan 2024 18:03:45 +0200 Subject: [PATCH 044/131] metal : refactor kernel loading code (#4794) * metal : detect more GPU families * metal : refactor kernel loading * metal : set kernel family requirements * metal : fix kernel init + fix compile options * metal : take into account simdgroup reduction support * metal : print only skipped kernels * metal : fix check for simdgroup reduction support * metal : check for Metal 3 * metal : free allocations * metal : normalize encoder:setComputePipelineStatus calls ggml-ci * metal : fix Metal3 family check ggml-ci * metal : check for simdgroup matrix mul. 
feature ggml-ci --- ggml-metal.m | 1050 +++++++++++++++++++++++++------------------------- 1 file changed, 531 insertions(+), 519 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index c03624073fb61..6c28a7ee32d3f 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -26,6 +26,8 @@ #define GGML_MAX_CONCUR (2*GGML_DEFAULT_GRAPH_SIZE) +#define GGML_METAL_MAX_KERNELS 256 + struct ggml_metal_buffer { const char * name; @@ -35,6 +37,134 @@ id metal; }; +struct ggml_metal_kernel { + id function; + id pipeline; +}; + +enum ggml_metal_kernel_type { + GGML_METAL_KERNEL_TYPE_ADD, + GGML_METAL_KERNEL_TYPE_ADD_ROW, + GGML_METAL_KERNEL_TYPE_MUL, + GGML_METAL_KERNEL_TYPE_MUL_ROW, + GGML_METAL_KERNEL_TYPE_DIV, + GGML_METAL_KERNEL_TYPE_DIV_ROW, + GGML_METAL_KERNEL_TYPE_SCALE, + GGML_METAL_KERNEL_TYPE_SCALE_4, + GGML_METAL_KERNEL_TYPE_TANH, + GGML_METAL_KERNEL_TYPE_RELU, + GGML_METAL_KERNEL_TYPE_GELU, + GGML_METAL_KERNEL_TYPE_GELU_QUICK, + GGML_METAL_KERNEL_TYPE_SILU, + GGML_METAL_KERNEL_TYPE_SOFT_MAX, + GGML_METAL_KERNEL_TYPE_SOFT_MAX_4, + GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, + GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, + GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, + GGML_METAL_KERNEL_TYPE_GET_ROWS_F16, + GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0, + GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1, + GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0, + GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1, + GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0, + GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K, + GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K, + GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K, + GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K, + GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K, + GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, + GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, + GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, + GGML_METAL_KERNEL_TYPE_RMS_NORM, + GGML_METAL_KERNEL_TYPE_GROUP_NORM, + GGML_METAL_KERNEL_TYPE_NORM, + GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, + GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, + GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, + GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, + //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, + //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, + //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, + 
GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, + GGML_METAL_KERNEL_TYPE_ROPE_F32, + GGML_METAL_KERNEL_TYPE_ROPE_F16, + GGML_METAL_KERNEL_TYPE_ALIBI_F32, + GGML_METAL_KERNEL_TYPE_IM2COL_F16, + GGML_METAL_KERNEL_TYPE_UPSCALE_F32, + GGML_METAL_KERNEL_TYPE_PAD_F32, + GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, + GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, + GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, + GGML_METAL_KERNEL_TYPE_CPY_F32_F16, + GGML_METAL_KERNEL_TYPE_CPY_F32_F32, + GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, + GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0, + GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1, + //GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0, + //GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1, + GGML_METAL_KERNEL_TYPE_CPY_F16_F16, + GGML_METAL_KERNEL_TYPE_CPY_F16_F32, + GGML_METAL_KERNEL_TYPE_CONCAT, + GGML_METAL_KERNEL_TYPE_SQR, + GGML_METAL_KERNEL_TYPE_SUM_ROWS, + + GGML_METAL_KERNEL_TYPE_COUNT +}; + struct ggml_metal_context { int n_cb; @@ -50,134 +180,13 @@ int n_buffers; struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; + struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS]; + int concur_list[GGML_MAX_CONCUR]; int concur_list_len; - // custom kernels -#define GGML_METAL_DECL_KERNEL(name) \ - id function_##name; \ - id pipeline_##name - - GGML_METAL_DECL_KERNEL(add); - GGML_METAL_DECL_KERNEL(add_row); // TODO: avoid this extra kernel, instead extend the "add" kernel to support broadcast - GGML_METAL_DECL_KERNEL(mul); - GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast - GGML_METAL_DECL_KERNEL(div); - GGML_METAL_DECL_KERNEL(div_row); - GGML_METAL_DECL_KERNEL(scale); - GGML_METAL_DECL_KERNEL(scale_4); - GGML_METAL_DECL_KERNEL(tanh); - GGML_METAL_DECL_KERNEL(relu); - GGML_METAL_DECL_KERNEL(gelu); - GGML_METAL_DECL_KERNEL(gelu_quick); - GGML_METAL_DECL_KERNEL(silu); - GGML_METAL_DECL_KERNEL(soft_max); - GGML_METAL_DECL_KERNEL(soft_max_4); - GGML_METAL_DECL_KERNEL(diag_mask_inf); - GGML_METAL_DECL_KERNEL(diag_mask_inf_8); - GGML_METAL_DECL_KERNEL(get_rows_f32); - GGML_METAL_DECL_KERNEL(get_rows_f16); - GGML_METAL_DECL_KERNEL(get_rows_q4_0); - GGML_METAL_DECL_KERNEL(get_rows_q4_1); - GGML_METAL_DECL_KERNEL(get_rows_q5_0); - GGML_METAL_DECL_KERNEL(get_rows_q5_1); - GGML_METAL_DECL_KERNEL(get_rows_q8_0); - GGML_METAL_DECL_KERNEL(get_rows_q2_K); - GGML_METAL_DECL_KERNEL(get_rows_q3_K); - GGML_METAL_DECL_KERNEL(get_rows_q4_K); - GGML_METAL_DECL_KERNEL(get_rows_q5_K); - GGML_METAL_DECL_KERNEL(get_rows_q6_K); - GGML_METAL_DECL_KERNEL(get_rows_i32); - GGML_METAL_DECL_KERNEL(get_rows_iq2_xxs); - 
GGML_METAL_DECL_KERNEL(get_rows_iq2_xs); - GGML_METAL_DECL_KERNEL(rms_norm); - GGML_METAL_DECL_KERNEL(group_norm); - GGML_METAL_DECL_KERNEL(norm); - GGML_METAL_DECL_KERNEL(mul_mv_f32_f32); - GGML_METAL_DECL_KERNEL(mul_mv_f16_f16); - GGML_METAL_DECL_KERNEL(mul_mv_f16_f32); - GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row); - GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4); - GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32); - GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32); - GGML_METAL_DECL_KERNEL(mul_mv_q5_0_f32); - GGML_METAL_DECL_KERNEL(mul_mv_q5_1_f32); - GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32); - GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32); - GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32); - GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32); - GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32); - GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32); - GGML_METAL_DECL_KERNEL(mul_mv_iq2_xxs_f32); - GGML_METAL_DECL_KERNEL(mul_mv_iq2_xs_f32); - GGML_METAL_DECL_KERNEL(mul_mv_id_f32_f32); - //GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f16); - GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32); - //GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32_1row); - //GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32_l4); - GGML_METAL_DECL_KERNEL(mul_mv_id_q4_0_f32); - GGML_METAL_DECL_KERNEL(mul_mv_id_q4_1_f32); - GGML_METAL_DECL_KERNEL(mul_mv_id_q5_0_f32); - GGML_METAL_DECL_KERNEL(mul_mv_id_q5_1_f32); - GGML_METAL_DECL_KERNEL(mul_mv_id_q8_0_f32); - GGML_METAL_DECL_KERNEL(mul_mv_id_q2_K_f32); - GGML_METAL_DECL_KERNEL(mul_mv_id_q3_K_f32); - GGML_METAL_DECL_KERNEL(mul_mv_id_q4_K_f32); - GGML_METAL_DECL_KERNEL(mul_mv_id_q5_K_f32); - GGML_METAL_DECL_KERNEL(mul_mv_id_q6_K_f32); - GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xxs_f32); - GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xs_f32); - GGML_METAL_DECL_KERNEL(mul_mm_f32_f32); - GGML_METAL_DECL_KERNEL(mul_mm_f16_f32); - GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32); - GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32); - GGML_METAL_DECL_KERNEL(mul_mm_q5_0_f32); - GGML_METAL_DECL_KERNEL(mul_mm_q5_1_f32); - GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32); - GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32); - GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32); - GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32); - GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32); - GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32); - GGML_METAL_DECL_KERNEL(mul_mm_iq2_xxs_f32); - GGML_METAL_DECL_KERNEL(mul_mm_iq2_xs_f32); - GGML_METAL_DECL_KERNEL(mul_mm_id_f32_f32); - GGML_METAL_DECL_KERNEL(mul_mm_id_f16_f32); - GGML_METAL_DECL_KERNEL(mul_mm_id_q4_0_f32); - GGML_METAL_DECL_KERNEL(mul_mm_id_q4_1_f32); - GGML_METAL_DECL_KERNEL(mul_mm_id_q5_0_f32); - GGML_METAL_DECL_KERNEL(mul_mm_id_q5_1_f32); - GGML_METAL_DECL_KERNEL(mul_mm_id_q8_0_f32); - GGML_METAL_DECL_KERNEL(mul_mm_id_q2_K_f32); - GGML_METAL_DECL_KERNEL(mul_mm_id_q3_K_f32); - GGML_METAL_DECL_KERNEL(mul_mm_id_q4_K_f32); - GGML_METAL_DECL_KERNEL(mul_mm_id_q5_K_f32); - GGML_METAL_DECL_KERNEL(mul_mm_id_q6_K_f32); - GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xxs_f32); - GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xs_f32); - GGML_METAL_DECL_KERNEL(rope_f32); - GGML_METAL_DECL_KERNEL(rope_f16); - GGML_METAL_DECL_KERNEL(alibi_f32); - GGML_METAL_DECL_KERNEL(im2col_f16); - GGML_METAL_DECL_KERNEL(upscale_f32); - GGML_METAL_DECL_KERNEL(pad_f32); - GGML_METAL_DECL_KERNEL(argsort_f32_i32_asc); - GGML_METAL_DECL_KERNEL(argsort_f32_i32_desc); - GGML_METAL_DECL_KERNEL(leaky_relu_f32); - GGML_METAL_DECL_KERNEL(cpy_f32_f16); - GGML_METAL_DECL_KERNEL(cpy_f32_f32); - GGML_METAL_DECL_KERNEL(cpy_f32_q8_0); - GGML_METAL_DECL_KERNEL(cpy_f32_q4_0); - GGML_METAL_DECL_KERNEL(cpy_f32_q4_1); - //GGML_METAL_DECL_KERNEL(cpy_f32_q5_0); - 
//GGML_METAL_DECL_KERNEL(cpy_f32_q5_1); - GGML_METAL_DECL_KERNEL(cpy_f16_f16); - GGML_METAL_DECL_KERNEL(cpy_f16_f32); - GGML_METAL_DECL_KERNEL(concat); - GGML_METAL_DECL_KERNEL(sqr); - GGML_METAL_DECL_KERNEL(sum_rows); - -#undef GGML_METAL_DECL_KERNEL + bool support_simdgroup_reduction; + bool support_simdgroup_mm; }; // MSL code @@ -298,19 +307,22 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ return NULL; } - MTLCompileOptions* options = nil; + // dictionary of preprocessor macros + NSMutableDictionary * prep = [NSMutableDictionary dictionary]; + #ifdef GGML_QKK_64 - options = [MTLCompileOptions new]; - options.preprocessorMacros = @{ @"QK_K" : @(64) }; + prep[@"QK_K"] = @(64); #endif - // try to disable fast-math - // NOTE: this seems to have no effect whatsoever - // instead, in order to disable fast-math, we have to build default.metallib from the command line - // using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air - // and go through the "pre-compiled library found" path above + + MTLCompileOptions* options = [MTLCompileOptions new]; + options.preprocessorMacros = prep; + //[options setFastMathEnabled:false]; ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error]; + + [options release]; + [prep release]; } if (error) { @@ -323,16 +335,41 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ // print MTL GPU family: GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]); + const NSInteger MTLGPUFamilyMetal3 = 5001; + // determine max supported GPU family // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf - for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) { - if ([ctx->device supportsFamily:i]) { - GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i); - break; + { + for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) { + if ([ctx->device supportsFamily:i]) { + GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i); + break; + } + } + + for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) { + if ([ctx->device supportsFamily:i]) { + GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i); + break; + } + } + + for (int i = MTLGPUFamilyMetal3 + 5; i >= MTLGPUFamilyMetal3; --i) { + if ([ctx->device supportsFamily:i]) { + GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i); + break; + } } } + ctx->support_simdgroup_reduction = [ctx->device supportsFamily:MTLGPUFamilyApple7]; + ctx->support_simdgroup_reduction |= [ctx->device supportsFamily:MTLGPUFamilyMetal3]; + + ctx->support_simdgroup_mm = [ctx->device supportsFamily:MTLGPUFamilyApple7]; + + GGML_METAL_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx->support_simdgroup_reduction ? "true" : "false"); + GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false"); GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? 
"true" : "false"); GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6); if (ctx->device.maxTransferRate != 0) { @@ -346,141 +383,152 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ { NSError * error = nil; + for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) { + ctx->kernels[i].function = nil; + ctx->kernels[i].pipeline = nil; + } + /* - GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ - (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ - (int) ctx->pipeline_##name.threadExecutionWidth); \ + GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ + (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \ + (int) kernel->pipeline.threadExecutionWidth); \ */ -#define GGML_METAL_ADD_KERNEL(name) \ - ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ - ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ - if (error) { \ - GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ - return NULL; \ +#define GGML_METAL_ADD_KERNEL(e, name, supported) \ + if (supported) { \ + struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \ + kernel->function = [ctx->library newFunctionWithName:@"kernel_"#name]; \ + kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:kernel->function error:&error]; \ + GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ + (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \ + (int) kernel->pipeline.threadExecutionWidth); \ + if (error) { \ + GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + return NULL; \ + } \ + } else { \ + GGML_METAL_LOG_WARN("%s: skipping %-32s (not supported)\n", __func__, "kernel_"#name); \ } - GGML_METAL_ADD_KERNEL(add); - GGML_METAL_ADD_KERNEL(add_row); - GGML_METAL_ADD_KERNEL(mul); - GGML_METAL_ADD_KERNEL(mul_row); - GGML_METAL_ADD_KERNEL(div); - GGML_METAL_ADD_KERNEL(div_row); - GGML_METAL_ADD_KERNEL(scale); - GGML_METAL_ADD_KERNEL(scale_4); - GGML_METAL_ADD_KERNEL(tanh); - GGML_METAL_ADD_KERNEL(relu); - GGML_METAL_ADD_KERNEL(gelu); - GGML_METAL_ADD_KERNEL(gelu_quick); - GGML_METAL_ADD_KERNEL(silu); - GGML_METAL_ADD_KERNEL(soft_max); - GGML_METAL_ADD_KERNEL(soft_max_4); - GGML_METAL_ADD_KERNEL(diag_mask_inf); - GGML_METAL_ADD_KERNEL(diag_mask_inf_8); - GGML_METAL_ADD_KERNEL(get_rows_f32); - GGML_METAL_ADD_KERNEL(get_rows_f16); - GGML_METAL_ADD_KERNEL(get_rows_q4_0); - GGML_METAL_ADD_KERNEL(get_rows_q4_1); - GGML_METAL_ADD_KERNEL(get_rows_q5_0); - GGML_METAL_ADD_KERNEL(get_rows_q5_1); - GGML_METAL_ADD_KERNEL(get_rows_q8_0); - GGML_METAL_ADD_KERNEL(get_rows_q2_K); - GGML_METAL_ADD_KERNEL(get_rows_q3_K); - GGML_METAL_ADD_KERNEL(get_rows_q4_K); - GGML_METAL_ADD_KERNEL(get_rows_q5_K); - GGML_METAL_ADD_KERNEL(get_rows_q6_K); - GGML_METAL_ADD_KERNEL(get_rows_i32); - GGML_METAL_ADD_KERNEL(get_rows_iq2_xxs); - GGML_METAL_ADD_KERNEL(get_rows_iq2_xs); - GGML_METAL_ADD_KERNEL(rms_norm); - GGML_METAL_ADD_KERNEL(group_norm); - GGML_METAL_ADD_KERNEL(norm); - GGML_METAL_ADD_KERNEL(mul_mv_f32_f32); - GGML_METAL_ADD_KERNEL(mul_mv_f16_f16); - GGML_METAL_ADD_KERNEL(mul_mv_f16_f32); - 
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row); - GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4); - GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32); - GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32); - GGML_METAL_ADD_KERNEL(mul_mv_q5_0_f32); - GGML_METAL_ADD_KERNEL(mul_mv_q5_1_f32); - GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32); - GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32); - GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32); - GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32); - GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32); - GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32); - GGML_METAL_ADD_KERNEL(mul_mv_iq2_xxs_f32); - GGML_METAL_ADD_KERNEL(mul_mv_iq2_xs_f32); - GGML_METAL_ADD_KERNEL(mul_mv_id_f32_f32); - //GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f16); - GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32); - //GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32_1row); - //GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32_l4); - GGML_METAL_ADD_KERNEL(mul_mv_id_q4_0_f32); - GGML_METAL_ADD_KERNEL(mul_mv_id_q4_1_f32); - GGML_METAL_ADD_KERNEL(mul_mv_id_q5_0_f32); - GGML_METAL_ADD_KERNEL(mul_mv_id_q5_1_f32); - GGML_METAL_ADD_KERNEL(mul_mv_id_q8_0_f32); - GGML_METAL_ADD_KERNEL(mul_mv_id_q2_K_f32); - GGML_METAL_ADD_KERNEL(mul_mv_id_q3_K_f32); - GGML_METAL_ADD_KERNEL(mul_mv_id_q4_K_f32); - GGML_METAL_ADD_KERNEL(mul_mv_id_q5_K_f32); - GGML_METAL_ADD_KERNEL(mul_mv_id_q6_K_f32); - GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xxs_f32); - GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xs_f32); - if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { - GGML_METAL_ADD_KERNEL(mul_mm_f32_f32); - GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q5_0_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q5_1_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_iq2_xxs_f32); - GGML_METAL_ADD_KERNEL(mul_mm_iq2_xs_f32); - GGML_METAL_ADD_KERNEL(mul_mm_id_f32_f32); - GGML_METAL_ADD_KERNEL(mul_mm_id_f16_f32); - GGML_METAL_ADD_KERNEL(mul_mm_id_q4_0_f32); - GGML_METAL_ADD_KERNEL(mul_mm_id_q4_1_f32); - GGML_METAL_ADD_KERNEL(mul_mm_id_q5_0_f32); - GGML_METAL_ADD_KERNEL(mul_mm_id_q5_1_f32); - GGML_METAL_ADD_KERNEL(mul_mm_id_q8_0_f32); - GGML_METAL_ADD_KERNEL(mul_mm_id_q2_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_id_q3_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_id_q4_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_id_q5_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_id_q6_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xxs_f32); - GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xs_f32); - } - GGML_METAL_ADD_KERNEL(rope_f32); - GGML_METAL_ADD_KERNEL(rope_f16); - GGML_METAL_ADD_KERNEL(alibi_f32); - GGML_METAL_ADD_KERNEL(im2col_f16); - GGML_METAL_ADD_KERNEL(upscale_f32); - GGML_METAL_ADD_KERNEL(pad_f32); - GGML_METAL_ADD_KERNEL(argsort_f32_i32_asc); - GGML_METAL_ADD_KERNEL(argsort_f32_i32_desc); - GGML_METAL_ADD_KERNEL(leaky_relu_f32); - GGML_METAL_ADD_KERNEL(cpy_f32_f16); - GGML_METAL_ADD_KERNEL(cpy_f32_f32); - GGML_METAL_ADD_KERNEL(cpy_f32_q8_0); - GGML_METAL_ADD_KERNEL(cpy_f32_q4_0); - GGML_METAL_ADD_KERNEL(cpy_f32_q4_1); - //GGML_METAL_ADD_KERNEL(cpy_f32_q5_0); - //GGML_METAL_ADD_KERNEL(cpy_f32_q5_1); - GGML_METAL_ADD_KERNEL(cpy_f16_f16); - GGML_METAL_ADD_KERNEL(cpy_f16_f32); - GGML_METAL_ADD_KERNEL(concat); - GGML_METAL_ADD_KERNEL(sqr); - GGML_METAL_ADD_KERNEL(sum_rows); - -#undef GGML_METAL_ADD_KERNEL + // simd_sum and simd_max requires MTLGPUFamilyApple7 + + 
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD, add, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL, mul, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE, scale, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX, soft_max, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_4, soft_max_4, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, diag_mask_inf, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, diag_mask_inf_8, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, get_rows_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F16, get_rows_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0, get_rows_q4_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1, get_rows_q4_1, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0, get_rows_q5_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1, get_rows_q5_1, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0, get_rows_q8_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K, get_rows_q2_K, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K, get_rows_q3_K, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K, get_rows_q4_K, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K, get_rows_q5_K, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K, get_rows_q6_K, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32, mul_mv_q4_0_f32, ctx->support_simdgroup_reduction); + 
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32, mul_mv_q4_1_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, mul_mv_q5_0_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, mul_mv_q5_1_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, mul_mv_q8_0_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, mul_mv_q2_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, mul_mv_q3_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, mul_mv_q4_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32, mul_mv_q5_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, mul_mv_q6_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, mul_mv_id_f16_f32_1row, ctx->support_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, mul_mv_id_f16_f32_l4, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, mul_mv_id_q4_0_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, mul_mv_id_q4_1_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, mul_mv_id_q5_0_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32, mul_mv_id_q5_1_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32, mul_mv_id_q8_0_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32, mul_mv_id_q2_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32, mul_mv_id_q3_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32, mul_mv_id_q4_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32, mul_mv_id_q5_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, mul_mv_id_q6_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, 
mul_mm_f16_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, mul_mm_q4_1_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, mul_mm_q5_0_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32, mul_mm_q5_1_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32, mul_mm_q8_0_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32, mul_mm_q2_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32, mul_mm_q3_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32, mul_mm_q4_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32, mul_mm_q5_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, mul_mm_q6_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, mul_mm_id_q5_1_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, mul_mm_id_q8_0_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, mul_mm_id_q2_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, mul_mm_id_q3_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, mul_mm_id_q4_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, mul_mm_id_q5_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); + 
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0, cpy_f32_q4_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1, cpy_f32_q4_1, true); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0, cpy_f32_q5_0, true); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1, cpy_f32_q5_1, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16, cpy_f16_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32, cpy_f16_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT, concat, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR, sqr, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); } return ctx; @@ -488,137 +536,21 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); -#define GGML_METAL_DEL_KERNEL(name) \ - [ctx->function_##name release]; \ - [ctx->pipeline_##name release]; - - GGML_METAL_DEL_KERNEL(add); - GGML_METAL_DEL_KERNEL(add_row); - GGML_METAL_DEL_KERNEL(mul); - GGML_METAL_DEL_KERNEL(mul_row); - GGML_METAL_DEL_KERNEL(div); - GGML_METAL_DEL_KERNEL(div_row); - GGML_METAL_DEL_KERNEL(scale); - GGML_METAL_DEL_KERNEL(scale_4); - GGML_METAL_DEL_KERNEL(tanh); - GGML_METAL_DEL_KERNEL(relu); - GGML_METAL_DEL_KERNEL(gelu); - GGML_METAL_DEL_KERNEL(gelu_quick); - GGML_METAL_DEL_KERNEL(silu); - GGML_METAL_DEL_KERNEL(soft_max); - GGML_METAL_DEL_KERNEL(soft_max_4); - GGML_METAL_DEL_KERNEL(diag_mask_inf); - GGML_METAL_DEL_KERNEL(diag_mask_inf_8); - GGML_METAL_DEL_KERNEL(get_rows_f32); - GGML_METAL_DEL_KERNEL(get_rows_f16); - GGML_METAL_DEL_KERNEL(get_rows_q4_0); - GGML_METAL_DEL_KERNEL(get_rows_q4_1); - GGML_METAL_DEL_KERNEL(get_rows_q5_0); - GGML_METAL_DEL_KERNEL(get_rows_q5_1); - GGML_METAL_DEL_KERNEL(get_rows_q8_0); - GGML_METAL_DEL_KERNEL(get_rows_q2_K); - GGML_METAL_DEL_KERNEL(get_rows_q3_K); - GGML_METAL_DEL_KERNEL(get_rows_q4_K); - GGML_METAL_DEL_KERNEL(get_rows_q5_K); - GGML_METAL_DEL_KERNEL(get_rows_q6_K); - GGML_METAL_DEL_KERNEL(get_rows_i32); - GGML_METAL_DEL_KERNEL(get_rows_iq2_xxs); - GGML_METAL_DEL_KERNEL(get_rows_iq2_xs); - GGML_METAL_DEL_KERNEL(rms_norm); - GGML_METAL_DEL_KERNEL(group_norm); - GGML_METAL_DEL_KERNEL(norm); - GGML_METAL_DEL_KERNEL(mul_mv_f32_f32); - GGML_METAL_DEL_KERNEL(mul_mv_f16_f16); - GGML_METAL_DEL_KERNEL(mul_mv_f16_f32); - GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row); - GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4); - GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32); - GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32); - GGML_METAL_DEL_KERNEL(mul_mv_q5_0_f32); - GGML_METAL_DEL_KERNEL(mul_mv_q5_1_f32); - GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32); - GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32); - GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32); - GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32); - GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32); - GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32); - GGML_METAL_DEL_KERNEL(mul_mv_iq2_xxs_f32); - GGML_METAL_DEL_KERNEL(mul_mv_iq2_xs_f32); - GGML_METAL_DEL_KERNEL(mul_mv_id_f32_f32); - //GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f16); - 
GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32); - //GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32_1row); - //GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32_l4); - GGML_METAL_DEL_KERNEL(mul_mv_id_q4_0_f32); - GGML_METAL_DEL_KERNEL(mul_mv_id_q4_1_f32); - GGML_METAL_DEL_KERNEL(mul_mv_id_q5_0_f32); - GGML_METAL_DEL_KERNEL(mul_mv_id_q5_1_f32); - GGML_METAL_DEL_KERNEL(mul_mv_id_q8_0_f32); - GGML_METAL_DEL_KERNEL(mul_mv_id_q2_K_f32); - GGML_METAL_DEL_KERNEL(mul_mv_id_q3_K_f32); - GGML_METAL_DEL_KERNEL(mul_mv_id_q4_K_f32); - GGML_METAL_DEL_KERNEL(mul_mv_id_q5_K_f32); - GGML_METAL_DEL_KERNEL(mul_mv_id_q6_K_f32); - GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xxs_f32); - GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xs_f32); - if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { - GGML_METAL_DEL_KERNEL(mul_mm_f32_f32); - GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q5_0_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q5_1_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_iq2_xxs_f32); - GGML_METAL_DEL_KERNEL(mul_mm_iq2_xs_f32); - GGML_METAL_DEL_KERNEL(mul_mm_id_f32_f32); - GGML_METAL_DEL_KERNEL(mul_mm_id_f16_f32); - GGML_METAL_DEL_KERNEL(mul_mm_id_q4_0_f32); - GGML_METAL_DEL_KERNEL(mul_mm_id_q4_1_f32); - GGML_METAL_DEL_KERNEL(mul_mm_id_q5_0_f32); - GGML_METAL_DEL_KERNEL(mul_mm_id_q5_1_f32); - GGML_METAL_DEL_KERNEL(mul_mm_id_q8_0_f32); - GGML_METAL_DEL_KERNEL(mul_mm_id_q2_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_id_q3_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_id_q4_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_id_q5_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_id_q6_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xxs_f32); - GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xs_f32); - } - GGML_METAL_DEL_KERNEL(rope_f32); - GGML_METAL_DEL_KERNEL(rope_f16); - GGML_METAL_DEL_KERNEL(alibi_f32); - GGML_METAL_DEL_KERNEL(im2col_f16); - GGML_METAL_DEL_KERNEL(upscale_f32); - GGML_METAL_DEL_KERNEL(pad_f32); - GGML_METAL_DEL_KERNEL(argsort_f32_i32_asc); - GGML_METAL_DEL_KERNEL(argsort_f32_i32_desc); - GGML_METAL_DEL_KERNEL(leaky_relu_f32); - GGML_METAL_DEL_KERNEL(cpy_f32_f16); - GGML_METAL_DEL_KERNEL(cpy_f32_f32); - GGML_METAL_DEL_KERNEL(cpy_f32_q8_0); - GGML_METAL_DEL_KERNEL(cpy_f32_q4_0); - GGML_METAL_DEL_KERNEL(cpy_f32_q4_1); - //GGML_METAL_DEL_KERNEL(cpy_f32_q5_0); - //GGML_METAL_DEL_KERNEL(cpy_f32_q5_1); - GGML_METAL_DEL_KERNEL(cpy_f16_f16); - GGML_METAL_DEL_KERNEL(cpy_f16_f32); - GGML_METAL_DEL_KERNEL(concat); - GGML_METAL_DEL_KERNEL(sqr); - GGML_METAL_DEL_KERNEL(sum_rows); - -#undef GGML_METAL_DEL_KERNEL for (int i = 0; i < ctx->n_buffers; ++i) { [ctx->buffers[i].metal release]; } + for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) { + if (ctx->kernels[i].pipeline) { + [ctx->kernels[i].pipeline release]; + } + + if (ctx->kernels[i].function) { + [ctx->kernels[i].function release]; + } + } + [ctx->library release]; [ctx->queue release]; [ctx->device release]; @@ -930,7 +862,7 @@ void ggml_metal_graph_find_concurrency( } } -static bool ggml_metal_supports_op(const struct ggml_tensor * op) { +static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) { switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -956,9 +888,11 @@ static bool ggml_metal_supports_op(const struct ggml_tensor 
* op) { case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_SUM_ROWS: + return true; case GGML_OP_SOFT_MAX: case GGML_OP_RMS_NORM: case GGML_OP_GROUP_NORM: + return ctx->support_simdgroup_reduction; case GGML_OP_NORM: case GGML_OP_ALIBI: case GGML_OP_ROPE: @@ -967,9 +901,10 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) { case GGML_OP_PAD: case GGML_OP_ARGSORT: case GGML_OP_LEAKY_RELU: + return true; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: - return true; + return ctx->support_simdgroup_reduction; case GGML_OP_CPY: case GGML_OP_DUP: case GGML_OP_CONT: @@ -1007,6 +942,7 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) { return false; } } + bool ggml_metal_graph_compute( struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { @@ -1077,7 +1013,7 @@ bool ggml_metal_graph_compute( } break; } - if (!ggml_metal_supports_op(dst)) { + if (!ggml_metal_supports_op(ctx, dst)) { GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); GGML_ASSERT(!"unsupported op"); } @@ -1143,7 +1079,9 @@ bool ggml_metal_graph_compute( { const int64_t nb = ne00; - [encoder setComputePipelineState:ctx->pipeline_concat]; + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1197,18 +1135,18 @@ bool ggml_metal_graph_compute( nb = ne00 / 4; switch (dst->op) { - case GGML_OP_ADD: pipeline = ctx->pipeline_add_row; break; - case GGML_OP_MUL: pipeline = ctx->pipeline_mul_row; break; - case GGML_OP_DIV: pipeline = ctx->pipeline_div_row; break; + case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break; + case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break; + case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break; default: GGML_ASSERT(false); } bcast_row = true; } else { switch (dst->op) { - case GGML_OP_ADD: pipeline = ctx->pipeline_add; break; - case GGML_OP_MUL: pipeline = ctx->pipeline_mul; break; - case GGML_OP_DIV: pipeline = ctx->pipeline_div; break; + case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break; + case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break; + case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break; default: GGML_ASSERT(false); } } @@ -1275,9 +1213,9 @@ bool ggml_metal_graph_compute( // not sure how to avoid this // TODO: make a simpler cpy_bytes kernel - const int nth = MIN((int) ctx->pipeline_cpy_f32_f32.maxTotalThreadsPerThreadgroup, ne00); + const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; - [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; @@ -1297,10 +1235,14 @@ bool ggml_metal_graph_compute( [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } - [encoder 
setComputePipelineState:ctx->pipeline_add]; + const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1330,7 +1272,7 @@ bool ggml_metal_graph_compute( [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26]; [encoder setBytes:&offs length:sizeof(offs) atIndex:27]; - const int nth = MIN((int) ctx->pipeline_add.maxTotalThreadsPerThreadgroup, ne00); + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; @@ -1342,13 +1284,16 @@ bool ggml_metal_graph_compute( int64_t n = ggml_nelements(dst); + id pipeline = nil; + if (n % 4 == 0) { n /= 4; - [encoder setComputePipelineState:ctx->pipeline_scale_4]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline; } else { - [encoder setComputePipelineState:ctx->pipeline_scale]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline; } + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; @@ -1359,7 +1304,9 @@ bool ggml_metal_graph_compute( switch (ggml_get_unary_op(gf->nodes[i])) { case GGML_UNARY_OP_TANH: { - [encoder setComputePipelineState:ctx->pipeline_tanh]; + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -1369,7 +1316,9 @@ bool ggml_metal_graph_compute( } break; case GGML_UNARY_OP_RELU: { - [encoder setComputePipelineState:ctx->pipeline_relu]; + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -1379,7 +1328,9 @@ bool ggml_metal_graph_compute( } break; case GGML_UNARY_OP_GELU: { - [encoder setComputePipelineState:ctx->pipeline_gelu]; + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -1390,7 +1341,9 @@ bool ggml_metal_graph_compute( } break; case GGML_UNARY_OP_GELU_QUICK: { - [encoder setComputePipelineState:ctx->pipeline_gelu_quick]; + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -1401,7 +1354,9 @@ bool ggml_metal_graph_compute( } break; case GGML_UNARY_OP_SILU: { - [encoder setComputePipelineState:ctx->pipeline_silu]; + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -1420,18 +1375,23 @@ bool ggml_metal_graph_compute( { GGML_ASSERT(ggml_is_contiguous(src0)); - [encoder setComputePipelineState:ctx->pipeline_sqr]; + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline; + + [encoder 
setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; const int64_t n = ggml_nelements(dst); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_SUM_ROWS: { GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); - [encoder setComputePipelineState:ctx->pipeline_sum_rows]; + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; @@ -1465,20 +1425,23 @@ bool ggml_metal_graph_compute( { int nth = 32; // SIMD width + id pipeline = nil; + if (ne00%4 == 0) { while (nth < ne00/4 && nth < 256) { nth *= 2; } - [encoder setComputePipelineState:ctx->pipeline_soft_max_4]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_4].pipeline; } else { while (nth < ne00 && nth < 1024) { nth *= 2; } - [encoder setComputePipelineState:ctx->pipeline_soft_max]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline; } const float scale = ((float *) dst->op_params)[0]; + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; if (id_src1) { [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; @@ -1498,11 +1461,15 @@ bool ggml_metal_graph_compute( { const int n_past = ((int32_t *)(dst->op_params))[0]; + id pipeline = nil; + if (ne00%8 == 0) { - [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline; } else { - [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline; } + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; @@ -1562,23 +1529,28 @@ bool ggml_metal_graph_compute( ne00 % 32 == 0 && ne00 >= 64 && (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) { //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); + + id pipeline = nil; + switch (src0->type) { - case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break; - case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break; - case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break; - case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break; - case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_0_f32]; break; - case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_1_f32]; break; - case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q8_0_f32]; break; - case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break; - case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break; - case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break; - case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break; - case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break; - case 
GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xxs_f32]; break; - case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xs_f32]; break; + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32 ].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32 ].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32 ].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32 ].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32 ].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32 ].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32 ].pipeline; break; + case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32 ].pipeline; break; + case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32 ].pipeline; break; + case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32 ].pipeline; break; + case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32 ].pipeline; break; + case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32 ].pipeline; break; + case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break; + case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); } + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1602,12 +1574,14 @@ bool ggml_metal_graph_compute( int nrows = 1; //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); + id pipeline = nil; + // use custom matrix x vector kernel switch (src0t) { case GGML_TYPE_F32: { GGML_ASSERT(src1t == GGML_TYPE_F32); - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline; nrows = 4; } break; case GGML_TYPE_F16: @@ -1616,16 +1590,16 @@ bool ggml_metal_graph_compute( nth1 = 1; if (src1t == GGML_TYPE_F32) { if (ne11 * ne12 < 4) { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline; } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline; nrows = ne11; } else { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline; nrows = 4; } } else { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f16]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline; nrows = 4; } } break; @@ -1633,73 +1607,73 @@ bool ggml_metal_graph_compute( { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_0_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline; } break; case GGML_TYPE_Q4_1: { nth0 = 8; nth1 = 8; - [encoder 
setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline; } break; case GGML_TYPE_Q5_0: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_0_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline; } break; case GGML_TYPE_Q5_1: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_1_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline; } break; case GGML_TYPE_Q8_0: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q8_0_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline; } break; case GGML_TYPE_Q2_K: { nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q2_K_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline; } break; case GGML_TYPE_Q3_K: { nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q3_K_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline; } break; case GGML_TYPE_Q4_K: { nth0 = 4; //1; nth1 = 8; //32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_K_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline; } break; case GGML_TYPE_Q5_K: { nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_K_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline; } break; case GGML_TYPE_Q6_K: { nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline; } break; case GGML_TYPE_IQ2_XXS: { nth0 = 4; nth1 = 16; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xxs_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline; } break; case GGML_TYPE_IQ2_XS: { nth0 = 4; nth1 = 16; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xs_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline; } break; default: { @@ -1712,6 +1686,7 @@ bool ggml_metal_graph_compute( GGML_ASSERT(ne00 >= nth0*nth1); } + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1818,23 +1793,28 @@ bool ggml_metal_graph_compute( if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && ne20 % 32 == 0 && ne20 >= 64 && ne11 > ne11_mm_min) { + + id pipeline = nil; + switch (src2->type) { - case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break; - case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break; - case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_0_f32]; break; - case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_1_f32]; break; - case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_0_f32]; break; - case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_1_f32]; break; - case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q8_0_f32]; break; - case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q2_K_f32]; break; - case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q3_K_f32]; break; - case GGML_TYPE_Q4_K: [encoder 
setComputePipelineState:ctx->pipeline_mul_mm_id_q4_K_f32]; break; - case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_K_f32]; break; - case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break; - case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xxs_f32]; break; - case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xs_f32]; break; + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32 ].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32 ].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32 ].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32 ].pipeline; break; + case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32 ].pipeline; break; + case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32 ].pipeline; break; + case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32 ].pipeline; break; + case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32 ].pipeline; break; + case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32 ].pipeline; break; + case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break; + case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); } + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1874,91 +1854,93 @@ bool ggml_metal_graph_compute( int nrows = 1; //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); + id pipeline = nil; + // use custom matrix x vector kernel switch (src2t) { case GGML_TYPE_F32: { GGML_ASSERT(src1t == GGML_TYPE_F32); - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f32_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline; } break; case GGML_TYPE_F16: { GGML_ASSERT(src1t == GGML_TYPE_F32); nth0 = 32; nth1 = 1; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f16_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline; } break; case GGML_TYPE_Q4_0: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_0_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline; } break; case GGML_TYPE_Q4_1: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_1_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline; } break; case GGML_TYPE_Q5_0: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_0_f32]; + pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline; } break; case GGML_TYPE_Q5_1: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_1_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline; } break; case GGML_TYPE_Q8_0: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q8_0_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline; } break; case GGML_TYPE_Q2_K: { nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q2_K_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline; } break; case GGML_TYPE_Q3_K: { nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q3_K_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline; } break; case GGML_TYPE_Q4_K: { nth0 = 4; //1; nth1 = 8; //32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_K_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline; } break; case GGML_TYPE_Q5_K: { nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_K_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline; } break; case GGML_TYPE_Q6_K: { nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q6_K_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline; } break; case GGML_TYPE_IQ2_XXS: { nth0 = 4; nth1 = 16; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xxs_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline; } break; case GGML_TYPE_IQ2_XS: { nth0 = 4; nth1 = 16; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xs_f32]; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline; } break; default: { @@ -1973,6 +1955,7 @@ bool ggml_metal_graph_compute( const int64_t _ne1 = 1; // kernels needs a reference in constant memory + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -2040,25 +2023,28 @@ bool ggml_metal_graph_compute( } break; case GGML_OP_GET_ROWS: { + id pipeline = nil; + switch (src0->type) { - case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_get_rows_f32]; break; - case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; - case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; - case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break; - case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_0]; break; - case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_1]; break; - case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q8_0]; break; - case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break; - case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break; - case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break; - case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break; - case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break; - case GGML_TYPE_I32: [encoder 
setComputePipelineState:ctx->pipeline_get_rows_i32]; break; - case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xxs]; break; - case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xs]; break; + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32 ].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16 ].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0 ].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1 ].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0 ].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1 ].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0 ].pipeline; break; + case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K ].pipeline; break; + case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K ].pipeline; break; + case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K ].pipeline; break; + case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K ].pipeline; break; + case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K ].pipeline; break; + case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break; + case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break; + case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; default: GGML_ASSERT(false && "not implemented"); } + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -2086,7 +2072,9 @@ bool ggml_metal_graph_compute( nth *= 2; } - [encoder setComputePipelineState:ctx->pipeline_rms_norm]; + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; @@ -2115,7 +2103,9 @@ bool ggml_metal_graph_compute( // nth *= 2; //} - [encoder setComputePipelineState:ctx->pipeline_group_norm]; + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; @@ -2137,7 +2127,9 @@ bool ggml_metal_graph_compute( const int nth = MIN(256, ne00); - [encoder setComputePipelineState:ctx->pipeline_norm]; + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; @@ -2164,7 +2156,9 @@ bool ggml_metal_graph_compute( const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); - [encoder 
setComputePipelineState:ctx->pipeline_alibi_f32]; + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; @@ -2209,12 +2203,15 @@ bool ggml_metal_graph_compute( memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + id pipeline = nil; + switch (src0->type) { - case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break; - case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_rope_f16]; break; + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F16].pipeline; break; default: GGML_ASSERT(false); }; + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -2277,12 +2274,15 @@ bool ggml_metal_graph_compute( const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; + id pipeline = nil; + switch (src0->type) { case GGML_TYPE_F32: GGML_ASSERT(false && "not implemented"); break; - case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_im2col_f16]; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break; default: GGML_ASSERT(false); }; + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ofs0 length:sizeof( int32_t) atIndex:2]; @@ -2305,7 +2305,9 @@ bool ggml_metal_graph_compute( const int sf = dst->op_params[0]; - [encoder setComputePipelineState:ctx->pipeline_upscale_f32]; + const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; @@ -2326,7 +2328,7 @@ bool ggml_metal_graph_compute( [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; [encoder setBytes:&sf length:sizeof(sf) atIndex:18]; - const int nth = MIN((int) ctx->pipeline_upscale_f32.maxTotalThreadsPerThreadgroup, ne0); + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; @@ -2334,7 +2336,9 @@ bool ggml_metal_graph_compute( { GGML_ASSERT(src0->type == GGML_TYPE_F32); - [encoder setComputePipelineState:ctx->pipeline_pad_f32]; + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; @@ -2367,12 +2371,15 @@ bool ggml_metal_graph_compute( enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; + id pipeline = nil; + switch (order) { - case GGML_SORT_ASC: [encoder setComputePipelineState:ctx->pipeline_argsort_f32_i32_asc]; break; - case GGML_SORT_DESC: [encoder 
setComputePipelineState:ctx->pipeline_argsort_f32_i32_desc]; break; + case GGML_SORT_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; + case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; default: GGML_ASSERT(false); }; + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; @@ -2386,7 +2393,9 @@ bool ggml_metal_graph_compute( float slope; memcpy(&slope, dst->op_params, sizeof(float)); - [encoder setComputePipelineState:ctx->pipeline_leaky_relu_f32]; + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&slope length:sizeof(slope) atIndex:2]; @@ -2403,33 +2412,36 @@ bool ggml_metal_graph_compute( int nth = MIN(1024, ne00/ggml_blck_size(src0->type)); + id pipeline = nil; + switch (src0t) { case GGML_TYPE_F32: { GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0); switch (dstt) { - case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break; - case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break; - case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q8_0]; break; - case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_0]; break; - case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_1]; break; - //case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_0]; break; - //case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_1]; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline; break; + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break; + //case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break; + //case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break; default: GGML_ASSERT(false && "not implemented"); }; } break; case GGML_TYPE_F16: { switch (dstt) { - case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break; - case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f32]; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break; + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break; default: GGML_ASSERT(false && "not implemented"); }; } break; default: GGML_ASSERT(false && "not implemented"); } + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; @@ -2794,9 +2806,9 @@ static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml } static bool 
ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { - return ggml_metal_supports_op(op); + struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; - UNUSED(backend); + return ggml_metal_supports_op(metal_ctx, op); } static struct ggml_backend_i ggml_backend_metal_i = { From c30b1ef39aeba497a943416d2897d69fee055b96 Mon Sep 17 00:00:00 2001 From: texmex76 <40733439+texmex76@users.noreply.github.com> Date: Sat, 13 Jan 2024 17:06:20 +0100 Subject: [PATCH 045/131] gguf : fix potential infinite for-loop (#4600) Co-authored-by: Bernhard Gstrein --- ggml.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml.c b/ggml.c index 6dbd7626c9e23..de6ef34bdde59 100644 --- a/ggml.c +++ b/ggml.c @@ -19184,7 +19184,7 @@ void gguf_free(struct gguf_context * ctx) { if (ctx->kv) { // free string memory - not great.. - for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { struct gguf_kv * kv = &ctx->kv[i]; if (kv->key.data) { @@ -19200,7 +19200,7 @@ void gguf_free(struct gguf_context * ctx) { if (kv->type == GGUF_TYPE_ARRAY) { if (kv->value.arr.data) { if (kv->value.arr.type == GGUF_TYPE_STRING) { - for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + for (uint64_t j = 0; j < kv->value.arr.n; ++j) { struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j]; if (str->data) { free(str->data); @@ -19216,7 +19216,7 @@ void gguf_free(struct gguf_context * ctx) { } if (ctx->infos) { - for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct gguf_tensor_info * info = &ctx->infos[i]; if (info->name.data) { From 722d33f34ec74c6f7046109f936d0928ffe171bc Mon Sep 17 00:00:00 2001 From: Yann Follet <131855179+YannFollet@users.noreply.github.com> Date: Sun, 14 Jan 2024 00:09:08 +0800 Subject: [PATCH 046/131] main : add parameter --no-display-prompt (#4541) * add the parameter : --no-display-prompt , combine with --log-disable it will display only the generated tokens * remove empty line --------- Co-authored-by: Georgi Gerganov --- common/common.cpp | 6 +++++- common/common.h | 1 + examples/main/main.cpp | 7 ++++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 322b9f91e5041..c11006bcb9175 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -617,6 +617,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.numa = true; } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; + } else if (arg == "--no-display-prompt") { + params.display_prompt = false; } else if (arg == "-r" || arg == "--reverse-prompt") { if (++i >= argc) { invalid_param = true; @@ -936,11 +938,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); #endif + printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); + printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? 
"true" : "false"); printf(" -gan N, --grp-attn-n N\n"); printf(" group-attention factor (default: %d)\n", params.grp_attn_n); printf(" -gaw N, --grp-attn-w N\n"); printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w); - printf(" --verbose-prompt print prompt before generation\n"); printf(" -dkvc, --dump-kv-cache\n"); printf(" verbose print of the KV cache\n"); printf(" -nkvo, --no-kv-offload\n"); @@ -1582,6 +1585,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); + fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); } // diff --git a/common/common.h b/common/common.h index f29be5b5ab87f..096468243d88c 100644 --- a/common/common.h +++ b/common/common.h @@ -126,6 +126,7 @@ struct gpt_params { bool use_mlock = false; // use mlock to keep model in memory bool numa = false; // attempt optimizations that help on some NUMA systems bool verbose_prompt = false; // print prompt tokens before generation + bool display_prompt = true; // print prompt before generation bool infill = false; // use infill mode bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes bool no_kv_offload = false; // disable KV offloading diff --git a/examples/main/main.cpp b/examples/main/main.cpp index c53b29978657c..58b7f807a9cca 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -477,6 +477,7 @@ int main(int argc, char ** argv) { bool is_antiprompt = false; bool input_echo = true; + bool display = true; bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size(); int n_past = 0; @@ -491,6 +492,7 @@ int main(int argc, char ** argv) { // the first thing we will do is to output the prompt, so set color accordingly console::set_display(console::prompt); + display = params.display_prompt; std::vector embd; std::vector embd_guidance; @@ -707,7 +709,7 @@ int main(int argc, char ** argv) { } // display text - if (input_echo) { + if (input_echo && display) { for (auto id : embd) { const std::string token_str = llama_token_to_piece(ctx, id); printf("%s", token_str.c_str()); @@ -724,6 +726,7 @@ int main(int argc, char ** argv) { // reset color to default if there is no pending user input if (input_echo && (int) embd_inp.size() == n_consumed) { console::set_display(console::reset); + display = true; } // if not currently processing queued inputs; @@ -796,6 +799,7 @@ int main(int argc, char ** argv) { // color user input only console::set_display(console::user_input); + display = params.display_prompt; std::string line; bool another_line = true; @@ -806,6 +810,7 @@ int main(int argc, char ** argv) { // done taking input, reset color console::set_display(console::reset); + display = true; // Add tokens to embd only if the input buffer is non-empty // Entering a empty line lets the user pass control back From 6b48ed089377330cdb362970a51c1c89b6d857a8 Mon Sep 17 00:00:00 2001 From: Someone Date: Sat, 13 Jan 2024 16:29:16 +0000 Subject: [PATCH 047/131] workflows: unbreak nix-build-aarch64, and split it out (#4915) The fix should be just the `sudo apt-get update` --- .github/workflows/nix-ci-aarch64.yml | 55 ++++++++++++++++++++++++++++ .github/workflows/nix-ci.yml | 41 --------------------- 2 files changed, 55 insertions(+), 41 
deletions(-) create mode 100644 .github/workflows/nix-ci-aarch64.yml diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml new file mode 100644 index 0000000000000..be7c26d40bb29 --- /dev/null +++ b/.github/workflows/nix-ci-aarch64.yml @@ -0,0 +1,55 @@ +name: Nix aarch64 builds + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix'] + pull_request: + types: [opened, synchronize, reopened] + paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix'] + +jobs: + nix-build-aarch64: + if: ${{ vars.CACHIX_NAME != '' }} + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Install QEMU + # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654 + run: | + sudo apt-get update + sudo apt-get install -y qemu-user-static qemu-system-aarch64 + sudo usermod -a -G kvm $USER + - name: Install Nix + uses: DeterminateSystems/nix-installer-action@v9 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + extra-conf: | + extra-platforms = aarch64-linux + extra-system-features = nixos-test kvm + extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + - uses: DeterminateSystems/magic-nix-cache-action@v2 + with: + upstream-cache: https://${{ matrix.cachixName }}.cachix.org + - name: Set-up cachix to push the results to + uses: cachix/cachix-action@v13 + with: + authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' + name: ${{ vars.CACHIX_NAME }} + - name: Show all output paths + run: > + nix run github:nix-community/nix-eval-jobs + -- --gc-roots-dir gcroot + --flake + ".#packages.aarch64-linux" + - name: Build + run: > + nix run github:Mic92/nix-fast-build + -- --skip-cached --no-nom + --systems aarch64-linux + --flake + ".#checks.aarch64-linux" diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml index a38c6ead456b0..845b93bfb8e97 100644 --- a/.github/workflows/nix-ci.yml +++ b/.github/workflows/nix-ci.yml @@ -69,44 +69,3 @@ jobs: -- --skip-cached --no-nom --flake ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)" - nix-build-aarch64: - if: ${{ vars.CACHIX_NAME != '' }} - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Install QEMU - # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654 - run: | - sudo apt-get install -y qemu-user-static qemu-system-aarch64 - sudo usermod -a -G kvm $USER - - name: Install Nix - uses: DeterminateSystems/nix-installer-action@v9 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - extra-conf: | - extra-platforms = aarch64-linux - extra-system-features = nixos-test kvm - extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - - uses: DeterminateSystems/magic-nix-cache-action@v2 - with: - upstream-cache: https://${{ matrix.cachixName }}.cachix.org - - name: Set-up cachix to push the 
results to - uses: cachix/cachix-action@v13 - with: - authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - name: ${{ vars.CACHIX_NAME }} - - name: Show all output paths - run: > - nix run github:nix-community/nix-eval-jobs - -- --gc-roots-dir gcroot - --flake - ".#packages.aarch64-linux" - - name: Build - run: > - nix run github:Mic92/nix-fast-build - -- --skip-cached --no-nom - --systems aarch64-linux - --flake - ".#checks.aarch64-linux" From df845cc982e7e2ea7b9900e29d55b15338faa78d Mon Sep 17 00:00:00 2001 From: David Friehs Date: Sat, 13 Jan 2024 17:29:43 +0100 Subject: [PATCH 048/131] llama : minimize size used for state save/load (#4820) * examples : save-load-state: save only required state * llama : only reserve n_vocab * n_batch at most for logits llama_decode asserts that only n_batch tokens are passed each call, and n_ctx is expected to be bigger than n_batch. * llama : always reserve n_vocab * n_batch for logits llama_context de-serialization breaks if the contexts have differing capacity for logits and llama_decode will at maximum resize to n_vocab * n_batch. * llama : only save and restore used logits for batch sizes of 512 this reduces save state in the best case by around 62 MB, which can be a lot if planning to save on each message to allow regenerating messages. * llama : use ostringstream and istringstream for save and load * llama : serialize rng into minimum amount of space required * llama : break session version due to serialization changes --- examples/save-load-state/save-load-state.cpp | 21 ++++---- llama.cpp | 51 +++++++------------- llama.h | 2 +- 3 files changed, 28 insertions(+), 46 deletions(-) diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 48d80111010df..ef952e2bd987c 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -45,13 +45,13 @@ int main(int argc, char ** argv) { // save state (rng, logits, embedding and kv_cache) to file { std::vector state_mem(llama_get_state_size(ctx)); + const size_t written = llama_copy_state_data(ctx, state_mem.data()); - { - FILE *fp_write = fopen("dump_state.bin", "wb"); - llama_copy_state_data(ctx, state_mem.data()); // could also copy directly to memory mapped file - fwrite(state_mem.data(), 1, state_mem.size(), fp_write); - fclose(fp_write); - } + FILE *fp_write = fopen("dump_state.bin", "wb"); + fwrite(state_mem.data(), 1, written, fp_write); + fclose(fp_write); + + fprintf(stderr, "%s : serialized state into %zd out of a maximum of %zd bytes\n", __func__, written, state_mem.size()); } // save state (last tokens) @@ -100,18 +100,17 @@ int main(int argc, char ** argv) { std::vector state_mem(llama_get_state_size(ctx2)); FILE * fp_read = fopen("dump_state.bin", "rb"); + const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read); + fclose(fp_read); - const size_t ret = fread(state_mem.data(), 1, state_mem.size(), fp_read); - if (ret != state_mem.size()) { + if (read != llama_set_state_data(ctx2, state_mem.data())) { fprintf(stderr, "\n%s : failed to read state\n", __func__); llama_free(ctx2); llama_free_model(model); return 1; } - llama_set_state_data(ctx2, state_mem.data()); - - fclose(fp_read); + fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size()); } // restore state (last tokens) diff --git a/llama.cpp b/llama.cpp index 1d2eb569f01ff..2754560884e5f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9379,12 +9379,8 @@ struct 
llama_context * llama_new_context_with_model( ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } - // resized during inference - if (params.logits_all) { - ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab); - } else { - ctx->logits.reserve(hparams.n_vocab); - } + // resized during inference, reserve maximum + ctx->logits.reserve(hparams.n_vocab*cparams.n_batch); if (params.embedding){ ctx->embedding.resize(hparams.n_embd); @@ -9731,8 +9727,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) { // for reference, std::mt19937(1337) serializes to 6701 bytes. const size_t s_rng_size = sizeof(size_t); const size_t s_rng = LLAMA_MAX_RNG_STATE; - const size_t s_logits_capacity = sizeof(size_t); const size_t s_logits_size = sizeof(size_t); + // assume worst case for logits although only currently set ones are serialized const size_t s_logits = ctx->logits.capacity() * sizeof(float); const size_t s_embedding_size = sizeof(size_t); const size_t s_embedding = ctx->embedding.size() * sizeof(float); @@ -9743,7 +9739,6 @@ size_t llama_get_state_size(const struct llama_context * ctx) { const size_t s_total = ( + s_rng_size + s_rng - + s_logits_capacity + s_logits_size + s_logits + s_embedding_size @@ -9812,37 +9807,27 @@ struct llama_data_file_context : llama_data_context { static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { // copy rng { - std::stringstream rng_ss; + std::ostringstream rng_ss; rng_ss << ctx->rng; - const size_t rng_size = rng_ss.str().size(); - char rng_buf[LLAMA_MAX_RNG_STATE]; + const std::string & rng_str = rng_ss.str(); + const size_t rng_size = rng_str.size(); - memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE); - memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size()); + GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE); - data_ctx->write(&rng_size, sizeof(rng_size)); - data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE); + data_ctx->write(&rng_size, sizeof(rng_size)); + data_ctx->write(rng_str.data(), rng_size); } // copy logits { - const size_t logits_cap = ctx->logits.capacity(); const size_t logits_size = ctx->logits.size(); - data_ctx->write(&logits_cap, sizeof(logits_cap)); data_ctx->write(&logits_size, sizeof(logits_size)); if (logits_size) { data_ctx->write(ctx->logits.data(), logits_size * sizeof(float)); } - - // If there is a gap between the size and the capacity, write padding - size_t padding_size = (logits_cap - logits_size) * sizeof(float); - if (padding_size > 0) { - std::vector padding(padding_size, 0); // Create a buffer filled with zeros - data_ctx->write(padding.data(), padding_size); - } } // copy embeddings @@ -9925,13 +9910,13 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { // set rng { size_t rng_size; - char rng_buf[LLAMA_MAX_RNG_STATE]; + memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size); - memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size); - memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE; + GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE); - std::stringstream rng_ss; - rng_ss.str(std::string(&rng_buf[0], rng_size)); + std::string rng_str((char *)inp, rng_size); inp += rng_size; + + std::istringstream rng_ss(rng_str); rng_ss >> ctx->rng; GGML_ASSERT(!rng_ss.fail()); @@ -9939,20 +9924,18 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { // set logits { - size_t logits_cap; size_t logits_size; - memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap); 
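// Rough arithmetic behind the "around 62 MB" best-case saving quoted in the commit
// message: with the default batch size of 512 and a 32000-entry vocabulary (the
// vocabulary size is an assumption for illustration, it is not stated in the patch),
// the worst-case logits buffer that was previously written in full is
//
//     32000 * 512 * sizeof(float) = 65,536,000 bytes, roughly 62.5 MiB
//
// so serializing only the logits actually in use can skip almost all of it.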
memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size); - GGML_ASSERT(ctx->logits.capacity() == logits_cap); + GGML_ASSERT(ctx->logits.capacity() >= logits_size); if (logits_size) { ctx->logits.resize(logits_size); + memcpy(ctx->logits.data(), inp, logits_size * sizeof(float)); + inp += logits_size * sizeof(float); } - - inp += logits_cap * sizeof(float); } // set embeddings diff --git a/llama.h b/llama.h index 689e12d7ce092..01d6fafaa4b0d 100644 --- a/llama.h +++ b/llama.h @@ -43,7 +43,7 @@ #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN -#define LLAMA_SESSION_VERSION 3 +#define LLAMA_SESSION_VERSION 4 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. From 2d57de525541247132e354f561ff48775fba5d85 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 13 Jan 2024 18:46:37 +0200 Subject: [PATCH 049/131] metal : disable log for loaded kernels (#4794) --- ggml-metal.m | 3 --- 1 file changed, 3 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 6c28a7ee32d3f..57e4448278a31 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -398,9 +398,6 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \ kernel->function = [ctx->library newFunctionWithName:@"kernel_"#name]; \ kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:kernel->function error:&error]; \ - GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ - (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \ - (int) kernel->pipeline.threadExecutionWidth); \ if (error) { \ GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ From f172de03f11465dc6c5a0fc3a22f8ec254c6832c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 13 Jan 2024 18:47:38 +0200 Subject: [PATCH 050/131] llama : fix detokenization of non-special added-tokens (#4916) Co-authored-by: goerch --- llama.cpp | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/llama.cpp b/llama.cpp index 2754560884e5f..2190ea7aa92c2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -10305,6 +10305,8 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token if (0 <= token && token < llama_n_vocab(model)) { switch (llama_vocab_get_type(model->vocab)) { case LLAMA_VOCAB_TYPE_SPM: { + // NOTE: we accept all unsupported token types, + // suppressing them like CONTROL tokens. 
if (llama_is_normal_token(model->vocab, token)) { std::string result = model->vocab.id_to_token[token].text; llama_unescape_whitespace(result); @@ -10313,6 +10315,13 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token } memcpy(buf, result.c_str(), result.length()); return result.length(); + } else if (llama_is_user_defined_token(model->vocab, token)) { + std::string result = model->vocab.id_to_token[token].text; + if (length < (int) result.length()) { + return -result.length(); + } + memcpy(buf, result.c_str(), result.length()); + return result.length(); } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT if (length < 3) { return -3; @@ -10327,14 +10336,12 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token } buf[0] = llama_token_to_byte(model->vocab, token); return 1; - } else { - // TODO: for now we accept all unsupported token types, - // suppressing them like CONTROL tokens. - // GGML_ASSERT(false); } break; } case LLAMA_VOCAB_TYPE_BPE: { + // NOTE: we accept all unsupported token types, + // suppressing them like CONTROL tokens. if (llama_is_normal_token(model->vocab, token)) { std::string result = model->vocab.id_to_token[token].text; result = llama_decode_text(result); @@ -10343,12 +10350,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token } memcpy(buf, result.c_str(), result.length()); return result.length(); + } else if (llama_is_user_defined_token(model->vocab, token)) { + std::string result = model->vocab.id_to_token[token].text; + if (length < (int) result.length()) { + return -result.length(); + } + memcpy(buf, result.c_str(), result.length()); + return result.length(); } else if (llama_is_control_token(model->vocab, token)) { ; - } else { - // TODO: for now we accept all unsupported token types, - // suppressing them like CONTROL tokens. 
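The detokenization fix above makes llama_token_to_piece() return the text of user-defined added tokens instead of silently suppressing them. As the hunk shows, the function returns the number of bytes written, or the negative of the required size when the caller's buffer is too small, so a caller can retry once with a larger buffer. A hypothetical convenience wrapper sketching that pattern (not from the patch):

    #include <algorithm>
    #include <string>
    #include <vector>

    #include "llama.h"

    // hypothetical wrapper: convert one token to its piece, growing the buffer on demand
    static std::string token_to_piece(const llama_model * model, llama_token tok) {
        std::vector<char> buf(8);
        int32_t n = llama_token_to_piece(model, tok, buf.data(), (int32_t) buf.size());
        if (n < 0) {
            buf.resize(-n); // negative return value encodes the required size
            n = llama_token_to_piece(model, tok, buf.data(), (int32_t) buf.size());
        }
        return std::string(buf.data(), std::max(n, (int32_t) 0));
    }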
- // GGML_ASSERT(false); } break; } From 0ea069b87bd296c556824e57455433b6c0357340 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 13 Jan 2024 19:31:26 +0200 Subject: [PATCH 051/131] server : fix prompt caching with system prompt (#4914) --- examples/server/server.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 79eacf828346f..93f99929880f6 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1180,8 +1180,9 @@ struct llama_server_context return slot.images.size() > 0; } - void send_error(task_server& task, std::string error) + void send_error(task_server& task, const std::string &error) { + LOG_TEE("task %i - error: %s\n", task.id, error.c_str()); std::unique_lock lock(mutex_results); task_result res; res.id = task.id; @@ -1570,12 +1571,22 @@ struct llama_server_context LOG_TEE("slot unavailable\n"); // send error result send_error(task, "slot unavailable"); - return; + break; } if (task.data.contains("system_prompt")) { + if (!all_slots_are_idle) { + send_error(task, "system prompt can only be updated when all slots are idle"); + break; + } process_system_prompt_data(task.data["system_prompt"]); + + // reset cache_tokens for all slots + for (llama_client_slot &slot : slots) + { + slot.cache_tokens.clear(); + } } slot->reset(); @@ -1652,8 +1663,7 @@ struct llama_server_context // attend tasks process_tasks(); - // update the system prompt wait until all slots are idle state - if (system_need_update && all_slots_are_idle) + if (system_need_update) { LOG_TEE("updating system prompt\n"); update_system_prompt(); From 4be5ef556de830c5c4f6e45c05ef4427823fe607 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 13 Jan 2024 20:45:45 +0200 Subject: [PATCH 052/131] metal : remove old API (#4919) ggml-ci --- Makefile | 9 -- examples/CMakeLists.txt | 3 - examples/metal/CMakeLists.txt | 4 - examples/metal/metal.cpp | 103 ------------- ggml-metal.h | 55 +------ ggml-metal.m | 276 +++------------------------------- llama.cpp | 4 +- 7 files changed, 27 insertions(+), 427 deletions(-) delete mode 100644 examples/metal/CMakeLists.txt delete mode 100644 examples/metal/metal.cpp diff --git a/Makefile b/Makefile index 05fe9a0f6a0d2..995b89f7adac9 100644 --- a/Makefile +++ b/Makefile @@ -43,10 +43,6 @@ ifeq ($(UNAME_S),Darwin) endif endif -ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))' -BUILD_TARGETS += metal -endif - default: $(BUILD_TARGETS) test: $(TEST_TARGETS) @@ -671,11 +667,6 @@ lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -ifdef LLAMA_METAL -metal: examples/metal/metal.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -endif - ifeq ($(UNAME_S),Darwin) swift: examples/batched.swift (cd examples/batched.swift; make build) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index fa127a3aa7c9e..f67d74c5530c9 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -37,9 +37,6 @@ else() add_subdirectory(lookup) add_subdirectory(train-text-from-scratch) add_subdirectory(imatrix) - if (LLAMA_METAL) - add_subdirectory(metal) - endif() if (LLAMA_BUILD_SERVER) add_subdirectory(server) endif() diff --git a/examples/metal/CMakeLists.txt b/examples/metal/CMakeLists.txt deleted file mode 100644 index f16d491655948..0000000000000 --- a/examples/metal/CMakeLists.txt +++ 
/dev/null @@ -1,4 +0,0 @@ -set(TEST_TARGET metal) -add_executable(${TEST_TARGET} metal.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TEST_TARGET} PRIVATE ggml) diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp deleted file mode 100644 index 16c1146f94e33..0000000000000 --- a/examples/metal/metal.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Evaluate a statically exported ggml computation graph with Metal -// -// - First, export a LLaMA graph: -// -// $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export -// -// - Run this tool to evaluate the exported graph: -// -// $ ./bin/metal llama.ggml -// -// The purpose of this tool is mostly for debugging and demonstration purposes. -// The main limitation of exporting computation graphs is that their sizes are static which often -// can be a problem for real-world applications. -// - -#include "ggml.h" -#include "ggml-metal.h" - -#include -#include -#include - -int main(int argc, char ** argv) { - ggml_time_init(); - - if (argc != 2) { - fprintf(stderr, "Usage: %s llama.ggml\n", argv[0]); - return -1; - } - - const char * fname_cgraph = argv[1]; - - // load the compute graph - struct ggml_context * ctx_data = NULL; - struct ggml_context * ctx_eval = NULL; - - struct ggml_cgraph * gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); - - // this allocates all Metal resources and memory buffers - auto * ctx_metal = ggml_metal_init(1); - - const size_t max_size_data = ggml_get_max_tensor_size(ctx_data); - const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval); - ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data), max_size_data); - ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval), max_size_eval); - - // main - { - struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd"); - *(int32_t *) input->data = 1; // BOS - - ggml_metal_set_tensor(ctx_metal, input); - - // warmup - ggml_metal_graph_compute(ctx_metal, gf); - - const int n_iter = 16; - - const int64_t t0 = ggml_time_us(); - - // the actual inference happens here - for (int i = 0; i < n_iter; ++i) { - ggml_metal_graph_compute(ctx_metal, gf); - } - - const int64_t t1 = ggml_time_us(); - - printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter); - } - - // debug output - { - struct ggml_tensor * logits = gf->nodes[gf->n_nodes - 1]; - ggml_metal_get_tensor(ctx_metal, logits); - - float * ptr = (float *) ggml_get_data(logits); - - printf("logits: "); - for (int i = 0; i < 10; i++) { - printf("%8.4f ", ptr[i]); - } - printf("\n"); - int imax = 0; - double sum = 0.0; - double vmax = -1e9; - for (int i = 0; i < 32000; i++) { - sum += (double) ptr[i]; - if (ptr[i] > vmax) { - vmax = ptr[i]; - imax = i; - } - } - printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax); - } - - ggml_metal_free(ctx_metal); - - ggml_free(ctx_data); - ggml_free(ctx_eval); - - return 0; -} - diff --git a/ggml-metal.h b/ggml-metal.h index c4b7325da6187..cd5e2995f66f6 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -36,64 +36,13 @@ struct ggml_cgraph; extern "C" { #endif -// -// internal API -// temporary exposed to user-code -// - -struct ggml_metal_context; - -void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); - -// number of command buffers to use -struct ggml_metal_context * ggml_metal_init(int n_cb); -void ggml_metal_free(struct ggml_metal_context * ctx); - -void * ggml_metal_host_malloc(size_t n); -void 
ggml_metal_host_free (void * data); - -// set the number of command buffers to use -void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); - -// creates a mapping between a host memory buffer and a device memory buffer -// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute -// - the mapping is used during computation to determine the arguments of the compute kernels -// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal -// - max_size specifies the maximum size of a tensor and is used to create shared views such -// that it is guaranteed that the tensor will fit in at least one of the views -// -bool ggml_metal_add_buffer( - struct ggml_metal_context * ctx, - const char * name, - void * data, - size_t size, - size_t max_size); - -// set data from host memory into the device -void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); - -// get data from the device into host memory -void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); - -// try to find operations that can be run concurrently in the graph -// you should run it again if the topology of your graph changes -void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem); - -// if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized -int ggml_metal_if_optimized(struct ggml_metal_context * ctx); - -// output the concur_list for ggml_alloc -int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx); - -// same as ggml_graph_compute but uses Metal -// creates gf->n_threads command buffers in parallel -bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); - // // backend API // user-code should use only these functions // +GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); + GGML_API ggml_backend_t ggml_backend_metal_init(void); GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); diff --git a/ggml-metal.m b/ggml-metal.m index 57e4448278a31..cae52c9830cb2 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -24,8 +24,6 @@ #define UNUSED(x) (void)(x) -#define GGML_MAX_CONCUR (2*GGML_DEFAULT_GRAPH_SIZE) - #define GGML_METAL_MAX_KERNELS 256 struct ggml_metal_buffer { @@ -182,9 +180,6 @@ struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS]; - int concur_list[GGML_MAX_CONCUR]; - int concur_list_len; - bool support_simdgroup_reduction; bool support_simdgroup_mm; }; @@ -200,7 +195,6 @@ @interface GGMLMetalClass : NSObject @implementation GGMLMetalClass @end - static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) { fprintf(stderr, "%s", msg); @@ -211,11 +205,6 @@ static void ggml_metal_default_log_callback(enum ggml_log_level level, const cha ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback; void * ggml_metal_log_user_data = NULL; -void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) { - ggml_metal_log_callback = log_callback; - ggml_metal_log_user_data = user_data; -} - GGML_ATTRIBUTE_FORMAT(2, 3) static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ if (ggml_metal_log_callback != NULL) { @@ -238,7 +227,18 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ } } -struct ggml_metal_context * ggml_metal_init(int n_cb) { +static void * 
ggml_metal_host_malloc(size_t n) { + void * data = NULL; + const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); + if (result != 0) { + GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__); + return NULL; + } + + return data; +} + +static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_LOG_INFO("%s: allocating\n", __func__); id device; @@ -264,7 +264,6 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); ctx->queue = [ctx->device newCommandQueue]; ctx->n_buffers = 0; - ctx->concur_list_len = 0; ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); @@ -531,7 +530,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ return ctx; } -void ggml_metal_free(struct ggml_metal_context * ctx) { +static void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); for (int i = 0; i < ctx->n_buffers; ++i) { @@ -557,33 +556,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { free(ctx); } -void * ggml_metal_host_malloc(size_t n) { - void * data = NULL; - const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); - if (result != 0) { - GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__); - return NULL; - } - - return data; -} - -void ggml_metal_host_free(void * data) { - free(data); -} - -void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { - ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); -} - -int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { - return ctx->concur_list_len; -} - -int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) { - return ctx->concur_list; -} - // temporarily defined here for compatibility between ggml-backend and the old API struct ggml_backend_metal_buffer { @@ -656,209 +628,6 @@ int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { return nil; } -bool ggml_metal_add_buffer( - struct ggml_metal_context * ctx, - const char * name, - void * data, - size_t size, - size_t max_size) { - if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { - GGML_METAL_LOG_ERROR("%s: error: too many buffers\n", __func__); - return false; - } - - if (data) { - // verify that the buffer does not overlap with any of the existing buffers - for (int i = 0; i < ctx->n_buffers; ++i) { - const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; - - if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { - GGML_METAL_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); - return false; - } - } - - const size_t size_page = sysconf(_SC_PAGESIZE); - - size_t size_aligned = size; - if ((size_aligned % size_page) != 0) { - size_aligned += (size_page - (size_aligned % size_page)); - } - - // the buffer fits into the max buffer size allowed by the device - if (size_aligned <= ctx->device.maxBufferLength) { - ctx->buffers[ctx->n_buffers].name = name; - ctx->buffers[ctx->n_buffers].data = data; - ctx->buffers[ctx->n_buffers].size = size; - - ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; - - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0); - return false; - } - - GGML_METAL_LOG_INFO("%s: 
allocated '%-16s' buffer, size = %8.2f MiB", __func__, name, size_aligned / 1024.0 / 1024.0); - - ++ctx->n_buffers; - } else { - // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into - // one of the views - const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case - const size_t size_step = ctx->device.maxBufferLength - size_ovlp; - const size_t size_view = ctx->device.maxBufferLength; - - for (size_t i = 0; i < size; i += size_step) { - const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); - - ctx->buffers[ctx->n_buffers].name = name; - ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); - ctx->buffers[ctx->n_buffers].size = size_step_aligned; - - ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; - - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); - return false; - } - - GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); - if (i + size_step < size) { - GGML_METAL_LOG_INFO("\n"); - } - - ++ctx->n_buffers; - } - } - -#if TARGET_OS_OSX - GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", - ctx->device.currentAllocatedSize / 1024.0 / 1024.0, - ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); - - if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { - GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); - } else { - GGML_METAL_LOG_INFO("\n"); - } -#else - GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); -#endif - } - - return true; -} - -void ggml_metal_set_tensor( - struct ggml_metal_context * ctx, - struct ggml_tensor * t) { - size_t offs; - id id_dst = ggml_metal_get_buffer(ctx, t, &offs); - - memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t)); -} - -void ggml_metal_get_tensor( - struct ggml_metal_context * ctx, - struct ggml_tensor * t) { - size_t offs; - id id_src = ggml_metal_get_buffer(ctx, t, &offs); - - memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t)); -} - -void ggml_metal_graph_find_concurrency( - struct ggml_metal_context * ctx, - struct ggml_cgraph * gf, bool check_mem) { - int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time - int nodes_unused[GGML_MAX_CONCUR]; - - for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; } - for (int i = 0; i < gf->n_nodes; i++) { nodes_unused[i] = 1; } - ctx->concur_list_len = 0; - - int n_left = gf->n_nodes; - int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list - int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos - - while (n_left > 0) { - // number of nodes at a layer (that can be issued concurrently) - int concurrency = 0; - for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? 
gf->n_nodes : n_start + search_depth); i++) { - if (nodes_unused[i]) { - // if the requirements for gf->nodes[i] are satisfied - int exe_flag = 1; - - // scan all srcs - for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) { - struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind]; - if (src_cur) { - // if is leaf nodes it's satisfied. - // TODO: ggml_is_leaf() - if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) { - continue; - } - - // otherwise this src should be the output from previous nodes. - int is_found = 0; - - // scan 2*search_depth back because we inserted barrier. - //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) { - for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) { - if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) { - is_found = 1; - break; - } - } - if (is_found == 0) { - exe_flag = 0; - break; - } - } - } - if (exe_flag && check_mem) { - // check if nodes[i]'s data will be overwritten by a node before nodes[i]. - // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3] - int64_t data_start = (int64_t) gf->nodes[i]->data; - int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]); - for (int j = n_start; j < i; j++) { - if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \ - && gf->nodes[j]->op != GGML_OP_VIEW \ - && gf->nodes[j]->op != GGML_OP_TRANSPOSE \ - && gf->nodes[j]->op != GGML_OP_PERMUTE) { - if (((int64_t)gf->nodes[j]->data) >= data_start + length || \ - ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) { - continue; - } - - exe_flag = 0; - } - } - } - if (exe_flag) { - ctx->concur_list[level_pos + concurrency] = i; - nodes_unused[i] = 0; - concurrency++; - ctx->concur_list_len++; - } - } - } - n_left -= concurrency; - // adding a barrier different layer - ctx->concur_list[level_pos + concurrency] = -1; - ctx->concur_list_len++; - // jump all sorted nodes at nodes_bak - while (!nodes_unused[n_start]) { - n_start++; - } - level_pos += concurrency + 1; - } - - if (ctx->concur_list_len > GGML_MAX_CONCUR) { - GGML_METAL_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__); - } -} - static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) { switch (op->op) { case GGML_OP_UNARY: @@ -940,19 +709,15 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const } } -bool ggml_metal_graph_compute( +static bool ggml_metal_graph_compute( struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { @autoreleasepool { - // if there is ctx->concur_list, dispatch concurrently - // else fallback to serial dispatch MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor; - const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR; - - const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes; - edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial; + const int n_nodes = gf->n_nodes; + edesc.dispatchType = MTLDispatchTypeSerial; // create multiple command buffers and enqueue them // then, we encode the graph into the command buffers in parallel @@ -983,7 +748,7 @@ bool ggml_metal_graph_compute( const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes); for (int ind = node_start; ind < node_end; ++ind) { - const int i = has_concur ? 
ctx->concur_list[ind] : ind; + const int i = ind; if (i == -1) { [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers]; @@ -2823,6 +2588,11 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct /* .supports_op = */ ggml_backend_metal_supports_op, }; +void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) { + ggml_metal_log_callback = log_callback; + ggml_metal_log_user_data = user_data; +} + ggml_backend_t ggml_backend_metal_init(void) { struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); @@ -2849,7 +2619,7 @@ void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; - ggml_metal_set_n_cb(ctx, n_cb); + ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); } bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) { diff --git a/llama.cpp b/llama.cpp index 2190ea7aa92c2..66494974abb6f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1266,7 +1266,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g struct llama_state { llama_state() { #ifdef GGML_USE_METAL - ggml_metal_log_set_callback(log_callback, log_callback_user_data); + ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data); #endif } @@ -10470,7 +10470,7 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) { g_state.log_callback = log_callback ? log_callback : llama_log_callback_default; g_state.log_callback_user_data = user_data; #ifdef GGML_USE_METAL - ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); + ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); #endif } From c71d608ce7a1584bf5072f197919dd24f3a6163f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 13 Jan 2024 21:41:37 +0100 Subject: [PATCH 053/131] ggml: cache sin/cos for RoPE (#4908) --- ggml.c | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/ggml.c b/ggml.c index de6ef34bdde59..bcfb6652c1032 100644 --- a/ggml.c +++ b/ggml.c @@ -11638,6 +11638,21 @@ static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, fl return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); } +static void ggml_rope_cache_init( + float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale, + float * cache, float sin_sign, float theta_scale +) { + float theta = theta_base; + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + rope_yarn( + theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1] + ); + cache[i0 + 1] *= sin_sign; + + theta *= theta_scale; + } +} + void ggml_rope_yarn_corr_dims( int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] ) { @@ -11720,6 +11735,12 @@ static void ggml_compute_forward_rope_f32( for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = 0; i2 < ne2; i2++) { const int64_t p = pos[i2]; + + float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; + if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox + ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); + } + for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -11753,18 +11774,13 @@ static void ggml_compute_forward_rope_f32( } } else if 
(!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - float cos_theta, sin_theta; - rope_yarn( - theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta - ); - sin_theta *= sin_sign; + const float cos_theta = cache[i0 + 0]; + const float sin_theta = cache[i0 + 1]; // zeta scaling for xPos only: float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; if (xpos_down) zeta = 1.0f / zeta; - theta_base *= theta_scale; - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -11888,6 +11904,12 @@ static void ggml_compute_forward_rope_f16( for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = 0; i2 < ne2; i2++) { const int64_t p = pos[i2]; + + float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; + if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox + ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); + } + for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -11921,13 +11943,8 @@ static void ggml_compute_forward_rope_f16( } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - float cos_theta, sin_theta; - rope_yarn( - theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta - ); - sin_theta *= sin_sign; - - theta_base *= theta_scale; + const float cos_theta = cache[i0 + 0]; + const float sin_theta = cache[i0 + 1]; const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -16722,6 +16739,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa } } break; case GGML_OP_SOFT_MAX: + case GGML_OP_ROPE: { cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; } break; From 76484fbfd355df388f71d6edaa98e1692a74de7e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 14 Jan 2024 00:14:46 +0200 Subject: [PATCH 054/131] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index edcdb530a270b..753d227a76a80 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -400c07f00508e6f60fb25405444b5669c365b0a9 +1890780da4ea10db88736fcde85f285abf6c64b0 From 807179ec583dcb882f97d9704577c06beb2c5ec9 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Sun, 14 Jan 2024 09:44:30 +0200 Subject: [PATCH 055/131] Make Q3_K_S be the same as olf Q3_K_L for Mixtral-8x7B (#4906) Co-authored-by: Iwan Kawrakow --- llama.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 66494974abb6f..8e20e72a23214 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8489,9 +8489,16 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty ++qs.i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + if (qs.model.hparams.n_expert == 8) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || 
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || + ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { + new_type = GGML_TYPE_Q5_K; + } + } else { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; } From 147b17ac94a24d524e367cda26a9ff6245689f34 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Sun, 14 Jan 2024 09:45:56 +0200 Subject: [PATCH 056/131] 2-bit quantizations (#4897) * imatrix: load * imatrix: WIP * imatrix: Add Q2_K quantization * imatrix: also guard against Q2_K_S quantization without importance matrix * imatrix: guard even more against low-bit quantization misuse --------- Co-authored-by: Iwan Kawrakow --- examples/benchmark/benchmark-matmult.cpp | 4 +- examples/quantize/quantize.cpp | 133 +++- ggml-quants.c | 950 +++++++++++++++++++++-- ggml-quants.h | 12 +- ggml.c | 36 +- ggml.h | 9 +- llama.cpp | 84 +- llama.h | 1 + tests/test-backend-ops.cpp | 2 +- 9 files changed, 1149 insertions(+), 82 deletions(-) diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 434e1d6bd509e..e89f3de2fd397 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -194,7 +194,7 @@ int main(int argc, char ** argv) { // Set up a the benchmark matrices // printf("Creating new tensor q11 & Running quantize\n"); struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data()); + ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr); // Set up a the compute graph // printf("Creating new tensor q31\n"); @@ -207,7 +207,7 @@ int main(int argc, char ** argv) { // Set up a second graph computation to make sure we override the CPU cache lines // printf("Creating new tensor q12 & Running quantize\n"); struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data()); + ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr); // printf("Creating new tensor q32\n"); struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index f878f6911420a..f4e2175f18612 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -5,6 +5,10 @@ #include #include #include +#include +#include +#include +#include struct quant_option { std::string name; @@ -17,6 +21,8 @@ static const std::vector QUANT_OPTIONS = { { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", }, { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", }, { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", }, + { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", }, + { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, 
"alias for Q3_K_M" }, @@ -72,10 +78,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp // [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); + printf(" --imatrixfile_name: use data in file_name as importance matrix for quant optimizations\n"); + printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n"); + printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); + printf("Note: --include-weights and --exclude-weights cannot be used together\n"); printf("\nAllowed quantization types:\n"); for (auto & it : QUANT_OPTIONS) { if (it.name != "COPY") { @@ -83,11 +93,93 @@ static void usage(const char * executable) { } else { printf(" "); } - printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str()); + printf("%-7s : %s\n", it.name.c_str(), it.desc.c_str()); } exit(1); } +static void load_imatrix(const std::string& imatrix_file, std::unordered_map>& imatrix_data) { + std::ifstream in(imatrix_file.c_str(), std::ios::binary); + if (!in) { + printf("%s: failed to open %s\n",__func__,imatrix_file.c_str()); + return; + } + int n_entries; + in.read((char*)&n_entries, sizeof(n_entries)); + if (in.fail() || n_entries < 1) { + printf("%s: no data in file %s\n", __func__, imatrix_file.c_str()); + return; + } + for (int i = 0; i < n_entries; ++i) { + int len; in.read((char *)&len, sizeof(len)); + std::vector name_as_vec(len+1); + in.read((char *)name_as_vec.data(), len); + if (in.fail()) { + printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str()); + return; + } + name_as_vec[len] = 0; + std::string name{name_as_vec.data()}; + auto& e = imatrix_data[std::move(name)]; + int ncall; + in.read((char*)&ncall, sizeof(ncall)); + int nval; + in.read((char *)&nval, sizeof(nval)); + if (in.fail() || nval < 1) { + printf("%s: failed reading number of values for entry %d\n",__func__,i); + imatrix_data = {}; + return; + } + e.resize(nval); + in.read((char*)e.data(), nval*sizeof(float)); + if (in.fail()) { + printf("%s: failed reading data for entry %d\n",__func__,i); + imatrix_data = {}; + return; + } + if (ncall > 0) { + for (auto& v : e) v /= ncall; + } + } + printf("%s: loaded %d importance matrix entries from %s\n",__func__,int(imatrix_data.size()),imatrix_file.c_str()); +} + +static void prepare_imatrix(const std::string& imatrix_file, + const std::vector& included_weights, + const std::vector& excluded_weights, + std::unordered_map>& imatrix_data) { + if (!imatrix_file.empty()) { + load_imatrix(imatrix_file, imatrix_data); + } + if (imatrix_data.empty()) { + return; + } + if (!excluded_weights.empty()) { + for (auto& name : excluded_weights) { + for (auto it = imatrix_data.begin(); it != 
imatrix_data.end(); ) { + auto pos = it->first.find(name); + if (pos != std::string::npos) it = imatrix_data.erase(it); + else ++it; + } + } + } + if (!included_weights.empty()) { + std::unordered_map> tmp; + for (auto& name : included_weights) { + for (auto& e : imatrix_data) { + auto pos = e.first.find(name); + if (pos != std::string::npos) { + tmp.emplace(std::move(e)); + } + } + } + imatrix_data = std::move(tmp); + } + if (!imatrix_data.empty()) { + printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size())); + } +} + int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -96,6 +188,8 @@ int main(int argc, char ** argv) { llama_model_quantize_params params = llama_model_quantize_default_params(); int arg_idx = 1; + std::string imatrix_file; + std::vector included_weights, excluded_weights; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { @@ -104,15 +198,43 @@ int main(int argc, char ** argv) { params.allow_requantize = true; } else if (strcmp(argv[arg_idx], "--pure") == 0) { params.pure = true; + } else if (strcmp(argv[arg_idx], "--imatrix") == 0) { + if (arg_idx < argc-1) { + imatrix_file = argv[++arg_idx]; + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--include-weights") == 0) { + if (arg_idx < argc-1) { + included_weights.push_back(argv[++arg_idx]); + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) { + if (arg_idx < argc-1) { + excluded_weights.push_back(argv[++arg_idx]); + } else { + usage(argv[0]); + } } else { usage(argv[0]); } } if (argc - arg_idx < 2) { + printf("%s: bad arguments\n", argv[0]); + usage(argv[0]); + } + if (!included_weights.empty() && !excluded_weights.empty()) { usage(argv[0]); } + std::unordered_map> imatrix_data; + prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data); + if (!imatrix_data.empty()) { + params.imatrix = &imatrix_data; + } + llama_backend_init(false); // parse command line arguments @@ -163,6 +285,13 @@ int main(int argc, char ** argv) { } } + if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && imatrix_data.empty()) { + fprintf(stderr, "\n===============================================================================================\n"); + fprintf(stderr, "Please do not use IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); + fprintf(stderr, "===============================================================================================\n\n\n"); + return 1; + } + print_build_info(); fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str()); diff --git a/ggml-quants.c b/ggml-quants.c index 601d155d73696..9290d54cfba7a 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -5,6 +5,8 @@ #include #include #include +#include // for qsort +#include // for GGML_ASSERT #ifdef __ARM_NEON @@ -1639,6 +1641,241 @@ size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n return (n/QK_K*sizeof(block_q2_K)); } +static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights, + uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, + float rmin, float rdelta, int nstep, bool use_mad) { + float min = x[0]; + float max = x[0]; + float sum_w = weights ? 
weights[0] : x[0]*x[0]; + float sum_x = sum_w * x[0]; + for (int i = 1; i < n; ++i) { + if (x[i] < min) min = x[i]; + if (x[i] > max) max = x[i]; + float w = weights ? weights[i] : x[i]*x[i]; + sum_w += w; + sum_x += w * x[i]; + } + if (min > 0) { + min = 0; + } + if (max <= min) { + for (int i = 0; i < n; ++i) L[i] = 0; + *the_min = -min; + return 0.f; + } + float iscale = nmax/(max - min); + float scale = 1/iscale; + float best_mad = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + L[i] = MAX(0, MIN(nmax, l)); + float diff = scale * L[i] + min - x[i]; + diff = use_mad ? fabsf(diff) : diff*diff; + float w = weights ? weights[i] : x[i]*x[i]; + best_mad += w * diff; + } + if (nstep < 1) { + *the_min = -min; + return scale; + } + for (int is = 0; is <= nstep; ++is) { + iscale = (rmin + rdelta*is + nmax)/(max - min); + float sum_l = 0, sum_l2 = 0, sum_xl = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + l = MAX(0, MIN(nmax, l)); + Laux[i] = l; + float w = weights ? weights[i] : x[i]*x[i]; + sum_l += w*l; + sum_l2 += w*l*l; + sum_xl += w*l*x[i]; + } + float D = sum_w * sum_l2 - sum_l * sum_l; + if (D > 0) { + float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D; + float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D; + if (this_min > 0) { + this_min = 0; + this_scale = sum_xl / sum_l2; + } + float mad = 0; + for (int i = 0; i < n; ++i) { + float diff = this_scale * Laux[i] + this_min - x[i]; + diff = use_mad ? fabsf(diff) : diff*diff; + float w = weights ? weights[i] : x[i]*x[i]; + mad += w * diff; + } + if (mad < best_mad) { + for (int i = 0; i < n; ++i) { + L[i] = Laux[i]; + } + best_mad = mad; + scale = this_scale; + min = this_min; + } + } + } + *the_min = -min; + return scale; +} + +static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) { + float max = 0; + for (int i = 0; i < n; ++i) { + max = MAX(max, x[i]); + } + if (!max) { // all zero + for (int i = 0; i < n; ++i) { L[i] = 0; } + return 0.f; + } + float iscale = nmax / max; + for (int i = 0; i < n; ++i) { + L[i] = nearest_int(iscale * x[i]); + } + float scale = 1/iscale; + float best_mse = 0; + for (int i = 0; i < n; ++i) { + float diff = x[i] - scale*L[i]; + float w = quant_weights[i]; + best_mse += w*diff*diff; + } + for (int is = -4; is <= 4; ++is) { + if (is == 0) continue; + float iscale_is = (0.1f*is + nmax)/max; + float scale_is = 1/iscale_is; + float mse = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale_is*x[i]); + l = MIN(nmax, l); + float diff = x[i] - scale_is*l; + float w = quant_weights[i]; + mse += w*diff*diff; + } + if (mse < best_mse) { + best_mse = mse; + iscale = iscale_is; + } + } + float sumlx = 0; + float suml2 = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + l = MIN(nmax, l); + L[i] = l; + float w = quant_weights[i]; + sumlx += w*x[i]*l; + suml2 += w*l*l; + } + for (int itry = 0; itry < 5; ++itry) { + int n_changed = 0; + for (int i = 0; i < n; ++i) { + float w = quant_weights[i]; + float slx = sumlx - w*x[i]*L[i]; + float sl2 = suml2 - w*L[i]*L[i]; + if (slx > 0 && sl2 > 0) { + int new_l = nearest_int(x[i] * sl2 / slx); + new_l = MIN(nmax, new_l); + if (new_l != L[i]) { + slx += w*x[i]*new_l; + sl2 += w*new_l*new_l; + if (slx*slx*suml2 > sumlx*sumlx*sl2) { + L[i] = new_l; sumlx = slx; suml2 = sl2; + ++n_changed; + } + } + } + } + if (!n_changed) { + break; + } + } + return sumlx / suml2; +} + +static void quantize_row_q2_K_impl(const 
float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) { + GGML_ASSERT(quant_weights); + assert(k % QK_K == 0); + const int nb = k / QK_K; + const bool requantize = true; + + uint8_t L[QK_K]; + uint8_t Laux[16]; + float mins[QK_K/16]; + float scales[QK_K/16]; + float sw[QK_K/16]; + float weight[QK_K/16]; + uint8_t Ls[QK_K/16], Lm[QK_K/16]; + + for (int i = 0; i < nb; i++) { + memset(sw, 0, QK_K/16*sizeof(float)); + float sumx2 = 0; + for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j]; + float sigma2 = sumx2/QK_K; + for (int j = 0; j < QK_K/16; ++j) { + const float * restrict qw = quant_weights + QK_K * i + 16*j; + for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]); + for (int l = 0; l < 16; ++l) sw[j] += weight[l]; + scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); + } + + float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw); + float mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw); + y[i].d = GGML_FP32_TO_FP16(dm); + y[i].dmin = GGML_FP32_TO_FP16(mm); + dm = GGML_FP16_TO_FP32(y[i].d); + mm = GGML_FP16_TO_FP32(y[i].dmin); + + for (int j = 0; j < QK_K/16; ++j) { + y[i].scales[j] = Ls[j] | (Lm[j] << 4); + } + + if (requantize) { + for (int j = 0; j < QK_K/16; ++j) { + const float d = dm * (y[i].scales[j] & 0xF); + if (!d) continue; + const float m = mm * (y[i].scales[j] >> 4); + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int((x[16*j + ii] + m)/d); + l = MAX(0, MIN(3, l)); + L[16*j + ii] = l; + } + } + } + +#if QK_K == 256 + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); + } + } +#else + for (int l = 0; l < 16; ++l) { + y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); + } +#endif + + x += QK_K; + + } +} + +size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + int row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row); + if (!quant_weights) { + quantize_row_q2_K_reference(src, dst, nrow*n_per_row); + } + else { + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } + } + return nrow * row_size; +} + //========================= 3-bit (de)-quantization void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) { @@ -2584,14 +2821,6 @@ static const uint8_t ksigns_iq2xs[128] = { static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128}; -void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) { - (void)x; - (void)y; - (void)k; - assert(k % QK_K == 0); - //fprintf(stderr, "=========================== %s: not implemented\n", __func__); -} - void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2618,33 +2847,8 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y } } -void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int k) { - assert(k % QK_K == 0); - block_iq2_xxs * restrict y = vy; - quantize_row_iq2_xxs_reference(x, y, k); -} - -size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK_K == 0); - (void)hist; // TODO: collect 
histograms - - for (int j = 0; j < n; j += k) { - block_iq2_xxs * restrict y = (block_iq2_xxs *)dst + j/QK_K; - quantize_row_iq2_xxs_reference(src + j, y, k); - } - return (n/QK_K*sizeof(block_iq2_xxs)); -} - // ====================== 2.3125 bpw (de)-quantization -void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) { - (void)x; - (void)y; - (void)k; - assert(k % QK_K == 0); - //fprintf(stderr, "=========================== %s: not implemented\n", __func__); -} - void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2670,23 +2874,6 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, } } -void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) { - assert(k % QK_K == 0); - block_iq2_xs * restrict y = vy; - quantize_row_iq2_xs_reference(x, y, k); -} - -size_t ggml_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK_K == 0); - (void)hist; // TODO: collect histograms - - for (int j = 0; j < n; j += k) { - block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K; - quantize_row_iq2_xs_reference(src + j, y, k); - } - return (n/QK_K*sizeof(block_iq2_xs)); -} - //===================================== Q8_K ============================================== void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { @@ -7730,3 +7917,666 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest *s = 0.125f * sumf; #endif } + +// ================================ IQ2 quantization ============================================= + +typedef struct { + uint64_t * grid; + int * map; + uint16_t * neighbours; +} iq2_entry_t; + +static iq2_entry_t iq2_data[2] = { + {NULL, NULL, NULL}, + {NULL, NULL, NULL}, +}; + +static inline int iq2_data_index(int grid_size) { + GGML_ASSERT(grid_size == 256 || grid_size == 512); + return grid_size == 256 ? 0 : 1; +} + +static int iq2_compare_func(const void * left, const void * right) { + const int * l = (const int *)left; + const int * r = (const int *)right; + return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 
1 : 0; +} + +static void q2xs_init_impl(int grid_size) { + const int gindex = iq2_data_index(grid_size); + if (iq2_data[gindex].grid) { + return; + } + static const uint16_t kgrid_256[256] = { + 0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97, + 100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642, + 1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288, + 1312, 1350, 1385, 1408, 1425, 1545, 1552, 1600, 1668, 1700, 2048, 2053, 2056, 2068, 2088, 2113, + 2116, 2128, 2130, 2184, 2308, 2368, 2562, 2580, 4097, 4100, 4112, 4129, 4160, 4192, 4228, 4240, + 4245, 4352, 4360, 4384, 4432, 4442, 4480, 4644, 4677, 5120, 5128, 5152, 5157, 5193, 5248, 5400, + 5474, 5632, 5654, 6145, 6148, 6160, 6208, 6273, 6400, 6405, 6560, 6737, 8192, 8194, 8202, 8260, + 8289, 8320, 8322, 8489, 8520, 8704, 8706, 9217, 9220, 9232, 9280, 9302, 9472, 9537, 9572, 9872, + 10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516, + 16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561, + 17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488, + 20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545, + 22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874, + 25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856, + 33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142, + 37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268, + }; + static const uint16_t kgrid_512[512] = { + 0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70, + 73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257, + 260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340, + 352, 360, 385, 388, 400, 512, 514, 517, 520, 529, 532, 544, 577, 580, 592, 597, + 640, 650, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1088, 1090, 1093, 1096, + 1105, 1108, 1110, 1120, 1153, 1156, 1168, 1280, 1282, 1285, 1288, 1297, 1300, 1312, 1345, 1348, + 1360, 1377, 1408, 1537, 1540, 1552, 1574, 1600, 1602, 1668, 2048, 2050, 2053, 2056, 2058, 2065, + 2068, 2080, 2085, 2113, 2116, 2128, 2136, 2176, 2208, 2218, 2305, 2308, 2320, 2368, 2433, 2441, + 2560, 2592, 2600, 2710, 2720, 4097, 4100, 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4160, + 4162, 4165, 4168, 4177, 4180, 4192, 4202, 4225, 4228, 4240, 4352, 4354, 4357, 4360, 4369, 4372, + 4384, 4417, 4420, 4432, 4480, 4500, 4502, 4609, 4612, 4614, 4624, 4672, 4704, 5120, 5122, 5125, + 5128, 5137, 5140, 5152, 5185, 5188, 5193, 5200, 5220, 5248, 5377, 5380, 5392, 5440, 5632, 5652, + 5705, 6145, 6148, 6160, 6162, 6208, 6228, 6278, 6400, 6405, 6502, 6737, 6825, 8192, 8194, 8197, + 8200, 8202, 8209, 8212, 8224, 8257, 8260, 8272, 8320, 8352, 8449, 8452, 8464, 8512, 8520, 8549, + 8704, 8738, 8832, 8872, 9217, 9220, 9232, 9257, 9280, 9472, 9537, 9554, 9625, 9729, 9754, 9894, + 10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388, + 16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480, + 16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 
16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773, + 16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473, + 17476, 17488, 17536, 17556, 17665, 17668, 17680, 17700, 17728, 17818, 17920, 17930, 17988, 18000, 18433, 18436, + 18448, 18496, 18501, 18516, 18530, 18688, 18705, 18756, 18768, 18793, 18948, 20480, 20482, 20485, 20488, 20497, + 20500, 20512, 20520, 20545, 20548, 20560, 20608, 20737, 20740, 20752, 20757, 20800, 20802, 20992, 21060, 21162, + 21505, 21508, 21520, 21537, 21568, 21600, 21633, 21665, 21760, 21768, 21888, 21896, 22049, 22120, 22177, 22528, + 22548, 22593, 22608, 22681, 22810, 22848, 22850, 23173, 24577, 24580, 24592, 24640, 24660, 24674, 24710, 24745, + 24832, 25124, 25162, 25234, 25600, 25622, 25872, 25920, 25925, 26020, 26625, 26730, 26917, 27142, 27220, 27234, + 32768, 32770, 32773, 32776, 32785, 32788, 32800, 32810, 32833, 32836, 32848, 32896, 32898, 32936, 32938, 33025, + 33028, 33030, 33040, 33088, 33105, 33113, 33280, 33312, 33408, 33410, 33440, 33448, 33793, 33796, 33808, 33810, + 33813, 33856, 33888, 33929, 34048, 34116, 34213, 34328, 34410, 34816, 34824, 34853, 34906, 34944, 34946, 34984, + 35078, 35362, 35456, 35464, 35478, 35496, 36865, 36868, 36880, 36928, 36950, 36996, 37120, 37154, 37220, 37462, + 37513, 37888, 37893, 37956, 37968, 37976, 38185, 38288, 38290, 38465, 38993, 39078, 39241, 39445, 39520, 40960, + 40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048, + 42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690, + }; + const int kmap_size = 43692; + const int nwant = 2; + const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512; + uint64_t * kgrid_q2xs; + int * kmap_q2xs; + uint16_t * kneighbors_q2xs; + + printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size); + uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t)); + for (int k = 0; k < grid_size; ++k) { + int8_t * pos = (int8_t *)(the_grid + k); + for (int i = 0; i < 8; ++i) { + int l = (kgrid[k] >> 2*i) & 0x3; + pos[i] = 2*l + 1; + } + } + kgrid_q2xs = the_grid; + iq2_data[gindex].grid = the_grid; + kmap_q2xs = (int *)malloc(kmap_size*sizeof(int)); + iq2_data[gindex].map = kmap_q2xs; + for (int i = 0; i < kmap_size; ++i) kmap_q2xs[i] = -1; + uint64_t aux64; + uint8_t * aux8 = (uint8_t *)&aux64; + for (int i = 0; i < grid_size; ++i) { + aux64 = kgrid_q2xs[i]; + uint16_t index = 0; + for (int k=0; k<8; ++k) { + uint16_t q = (aux8[k] - 1)/2; + index |= (q << 2*k); + } + kmap_q2xs[index] = i; + } + int8_t pos[8]; + int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + int num_neighbors = 0, num_not_in_map = 0; + for (int i = 0; i < kmap_size; ++i) { + if (kmap_q2xs[i] >= 0) continue; + ++num_not_in_map; + for (int k = 0; k < 8; ++k) { + int l = (i >> 2*k) & 0x3; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q2xs + j); + int d2 = 0; + for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func); + int n = 0; int d2 = dist2[0]; + int nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + ++n; + } + num_neighbors += n; + } + printf("%s: %d neighbours in total\n", __func__, num_neighbors); 
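The setup code above builds, for every point that is missing from the reduced codebook, a list of its nearest codebook entries by sorting (squared distance, index) pairs with qsort. A standalone miniature of that search over a made-up four-entry grid; only the sort-on-pairs pattern mirrors the real code:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    // compare (d2, index) pairs by distance, like iq2_compare_func above
    // (without its index tie-break)
    static int cmp_d2(const void * a, const void * b) {
        const int * l = (const int *) a;
        const int * r = (const int *) b;
        return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : 0;
    }

    int main(void) {
        const int8_t grid[4][8] = { {1,1,1,1,1,1,1,1}, {3,1,1,1,1,1,1,1},
                                    {1,3,3,1,1,1,1,1}, {3,3,3,3,3,3,3,3} };
        const int8_t pos[8]     =   {3,1,3,1,1,1,1,1}; // a point not on the grid

        int dist2[4][2];
        for (int j = 0; j < 4; ++j) {
            int d2 = 0;
            for (int k = 0; k < 8; ++k) d2 += (grid[j][k] - pos[k])*(grid[j][k] - pos[k]);
            dist2[j][0] = d2; // sort key: squared distance to the grid entry
            dist2[j][1] = j;  // payload: index of the grid entry
        }
        qsort(dist2, 4, sizeof(dist2[0]), cmp_d2);

        printf("nearest entries: %d (d2=%d), then %d (d2=%d)\n",
               dist2[0][1], dist2[0][0], dist2[1][1], dist2[1][0]);
        return 0;
    }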
+ kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t)); + iq2_data[gindex].neighbours = kneighbors_q2xs; + int counter = 0; + for (int i = 0; i < kmap_size; ++i) { + if (kmap_q2xs[i] >= 0) continue; + for (int k = 0; k < 8; ++k) { + int l = (i >> 2*k) & 0x3; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q2xs + j); + int d2 = 0; + for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func); + kmap_q2xs[i] = -(counter + 1); + int d2 = dist2[0]; + uint16_t * start = &kneighbors_q2xs[counter++]; + int n = 0, nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + kneighbors_q2xs[counter++] = dist2[2*j+1]; + ++n; + } + *start = n; + } + free(dist2); +} + +void ggml_init_iq2_quantization(enum ggml_type type) { + if (type == GGML_TYPE_IQ2_XXS) { + q2xs_init_impl(256); + } + else if (type == GGML_TYPE_IQ2_XS) { + q2xs_init_impl(512); + } + else { + fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type); + } +} + +static void q2xs_deinit_impl(int grid_size) { + GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024); + const int gindex = iq2_data_index(grid_size); + if (iq2_data[gindex].grid) { + free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL; + free(iq2_data[gindex].map); iq2_data[gindex].map = NULL; + free(iq2_data[gindex].neighbours); iq2_data[gindex].neighbours = NULL; + } +} + +void ggml_deinit_iq2_quantization(enum ggml_type type) { + if (type == GGML_TYPE_IQ2_XXS) { + q2xs_deinit_impl(256); + } + else if (type == GGML_TYPE_IQ2_XS) { + q2xs_deinit_impl(512); + } + else { + fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type); + } +} + +static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid, + const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) { + int num_neighbors = neighbours[0]; + GGML_ASSERT(num_neighbors > 0); + float best_d2 = FLT_MAX; + int grid_index = -1; + for (int j = 1; j <= num_neighbors; ++j) { + const int8_t * pg = (const int8_t *)(grid + neighbours[j]); + float d2 = 0; + for (int i = 0; i < 8; ++i) { + float q = pg[i]; + float diff = scale*q - xval[i]; + d2 += weight[i]*diff*diff; + } + if (d2 < best_d2) { + best_d2 = d2; grid_index = neighbours[j]; + } + } + GGML_ASSERT(grid_index >= 0); + const int8_t * pg = (const int8_t *)(grid + grid_index); + for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2; + return grid_index; +} + +static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { + + const int gindex = iq2_data_index(256); + + const uint64_t * kgrid_q2xs = iq2_data[gindex].grid; + const int * kmap_q2xs = iq2_data[gindex].map; + const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours; + + GGML_ASSERT(quant_weights); + GGML_ASSERT(kgrid_q2xs); + GGML_ASSERT(kmap_q2xs); + GGML_ASSERT(kneighbors_q2xs); + GGML_ASSERT(n%QK_K == 0); + + const int kMaxQ = 3; + + const int nbl = n/256; + + block_iq2_xxs * y = vy; + + float scales[QK_K/32]; + float weight[32]; + float xval[32]; + int8_t L[32]; + int8_t Laux[32]; + float waux[32]; + bool is_on_grid[4]; + bool is_on_grid_aux[4]; + uint8_t block_signs[4]; + 
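    // Working buffers for one superblock: scales[] holds the per-32-sample sub-block
    // scales, weight[]/waux[] the importance weights and their square roots, xval[]
    // the magnitudes after the signs of each group of 8 values are folded into
    // block_signs[], L[]/Laux[] the accepted and candidate quant indices, and
    // is_on_grid[]/is_on_grid_aux[] track which groups of 8 landed exactly on a
    // codebook point. q2[] (declared next) accumulates the packed grid indices,
    // signs and scales before being copied into y[ibl].qs.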
uint32_t q2[2*(QK_K/32)]; + + for (int ibl = 0; ibl < nbl; ++ibl) { + + y[ibl].d = GGML_FP32_TO_FP16(0.f); + memset(q2, 0, QK_K/4); + + float max_scale = 0; + + const float * xbl = x + QK_K*ibl; + float sumx2 = 0; + for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; + float sigma2 = sumx2/QK_K; + + for (int ib = 0; ib < QK_K/32; ++ib) { + const float * xb = xbl + 32*ib; + const float * qw = quant_weights + QK_K*ibl + 32*ib; + for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]); + for (int k = 0; k < 4; ++k) { + int nflip = 0; + uint8_t s = 0; + for (int i = 0; i < 8; ++i) { + if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i]; + else { + xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i); + } + } + if (nflip%2) { + int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin]; + for (int i = 1; i < 8; ++i) { + float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i]; + if (ax < min) { + min = ax; imin = i; + } + } + xval[8*k+imin] = -xval[8*k+imin]; + s ^= (1 << imin); + } + block_signs[k] = s & 127; + } + float max = xval[0]; + for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]); + if (!max) { + scales[ib] = 0; + memset(L, 0, 32); + continue; + } + float best = 0; + float scale = max/(2*kMaxQ-1); + for (int is = -9; is <= 9; ++is) { + float id = (2*kMaxQ-1+is*0.1f)/max; + float this_scale = 1/id; + for (int k = 0; k < 4; ++k) { + for (int i = 0; i < 8; ++i) { + int l = nearest_int(0.5f*(id*xval[8*k+i]-1)); + Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l)); + } + uint16_t u = 0; + for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i); + int grid_index = kmap_q2xs[u]; + is_on_grid_aux[k] = true; + if (grid_index < 0) { + is_on_grid_aux[k] = false; + const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; + grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k); + } + } + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 32; ++i) { + float w = weight[i]; + float q = 2*Laux[i] + 1; + sumqx += w*xval[i]*q; + sumq2 += w*q*q; + } + if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { + scale = sumqx/sumq2; best = scale*sumqx; + for (int i = 0; i < 32; ++i) L[i] = Laux[i]; + for (int k = 0; k < 4; ++k) is_on_grid[k] = is_on_grid_aux[k]; + } + } + int n_not_ongrid = 0; + for (int k = 0; k < 4; ++k) if (!is_on_grid[k]) ++n_not_ongrid; + if (n_not_ongrid > 0 && scale > 0) { + float id = 1/scale; + for (int k = 0; k < 4; ++k) { + if (is_on_grid[k]) continue; + uint16_t u = 0; + for (int i = 0; i < 8; ++i) { + int l = nearest_int(0.5f*(id*xval[8*k+i]-1)); + l = MAX(0, MIN(kMaxQ-1, l)); + u |= (l << 2*i); + } + int grid_index = kmap_q2xs[u]; + if (grid_index < 0) { + const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; + grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k); + } + const int8_t * pg = (const int8_t *)(kgrid_q2xs + grid_index); + for (int i = 0; i < 8; ++i) L[8*k+i] = (pg[i] - 1)/2; + } + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 32; ++i) { + float w = weight[i]; + float q = 2*L[i] + 1; + sumqx += w*xval[i]*q; + sumq2 += w*q*q; + } + if (sumq2 > 0) scale = sumqx/sumq2; + } + if (scale < 0) { + // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale) + // and correspondingly flip quant signs. 
+ scale = -scale; + for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127; + } + for (int k = 0; k < 4; ++k) { + uint16_t u = 0; + for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i); + int grid_index = kmap_q2xs[u]; + if (grid_index < 0) { + printf("Oops: found point %u not on grid:", u); + for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]); + printf("\n"); + GGML_ASSERT(false); + } + q2[2*ib+0] |= (grid_index << 8*k); + q2[2*ib+1] |= (block_signs[k] << 7*k); + } + GGML_ASSERT(scale >= 0); + scales[ib] = scale; + max_scale = MAX(max_scale, scale); + } + + if (!max_scale) { + memset(y[ibl].qs, 0, QK_K/4); + continue; + } + + float d = max_scale/31; + y[ibl].d = GGML_FP32_TO_FP16(d); + float id = 1/d; + float sumqx = 0, sumq2 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + int l = nearest_int(0.5f*(id*scales[ib]-1)); + l = MAX(0, MIN(15, l)); + q2[2*ib+1] |= ((uint32_t)l << 28); + const float * xb = xbl + 32*ib; + const float * qw = quant_weights + QK_K*ibl + 32*ib; + for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + const uint8_t * aux8 = (const uint8_t *)(q2 + 2*ib); + const float db = d * (1 + 2*l); + uint32_t u = 0; + for (int k = 0; k < 4; ++k) { + const int8_t * signs = keven_signs_q2xs + 8*((q2[2*ib+1] >> 7*k) & 127); + const float * xk = xb + 8*k; + const float * wk = weight + 8*k; + const uint8_t * grid = (const uint8_t *)(kgrid_q2xs + aux8[k]); + float best_mse = 0; int best_index = aux8[k]; + for (int j = 0; j < 8; ++j) { + float diff = db * grid[j] * signs[j] - xk[j]; + best_mse += wk[j] * diff * diff; + } + for (int idx = 0; idx < 256; ++idx) { + grid = (const uint8_t *)(kgrid_q2xs + idx); + float mse = 0; + for (int j = 0; j < 8; ++j) { + float diff = db * grid[j] * signs[j] - xk[j]; + mse += wk[j] * diff * diff; + } + if (mse < best_mse) { + best_mse = mse; best_index = idx; + } + } + u |= (best_index << 8*k); + grid = (const uint8_t *)(kgrid_q2xs + best_index); + //grid = (const uint8_t *)(kgrid_q2xs + aux8[k]); + for (int j = 0; j < 8; ++j) { + float q = db * grid[j] * signs[j]; + sumqx += wk[j] * q * xk[j]; + sumq2 += wk[j] * q * q; + } + } + q2[2*ib] = u; + if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2); + } + memcpy(y[ibl].qs, q2, QK_K/4); + } +} + +static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { + + const int gindex = iq2_data_index(512); + + const uint64_t * kgrid_q2xs = iq2_data[gindex].grid; + const int * kmap_q2xs = iq2_data[gindex].map; + const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours; + + GGML_ASSERT(quant_weights); + GGML_ASSERT(kmap_q2xs); + GGML_ASSERT(kgrid_q2xs); + GGML_ASSERT(kneighbors_q2xs); + GGML_ASSERT(n%QK_K == 0); + + const int kMaxQ = 3; + + const int nbl = n/256; + + block_iq2_xs * y = vy; + + float scales[QK_K/16]; + float weight[16]; + float xval[16]; + int8_t L[16]; + int8_t Laux[16]; + float waux[16]; + bool is_on_grid[2]; + bool is_on_grid_aux[2]; + uint8_t block_signs[2]; + uint16_t q2[2*(QK_K/16)]; + + for (int ibl = 0; ibl < nbl; ++ibl) { + + y[ibl].d = GGML_FP32_TO_FP16(0.f); + memset(q2, 0, QK_K/4); + memset(y[ibl].scales, 0, QK_K/32); + + float max_scale = 0; + + const float * xbl = x + QK_K*ibl; + float sumx2 = 0; + for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; + float sigma2 = sumx2/QK_K; + + for (int ib = 0; ib < QK_K/16; ++ib) { + const float * xb = xbl + 16*ib; + const float * qw = quant_weights + QK_K*ibl + 16*ib; + for (int i = 0; i < 16; ++i) weight[i] = qw[i] * 
sqrtf(sigma2 + xb[i]*xb[i]); + for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]); + for (int k = 0; k < 2; ++k) { + int nflip = 0; + uint8_t s = 0; + for (int i = 0; i < 8; ++i) { + if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i]; + else { + xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i); + } + } + if (nflip%2) { + int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin]; + for (int i = 1; i < 8; ++i) { + float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i]; + if (ax < min) { + min = ax; imin = i; + } + } + xval[8*k+imin] = -xval[8*k+imin]; + s ^= (1 << imin); + } + block_signs[k] = s & 127; + } + float max = xval[0]; + for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]); + if (!max) { + scales[ib] = 0; + memset(L, 0, 16); + continue; + } + float best = 0; + float scale = max/(2*kMaxQ-1); + is_on_grid[0] = is_on_grid[1] = true; + for (int is = -9; is <= 9; ++is) { + float id = (2*kMaxQ-1+is*0.1f)/max; + float this_scale = 1/id; + for (int k = 0; k < 2; ++k) { + for (int i = 0; i < 8; ++i) { + int l = nearest_int(0.5f*(id*xval[8*k+i]-1)); + Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l)); + } + uint16_t u = 0; + for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i); + int grid_index = kmap_q2xs[u]; + is_on_grid_aux[k] = true; + if (grid_index < 0) { + is_on_grid_aux[k] = false; + const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; + grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k); + } + } + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 16; ++i) { + float w = weight[i]; + float q = 2*Laux[i] + 1; + sumqx += w*xval[i]*q; + sumq2 += w*q*q; + } + if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { + scale = sumqx/sumq2; best = scale*sumqx; + for (int i = 0; i < 16; ++i) L[i] = Laux[i]; + for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k]; + } + } + int n_not_ongrid = 0; + for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid; + if (n_not_ongrid > 0 && scale > 0) { + float id = 1/scale; + for (int k = 0; k < 2; ++k) { + if (is_on_grid[k]) continue; + uint16_t u = 0; + for (int i = 0; i < 8; ++i) { + int l = nearest_int(0.5f*(id*xval[8*k+i]-1)); + l = MAX(0, MIN(kMaxQ-1, l)); + u |= (l << 2*i); + L[8*k + i] = l; + } + int grid_index = kmap_q2xs[u]; + if (grid_index < 0) { + const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; + grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k); + } + } + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 16; ++i) { + float w = weight[i]; + float q = 2*L[i] + 1; + sumqx += w*xval[i]*q; + sumq2 += w*q*q; + } + if (sumq2 > 0) scale = sumqx/sumq2; + } + if (scale < 0) { + scale = -scale; + for (int k = 0; k < 2; ++k) block_signs[k] = (~block_signs[k]) & 127; + } + for (int k = 0; k < 2; ++k) { + uint16_t u = 0; + for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i); + int grid_index = kmap_q2xs[u]; + if (grid_index < 0) { + printf("Oops: found point %u not on grid:", u); + for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]); + printf("\n"); + GGML_ASSERT(false); + } + q2[2*ib+k] = grid_index | (block_signs[k] << 9); + } + GGML_ASSERT(scale >= 0); + scales[ib] = scale; + max_scale = MAX(max_scale, scale); + } + + if (!max_scale) { + memset(y[ibl].qs, 0, QK_K/4); + continue; + } + + float d = max_scale/31; + y[ibl].d = GGML_FP32_TO_FP16(d); + float id = 1/d; + for (int ib = 0; ib < QK_K/16; ++ib) { + int l = nearest_int(0.5f*(id*scales[ib]-1)); + l = MAX(0, MIN(15, l)); + if (ib%2 == 0) y[ibl].scales[ib/2] = 
l; + else y[ibl].scales[ib/2] |= (l << 4); + } + memcpy(y[ibl].qs, q2, QK_K/4); + + } +} + +size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + GGML_ASSERT(n_per_row%QK_K == 0); + int nblock = n_per_row/QK_K; + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += nblock*sizeof(block_iq2_xxs); + } + return nrow * nblock * sizeof(block_iq2_xxs); +} + +size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + GGML_ASSERT(n_per_row%QK_K == 0); + int nblock = n_per_row/QK_K; + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += nblock*sizeof(block_iq2_xs); + } + return nrow * nblock * sizeof(block_iq2_xs); +} + diff --git a/ggml-quants.h b/ggml-quants.h index df5e7ae807f5f..e5d1102304ba5 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -196,8 +196,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); -void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k); -void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs * restrict y, int k); void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); @@ -212,8 +210,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); -void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k); -void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k); // Dequantization void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); @@ -246,3 +242,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy); + +// +// Quantization utilizing an importance matrix (a.k.a. 
"Activation aWare Quantization") +// +size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); + diff --git a/ggml.c b/ggml.c index bcfb6652c1032..52467475a1f22 100644 --- a/ggml.c +++ b/ggml.c @@ -585,8 +585,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .type_size = sizeof(block_iq2_xxs), .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs, - .from_float = quantize_row_iq2_xxs, - .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xxs_reference, + .from_float = NULL, + .from_float_reference = NULL, .vec_dot = ggml_vec_dot_iq2_xxs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, @@ -596,8 +596,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .type_size = sizeof(block_iq2_xs), .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq2_xs, - .from_float = quantize_row_iq2_xs, - .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xs_reference, + .from_float = NULL, + .from_float_reference = NULL, .vec_dot = ggml_vec_dot_iq2_xs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, @@ -18665,8 +18665,11 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * return (n/QK8_0*sizeof(block_q8_0)); } -size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) { +size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, + int nrows, int n_per_row, int64_t * hist, const float * imatrix) { + (void)imatrix; size_t result = 0; + int n = nrows * n_per_row; switch (type) { case GGML_TYPE_Q4_0: { @@ -18701,8 +18704,11 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i case GGML_TYPE_Q2_K: { GGML_ASSERT(start % QK_K == 0); - block_q2_K * block = (block_q2_K*)dst + start / QK_K; - result = ggml_quantize_q2_K(src + start, block, n, n, hist); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); } break; case GGML_TYPE_Q3_K: { @@ -18731,14 +18737,22 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i case GGML_TYPE_IQ2_XXS: { GGML_ASSERT(start % QK_K == 0); - block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K; - result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist); + GGML_ASSERT(start % n_per_row == 0); + GGML_ASSERT(imatrix); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); } break; case GGML_TYPE_IQ2_XS: { GGML_ASSERT(start % QK_K == 0); - block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K; - result = ggml_quantize_iq2_xs(src + start, block, n, n, hist); + GGML_ASSERT(start % n_per_row == 0); + GGML_ASSERT(imatrix); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + 
GGML_ASSERT(result == row_size * nrows); } break; case GGML_TYPE_F16: { diff --git a/ggml.h b/ggml.h index b18ba78120ca6..1187074f7f174 100644 --- a/ggml.h +++ b/ggml.h @@ -2067,10 +2067,13 @@ extern "C" { GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); + GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, + int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix); + + // These are needed for IQ2_XS and IQ2_XXS quantizations + GGML_API void ggml_init_iq2_quantization(enum ggml_type type); + GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type); // // Importance matrix diff --git a/llama.cpp b/llama.cpp index 8e20e72a23214..107b051147d3f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8429,9 +8429,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = GGML_TYPE_Q8_0; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + new_type = GGML_TYPE_Q5_K; + } else if (new_type != GGML_TYPE_Q8_0) { new_type = GGML_TYPE_Q6_K; } + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + if (name.find("attn_v.weight") != std::string::npos) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; + else new_type = GGML_TYPE_Q2_K; + ++qs.i_attention_wv; + } + else if (name.find("ffn_down") != std::string::npos) { + if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K; + ++qs.i_feed_forward_w2; + } + else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K; } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { @@ -8601,6 +8615,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (params->only_copy) { ftype = model.ftype; } + const std::unordered_map> * imatrix_data = nullptr; + if (params->imatrix) { + imatrix_data = static_cast>*>(params->imatrix); + if (imatrix_data) { + printf("================================ Have weights data with %d entries\n",int(imatrix_data->size())); + } + } const size_t align = GGUF_DEFAULT_ALIGNMENT; struct gguf_context * ctx_out = gguf_init_empty(); @@ -8658,6 +8679,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // placeholder for the meta data ::zeros(fout, meta_size); + std::set used_iq2; + for (int i = 0; i < ml.n_tensors; ++i) { struct ggml_tensor * tensor = ml.get_tensor_meta(i); @@ -8710,6 +8733,35 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else { const size_t nelements = ggml_nelements(tensor); + if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) { + ggml_init_iq2_quantization(new_type); + used_iq2.insert(new_type); + } + 
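            // The IQ2_XXS / IQ2_XS codebooks and neighbour tables are built lazily:
            // ggml_init_iq2_quantization() runs once per type the first time that type
            // is actually used (tracked in used_iq2), and every initialized type is
            // released again with ggml_deinit_iq2_quantization() after the output file
            // is closed further below.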
+ const float * imatrix = nullptr; + if (imatrix_data) { + auto it = imatrix_data->find(tensor->name); + if (it == imatrix_data->end()) { + printf("\n====== %s: did not find weights for %s\n", __func__, tensor->name); + } else { + if (it->second.size() == (size_t)tensor->ne[0]) { + imatrix = it->second.data(); + } else { + printf("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__, + int(it->second.size()), int(tensor->ne[0]), tensor->name); + } + } + } + if ((new_type == GGML_TYPE_IQ2_XXS || + new_type == GGML_TYPE_IQ2_XS || + (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) { + fprintf(stderr, "\n\n============================================================\n"); + fprintf(stderr, "Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); + fprintf(stderr, "The result will be garbage, so bailing out\n"); + fprintf(stderr, "============================================================\n\n"); + throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name)); + } + float * f32_data; if (tensor->type == GGML_TYPE_F32) { @@ -8730,21 +8782,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_data = work.data(); std::array hist_cur = {}; - static const int chunk_size = 32 * 512; + const int n_per_row = tensor->ne[0]; + const int nrows = nelements / n_per_row; + + static const int min_chunk_size = 32 * 512; + const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row); + const int nchunk = (nelements + chunk_size - 1)/chunk_size; const int nthread_use = nthread > 1 ? 
std::max(1, std::min(nthread, nchunk)) : 1; if (nthread_use < 2) { - new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data()); + new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix); } else { - size_t counter = 0; + int counter = 0; new_size = 0; - auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() { + auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size, + nrows, n_per_row, imatrix]() { std::array local_hist = {}; + const int nrows_per_chunk = chunk_size / n_per_row; size_t local_size = 0; while (true) { std::unique_lock lock(mutex); - size_t first = counter; counter += chunk_size; - if (first >= nelements) { + int first_row = counter; counter += nrows_per_chunk; + if (first_row >= nrows) { if (local_size > 0) { for (int j=0; j %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); + LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); int64_t tot_count = 0; for (size_t i = 0; i < hist_cur.size(); i++) { hist_all[i] += hist_cur[i]; @@ -8774,6 +8834,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } if (tot_count > 0) { + LLAMA_LOG_INFO(" | hist: "); for (size_t i = 0; i < hist_cur.size(); i++) { LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements)); } @@ -8802,6 +8863,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s fout.close(); + for (auto type : used_iq2) { + ggml_deinit_iq2_quantization(type); + } + gguf_free(ctx_out); LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); @@ -9166,6 +9231,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.quantize_output_tensor =*/ true, /*.only_copy =*/ false, /*.pure =*/ false, + /*.imatrix =*/ nullptr, }; return result; diff --git a/llama.h b/llama.h index 01d6fafaa4b0d..79c8335b66bdf 100644 --- a/llama.h +++ b/llama.h @@ -249,6 +249,7 @@ extern "C" { bool quantize_output_tensor; // quantize output.weight bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored bool pure; // disable k-quant mixtures and quantize all tensors to the same type + void * imatrix; // pointer to importance matrix data } llama_model_quantize_params; // grammar types diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index d9b8b106a6033..22a7856d46f41 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -56,7 +56,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); std::vector dataq(ggml_row_size(tensor->type, size)); int64_t hist[16]; - ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist); + ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, nullptr); ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { // This is going to create some weird integers though. 
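Taken together, the changes above replace the old ggml_quantize_iq2_* entry points with row-aware quantize_iq2_xxs()/quantize_iq2_xs() functions and change ggml_quantize_chunk() to work on whole rows (nrows, n_per_row) with an optional importance matrix; callers such as test-backend-ops.cpp pass nullptr when no matrix is available, while the IQ2 types require one. The sketch below is a minimal illustration of the new interface based on the signatures introduced in this patch series, not code from the repository: the helper name, the all-ones importance matrix and the buffer handling are assumptions made for the example, n_per_row must be a multiple of QK_K, and dst is assumed to hold at least nrows * ggml_row_size(GGML_TYPE_IQ2_XXS, n_per_row) bytes.

    #include <stdint.h>
    #include <stdlib.h>

    #include "ggml.h"

    // Quantize nrows rows of n_per_row floats to IQ2_XXS via the row-based
    // ggml_quantize_chunk() API. The importance matrix here is a placeholder
    // (all ones); in practice it comes from calibration statistics.
    static size_t quantize_rows_iq2_xxs_example(const float * src, void * dst, int nrows, int n_per_row) {
        // the IQ2 grid and neighbour tables must exist before the first use
        ggml_init_iq2_quantization(GGML_TYPE_IQ2_XXS);

        float * imatrix = (float *) malloc(n_per_row*sizeof(float)); // assumed placeholder
        for (int i = 0; i < n_per_row; ++i) {
            imatrix[i] = 1.0f;
        }

        int64_t hist[16] = {0}; // still part of the API, unused by the IQ2 path

        // start = 0 -> begin at the first row; the return value is the number of
        // bytes written, i.e. nrows * ggml_row_size(GGML_TYPE_IQ2_XXS, n_per_row)
        const size_t written = ggml_quantize_chunk(GGML_TYPE_IQ2_XXS, src, dst, /*start =*/ 0,
                nrows, n_per_row, hist, imatrix);

        free(imatrix);
        ggml_deinit_iq2_quantization(GGML_TYPE_IQ2_XXS);

        return written;
    }
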
From ac32902a87147f78d63c931aa8a23dee762660e7 Mon Sep 17 00:00:00 2001 From: Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com> Date: Sun, 14 Jan 2024 00:41:44 -0800 Subject: [PATCH 057/131] llama : support WinXP build with MinGW 8.1.0 (#3419) --- CMakeLists.txt | 8 ++++++-- llama.cpp | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 668669c6da7e3..2741568ed3430 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.13) # for add_link_options +cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. project("llama.cpp" C CXX) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -76,6 +76,10 @@ if (NOT MSVC) option(LLAMA_F16C "llama: enable F16C" ${INS_ENB}) endif() +if (WIN32) + option(LLAMA_WIN_VER "llama: Windows Version" 0x602) +endif() + # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_BLAS "llama: use BLAS" OFF) @@ -686,7 +690,7 @@ endif() if (MINGW) # Target Windows 8 for PrefetchVirtualMemory - add_compile_definitions(_WIN32_WINNT=0x602) + add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER}) endif() # diff --git a/llama.cpp b/llama.cpp index 107b051147d3f..51e9bdaed451f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -987,6 +987,7 @@ struct llama_mmap { } if (prefetch > 0) { +#if _WIN32_WINNT >= 0x602 // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG); HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll"); @@ -1004,6 +1005,9 @@ struct llama_mmap { llama_format_win_err(GetLastError()).c_str()); } } +#else + throw std::runtime_error("PrefetchVirtualMemory unavailable"); +#endif } } From 5f5fe1bd608fa2ed42af97b5f2ea31be6625fc48 Mon Sep 17 00:00:00 2001 From: Alex Azarov Date: Sun, 14 Jan 2024 09:44:39 +0100 Subject: [PATCH 058/131] metal : correctly set SIMD support flags on iOS (#4923) * Correctly set support_simdgroup_reduction and support_simdgroup_mm on iPhone/iPad * log a little bit more info on iOS --- ggml-metal.m | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-metal.m b/ggml-metal.m index cae52c9830cb2..2ca726055f9ea 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -330,7 +330,6 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ } } -#if TARGET_OS_OSX // print MTL GPU family: GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]); @@ -370,6 +369,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx->support_simdgroup_reduction ? "true" : "false"); GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false"); GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? 
"true" : "false"); +#if TARGET_OS_OSX GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6); if (ctx->device.maxTransferRate != 0) { GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6); From a128c38de862431f1aae9ccc40b792fbc1b8b682 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Sun, 14 Jan 2024 10:53:39 +0200 Subject: [PATCH 059/131] Fix ffn_down quantization mix for MoE models (#4927) * Fix ffn_down quantization mix for MoE models In #4872 I did not consider the part where every third tensor is quantized with more bits. Fir MoE this leads to tensors of the same layer being quantized with different number of bits, which is not considered as a possibility in the inference implementation (it is assumed all experts use the same quantization). * Fix the fix * Review suggestion --------- Co-authored-by: Iwan Kawrakow --- llama.cpp | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/llama.cpp b/llama.cpp index 51e9bdaed451f..b1d6015e2132e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8480,13 +8480,31 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty new_type = GGML_TYPE_Q8_0; } } else if (name.find("ffn_down") != std::string::npos) { + const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); + int i_layer, n_layer; + if (n_expert == 1) { + i_layer = qs.i_feed_forward_w2; + n_layer = qs.n_feed_forward_w2; + } else { + // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly + // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work + // for getting the current layer as I initially thought, and we need to resort to parsing the + // tensor name. + n_layer = qs.n_feed_forward_w2 / n_expert; + if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) { + throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str())); + } + if (i_layer < 0 || i_layer >= n_layer) { + throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer)); + } + } if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { - if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K; + if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K - : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K + new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K + : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { @@ -8494,14 +8512,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { if (arch == LLM_ARCH_FALCON) { - new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K : - use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K : + use_more_bits(i_layer, n_layer) ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else { - if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; } } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) { new_type = GGML_TYPE_Q5_K; } ++qs.i_feed_forward_w2; From 03c526749041c863b0cd842b26b8907e1ea0e0b1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 14 Jan 2024 11:03:19 +0200 Subject: [PATCH 060/131] llama : use LLAMA_LOG_ macros for logging --- llama.cpp | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/llama.cpp b/llama.cpp index b1d6015e2132e..51821965e1b47 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1114,7 +1114,7 @@ struct llama_mlock { suggest = false; } - fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", + LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : ""); return false; } @@ -1123,7 +1123,7 @@ struct llama_mlock { static void raw_unlock(void * addr, size_t size) { if (munlock(addr, size)) { - fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno)); + LLAMA_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno)); } } #elif defined(_WIN32) @@ -1141,7 +1141,7 @@ struct llama_mlock { return true; } if (tries == 2) { - fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", + LLAMA_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", len, size, llama_format_win_err(GetLastError()).c_str()); return false; } @@ -1150,7 +1150,7 @@ struct llama_mlock { // set size and try again. 
SIZE_T min_ws_size, max_ws_size; if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) { - fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n", + LLAMA_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n", llama_format_win_err(GetLastError()).c_str()); return false; } @@ -1163,7 +1163,7 @@ struct llama_mlock { min_ws_size += increment; max_ws_size += increment; if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { - fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", + LLAMA_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n", llama_format_win_err(GetLastError()).c_str()); return false; } @@ -1172,7 +1172,7 @@ struct llama_mlock { static void raw_unlock(void * ptr, size_t len) { if (!VirtualUnlock(ptr, len)) { - fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n", + LLAMA_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n", llama_format_win_err(GetLastError()).c_str()); } } @@ -1184,7 +1184,7 @@ struct llama_mlock { } bool raw_lock(const void * addr, size_t len) const { - fprintf(stderr, "warning: mlock not supported on this system\n"); + LLAMA_LOG_WARN("warning: mlock not supported on this system\n"); return false; } @@ -2085,13 +2085,13 @@ namespace GGUFMeta { __func__, override_type_to_str(override->tag), override->key); switch (override->tag) { case LLAMA_KV_OVERRIDE_BOOL: { - printf("%s\n", override->bool_value ? "true" : "false"); + LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false"); } break; case LLAMA_KV_OVERRIDE_INT: { - printf("%" PRId64 "\n", override->int_value); + LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value); } break; case LLAMA_KV_OVERRIDE_FLOAT: { - printf("%.6f\n", override->float_value); + LLAMA_LOG_INFO("%.6f\n", override->float_value); } break; default: // Shouldn't be possible to end up here, but just in case... 
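// Rationale for this patch: messages that used to go straight to stdout/stderr via
// printf/fprintf now go through the LLAMA_LOG_INFO/WARN/ERROR macros, so they are
// routed to whatever callback the application registered. A minimal sketch, assuming
// the existing llama.h/ggml.h logging API (llama_log_set and ggml_log_callback), not
// code from this patch:
//
//     static void my_log_cb(enum ggml_log_level level, const char * text, void * user_data) {
//         (void) user_data;
//         if (level == GGML_LOG_LEVEL_ERROR) {
//             fputs(text, stderr); // keep errors, drop the rest
//         }
//     }
//
//     llama_log_set(my_log_cb, NULL);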
@@ -6993,7 +6993,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break; #ifdef PRETOKENIZERDEBUG - fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); + LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); #endif auto source = std::distance(buffer.begin(), it); @@ -7006,7 +7006,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length); #ifdef PRETOKENIZERDEBUG - fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str()); + LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str()); #endif it++; } @@ -7022,7 +7022,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length); #ifdef PRETOKENIZERDEBUG - fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str()); + LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str()); #endif it++; @@ -7038,7 +7038,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< raw_text_base_length = right_reminder_length; #ifdef PRETOKENIZERDEBUG - fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); + LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); #endif } else { if (source == 0) { @@ -7095,7 +7095,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & } #ifdef PRETOKENIZERDEBUG - fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); + LLAMA_LOG_WARN(TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif llm_tokenizer_spm tokenizer(vocab); llama_escape_whitespace(raw_text); @@ -7116,7 +7116,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); #ifdef PRETOKENIZERDEBUG - fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); + LLAMA_LOG_WARN(TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif llm_tokenizer_bpe tokenizer(vocab); tokenizer.tokenize(raw_text, output); @@ -8641,7 +8641,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (params->imatrix) { imatrix_data = static_cast>*>(params->imatrix); if (imatrix_data) { - printf("================================ Have weights data with %d entries\n",int(imatrix_data->size())); + LLAMA_LOG_INFO("================================ Have weights data with %d 
entries\n",int(imatrix_data->size())); } } @@ -8764,12 +8764,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (imatrix_data) { auto it = imatrix_data->find(tensor->name); if (it == imatrix_data->end()) { - printf("\n====== %s: did not find weights for %s\n", __func__, tensor->name); + LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); } else { if (it->second.size() == (size_t)tensor->ne[0]) { imatrix = it->second.data(); } else { - printf("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__, + LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__, int(it->second.size()), int(tensor->ne[0]), tensor->name); } } @@ -8777,10 +8777,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS || (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) { - fprintf(stderr, "\n\n============================================================\n"); - fprintf(stderr, "Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); - fprintf(stderr, "The result will be garbage, so bailing out\n"); - fprintf(stderr, "============================================================\n\n"); + LLAMA_LOG_ERROR("\n\n============================================================\n"); + LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); + LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); + LLAMA_LOG_ERROR("============================================================\n\n"); throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name)); } From 9408cfdad6b1c090a7e1419d4434edc260b7e47e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 14 Jan 2024 11:08:09 +0200 Subject: [PATCH 061/131] scripts : sync-ggml-am.sh option to skip commits --- scripts/sync-ggml-am.sh | 14 +++++++++++++- scripts/sync-ggml.last | 2 +- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index 248cf10235e36..6b2514a11905e 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -5,7 +5,7 @@ # Usage: # # $ cd /path/to/llama.cpp -# $ ./scripts/sync-ggml-am.sh +# $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2... 
# set -e @@ -24,6 +24,11 @@ fi lc=$(cat $SRC_LLAMA/scripts/sync-ggml.last) echo "Syncing ggml changes since commit $lc" +to_skip="" +if [ "$1" == "-skip" ]; then + to_skip=$2 +fi + cd $SRC_GGML git log --oneline $lc..HEAD @@ -40,6 +45,13 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then fi while read c; do + if [ -n "$to_skip" ]; then + if [[ $to_skip == *"$c"* ]]; then + echo "Skipping $c" + continue + fi + fi + git format-patch -k $c~1..$c --stdout -- \ include/ggml/ggml*.h \ src/ggml*.h \ diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 753d227a76a80..be9e408fbeb39 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -1890780da4ea10db88736fcde85f285abf6c64b0 +b306d6e996ec0ace77118fa5098822cdc7f9c88f From bb0c1392479398f9aba86d9ec98db0b95ede6e6d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 14 Jan 2024 13:26:53 +0200 Subject: [PATCH 062/131] llama : check LLAMA_TRACE env for extra logging (#4929) * llama : minor fix indent * llama : check LLAMA_TRACE env for extra logging ggml-ci --- llama.cpp | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/llama.cpp b/llama.cpp index 51821965e1b47..63f37ecdb8511 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2190,6 +2190,11 @@ struct llama_model_loader { LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") { + int trace = 0; + if (getenv("LLAMA_TRACE")) { + trace = atoi(getenv("LLAMA_TRACE")); + } + struct gguf_init_params params = { /*.no_alloc = */ true, /*.ctx = */ &ctx_meta, @@ -2242,11 +2247,10 @@ struct llama_model_loader { type_max = type; } - // TODO: make runtime configurable -#if 0 - struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); - LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str()); -#endif + if (trace > 0) { + struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); + LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str()); + } } switch (type_max) { @@ -6451,15 +6455,15 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { static const char * hex = "0123456789ABCDEF"; switch (llama_vocab_get_type(vocab)) { - case LLAMA_VOCAB_TYPE_SPM: { - const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; - return vocab.token_to_id.at(buf); - } - case LLAMA_VOCAB_TYPE_BPE: { - return vocab.token_to_id.at(bytes_to_unicode_bpe(ch)); - } - default: - GGML_ASSERT(false); + case LLAMA_VOCAB_TYPE_SPM: { + const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; + return vocab.token_to_id.at(buf); + } + case LLAMA_VOCAB_TYPE_BPE: { + return vocab.token_to_id.at(bytes_to_unicode_bpe(ch)); + } + default: + GGML_ASSERT(false); } } From 467a882fd2e5b6172897b49aa45aa29bd3f27685 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Sun, 14 Jan 2024 16:21:12 +0200 Subject: [PATCH 063/131] Add ability to use importance matrix for all k-quants (#4930) Co-authored-by: Iwan Kawrakow --- examples/quantize/quantize.cpp | 2 +- ggml-quants.c | 443 ++++++++++++++++++++++++++++++++- ggml-quants.h | 5 
+- ggml.c | 28 ++- 4 files changed, 462 insertions(+), 16 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index f4e2175f18612..2ae0469331353 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -82,7 +82,7 @@ static void usage(const char * executable) { printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); - printf(" --imatrixfile_name: use data in file_name as importance matrix for quant optimizations\n"); + printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n"); printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf("Note: --include-weights and --exclude-weights cannot be used together\n"); diff --git a/ggml-quants.c b/ggml-quants.c index 9290d54cfba7a..0750fe1bb27f1 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1244,7 +1244,8 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } -static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) { +static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type, + const float * restrict qw) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1270,14 +1271,13 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * rmse_type = -rmse_type; return_early = true; } - int weight_type = rmse_type%2; float sumlx = 0; float suml2 = 0; for (int i = 0; i < n; ++i) { int l = nearest_int(iscale * x[i]); l = MAX(-nmax, MIN(nmax-1, l)); L[i] = l + nmax; - float w = weight_type == 1 ? x[i] * x[i] : 1; + float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i])); sumlx += w*x[i]*l; suml2 += w*l*l; } @@ -1293,7 +1293,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * for (int i = 0; i < n; ++i) { int l = nearest_int(iscale * x[i]); l = MAX(-nmax, MIN(nmax-1, l)); - float w = weight_type == 1 ? x[i] * x[i] : 1; + float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i])); sumlx += w*x[i]*l; suml2 += w*l*l; } @@ -2089,6 +2089,112 @@ size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n return (n/QK_K*sizeof(block_q3_K)); } +static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) { +#if QK_K != 256 + (void)quant_weights; + quantize_row_q3_K_reference(x, y, n_per_row); +#else + assert(n_per_row % QK_K == 0); + const int nb = n_per_row / QK_K; + + int8_t L[QK_K]; + float scales[QK_K / 16]; + float weight[16]; + float sw[QK_K / 16]; + int8_t Ls[QK_K / 16]; + + for (int i = 0; i < nb; i++) { + + float sumx2 = 0; + for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j]; + float sigma2 = 2*sumx2/QK_K; + + for (int j = 0; j < QK_K/16; ++j) { + if (quant_weights) { + const float * qw = quant_weights ? 
quant_weights + QK_K * i + 16*j : NULL; + for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]); + } else { + for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l]; + } + float sumw = 0; + for (int l = 0; l < 16; ++l) sumw += weight[l]; + sw[j] = sumw; + + scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight); + + } + + memset(y[i].scales, 0, 12); + + float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw); + for (int j = 0; j < QK_K/16; ++j) { + int l = Ls[j]; + if (j < 8) { + y[i].scales[j] = l & 0xF; + } else { + y[i].scales[j-8] |= ((l & 0xF) << 4); + } + l >>= 4; + y[i].scales[j%4 + 8] |= (l << (2*(j/4))); + } + y[i].d = GGML_FP32_TO_FP16(d_block); + + int8_t sc; + for (int j = 0; j < QK_K/16; ++j) { + sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; + sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; + float d = GGML_FP16_TO_FP32(y[i].d) * sc; + if (!d) { + continue; + } + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-4, MIN(3, l)); + L[16*j + ii] = l + 4; + } + } + + memset(y[i].hmask, 0, QK_K/8); + // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc. + int m = 0; + uint8_t hm = 1; + for (int j = 0; j < QK_K; ++j) { + if (L[j] > 3) { + y[i].hmask[m] |= hm; + L[j] -= 4; + } + if (++m == QK_K/8) { + m = 0; hm <<= 1; + } + } + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); + } + } + + x += QK_K; + } +#endif +} + +size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + int row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row); + if (!quant_weights) { + quantize_row_q3_K_reference(src, dst, nrow*n_per_row); + } + else { + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } + } + return nrow * row_size; +} + // ====================== 4-bit (de)-quantization void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) { @@ -2254,6 +2360,108 @@ size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n return (n/QK_K*sizeof(block_q4_K)); } +static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) { +#if QK_K != 256 + (void)quant_weights; + quantize_row_q4_K_reference(x, y, n_per_row); +#else + assert(n_per_row % QK_K == 0); + const int nb = n_per_row / QK_K; + + uint8_t L[QK_K]; + uint8_t Laux[32]; + float weights[32]; + float mins[QK_K/32]; + float scales[QK_K/32]; + + for (int i = 0; i < nb; i++) { + + float sum_x2 = 0; + for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l]; + float sigma2 = sum_x2/QK_K; + float av_x = sqrtf(sigma2); + + float max_scale = 0; // as we are deducting the min, scales are always positive + float max_min = 0; + for (int j = 0; j < QK_K/32; ++j) { + if (quant_weights) { + const float * qw = quant_weights + QK_K*i + 32*j; + for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]); + } else { + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + } + scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); + //scales[j] = make_qkx2_quants(32, 15, x + 32*j, 
weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false); + float scale = scales[j]; + if (scale > max_scale) { + max_scale = scale; + } + float min = mins[j]; + if (min > max_min) { + max_min = min; + } + } + + float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; + float inv_min = max_min > 0 ? 63.f/max_min : 0.f; + for (int j = 0; j < QK_K/32; ++j) { + uint8_t ls = nearest_int(inv_scale*scales[j]); + uint8_t lm = nearest_int(inv_min*mins[j]); + ls = MIN(63, ls); + lm = MIN(63, lm); + if (j < 4) { + y[i].scales[j] = ls; + y[i].scales[j+4] = lm; + } else { + y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4); + y[i].scales[j-4] |= ((ls >> 4) << 6); + y[i].scales[j-0] |= ((lm >> 4) << 6); + } + } + y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); + y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); + + uint8_t sc, m; + for (int j = 0; j < QK_K/32; ++j) { + get_scale_min_k4(j, y[i].scales, &sc, &m); + const float d = GGML_FP16_TO_FP32(y[i].d) * sc; + if (!d) continue; + const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m; + for (int ii = 0; ii < 32; ++ii) { + int l = nearest_int((x[32*j + ii] + dm)/d); + l = MAX(0, MIN(15, l)); + L[32*j + ii] = l; + } + } + uint8_t * q = y[i].qs; + for (int j = 0; j < QK_K; j += 64) { + for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4); + q += 32; + } + + x += QK_K; + + } +#endif +} + +size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + int row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row); + if (!quant_weights) { + quantize_row_q4_K_reference(src, dst, nrow*n_per_row); + } + else { + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } + } + return nrow * row_size; +} + // ====================== 5-bit (de)-quantization void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { @@ -2349,7 +2557,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict #else float max_scale = 0, amax = 0; for (int j = 0; j < QK_K/16; ++j) { - scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1); + scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1, NULL); float abs_scale = fabsf(scales[j]); if (abs_scale > amax) { amax = abs_scale; @@ -2460,6 +2668,123 @@ size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n return (n/QK_K*sizeof(block_q5_K)); } +static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) { +#if QK_K != 256 + (void)quant_weights; + quantize_row_q5_K_reference(x, y, n_per_row); +#else + assert(n_per_row % QK_K == 0); + const int nb = n_per_row / QK_K; + + uint8_t L[QK_K]; + float mins[QK_K/32]; + float scales[QK_K/32]; + float weights[32]; + uint8_t Laux[32]; + + for (int i = 0; i < nb; i++) { + + float sum_x2 = 0; + for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l]; + float sigma2 = sum_x2/QK_K; + float av_x = sqrtf(sigma2); + + float max_scale = 0; // as we are deducting the min, scales are always positive + float max_min = 0; + for (int j = 0; j < QK_K/32; ++j) { + if (quant_weights) { + const float * qw = quant_weights + QK_K*i + 32*j; + for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]); + } else { + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + } + scales[j] = make_qkx3_quants(32, 31, x + 
32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); + float scale = scales[j]; + if (scale > max_scale) { + max_scale = scale; + } + float min = mins[j]; + if (min > max_min) { + max_min = min; + } + } + + float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; + float inv_min = max_min > 0 ? 63.f/max_min : 0.f; + for (int j = 0; j < QK_K/32; ++j) { + uint8_t ls = nearest_int(inv_scale*scales[j]); + uint8_t lm = nearest_int(inv_min*mins[j]); + ls = MIN(63, ls); + lm = MIN(63, lm); + if (j < 4) { + y[i].scales[j] = ls; + y[i].scales[j+4] = lm; + } else { + y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4); + y[i].scales[j-4] |= ((ls >> 4) << 6); + y[i].scales[j-0] |= ((lm >> 4) << 6); + } + } + y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); + y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); + + uint8_t sc, m; + for (int j = 0; j < QK_K/32; ++j) { + get_scale_min_k4(j, y[i].scales, &sc, &m); + const float d = GGML_FP16_TO_FP32(y[i].d) * sc; + if (!d) continue; + const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m; + for (int ii = 0; ii < 32; ++ii) { + int l = nearest_int((x[32*j + ii] + dm)/d); + l = MAX(0, MIN(31, l)); + L[32*j + ii] = l; + } + } + + uint8_t * restrict qh = y[i].qh; + uint8_t * restrict ql = y[i].qs; + memset(qh, 0, QK_K/8); + + uint8_t m1 = 1, m2 = 2; + for (int n = 0; n < QK_K; n += 64) { + for (int j = 0; j < 32; ++j) { + int l1 = L[n + j]; + if (l1 > 15) { + l1 -= 16; qh[j] |= m1; + } + int l2 = L[n + j + 32]; + if (l2 > 15) { + l2 -= 16; qh[j] |= m2; + } + ql[j] = l1 | (l2 << 4); + } + m1 <<= 2; m2 <<= 2; + ql += 32; + } + + x += QK_K; + + } +#endif +} + +size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + int row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row); + if (!quant_weights) { + quantize_row_q5_K_reference(src, dst, nrow*n_per_row); + } + else { + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } + } + return nrow * row_size; +} + // ====================== 6-bit (de)-quantization void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) { @@ -2476,7 +2801,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict for (int ib = 0; ib < QK_K/16; ++ib) { - const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1); + const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL); scales[ib] = scale; const float abs_scale = fabsf(scale); @@ -2608,6 +2933,112 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * return (n/QK_K*sizeof(block_q6_K)); } +static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) { +#if QK_K != 256 + (void)quant_weights; + quantize_row_q6_K_reference(x, y, n_per_row); +#else + assert(n_per_row % QK_K == 0); + const int nb = n_per_row / QK_K; + + int8_t L[QK_K]; + float scales[QK_K/16]; + //float weights[16]; + + for (int i = 0; i < nb; i++) { + + //float sum_x2 = 0; + //for (int j = 0; j < QK_K; ++j) sum_x2 += x[j]*x[j]; + //float sigma2 = sum_x2/QK_K; + + float max_scale = 0; + float max_abs_scale = 0; + + for (int ib = 0; ib < QK_K/16; ++ib) { + + float scale; + if (quant_weights) { + const float * qw = quant_weights + QK_K*i + 16*ib; + //for (int j = 0; j < 16; ++j) weights[j] = qw[j] * sqrtf(sigma2 + x[16*ib + j]*x[16*ib + j]); 
+ //scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, weights); + scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, qw); + } else { + scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL); + } + scales[ib] = scale; + + const float abs_scale = fabsf(scale); + if (abs_scale > max_abs_scale) { + max_abs_scale = abs_scale; + max_scale = scale; + } + + } + + if (!max_abs_scale) { + memset(&y[i], 0, sizeof(block_q6_K)); + y[i].d = GGML_FP32_TO_FP16(0.f); + x += QK_K; + continue; + } + + float iscale = -128.f/max_scale; + y[i].d = GGML_FP32_TO_FP16(1/iscale); + for (int ib = 0; ib < QK_K/16; ++ib) { + y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib])); + } + + for (int j = 0; j < QK_K/16; ++j) { + float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; + if (!d) { + continue; + } + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-32, MIN(31, l)); + L[16*j + ii] = l + 32; + } + } + + uint8_t * restrict ql = y[i].ql; + uint8_t * restrict qh = y[i].qh; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + const uint8_t q1 = L[j + l + 0] & 0xF; + const uint8_t q2 = L[j + l + 32] & 0xF; + const uint8_t q3 = L[j + l + 64] & 0xF; + const uint8_t q4 = L[j + l + 96] & 0xF; + ql[l+ 0] = q1 | (q3 << 4); + ql[l+32] = q2 | (q4 << 4); + qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6); + } + ql += 64; + qh += 32; + } + + x += QK_K; + + } +#endif +} + +size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + int row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row); + if (!quant_weights) { + quantize_row_q6_K_reference(src, dst, nrow*n_per_row); + } + else { + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } + } + return nrow * row_size; +} + // ====================== "True" 2-bit (de)-quantization static const uint64_t iq2xxs_grid[256] = { diff --git a/ggml-quants.h b/ggml-quants.h index e5d1102304ba5..99467936aa724 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -249,4 +249,7 @@ void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); - +size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); diff --git a/ggml.c b/ggml.c index 52467475a1f22..ef5888ab21538 100644 --- a/ggml.c +++ b/ggml.c @@ -18713,26 +18713,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i case GGML_TYPE_Q3_K: { GGML_ASSERT(start % QK_K == 0); - block_q3_K * block = (block_q3_K*)dst + start / QK_K; - result = ggml_quantize_q3_K(src + start, block, n, n, 
hist); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); } break; case GGML_TYPE_Q4_K: { GGML_ASSERT(start % QK_K == 0); - block_q4_K * block = (block_q4_K*)dst + start / QK_K; - result = ggml_quantize_q4_K(src + start, block, n, n, hist); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); } break; case GGML_TYPE_Q5_K: { GGML_ASSERT(start % QK_K == 0); - block_q5_K * block = (block_q5_K*)dst + start / QK_K; - result = ggml_quantize_q5_K(src + start, block, n, n, hist); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); } break; case GGML_TYPE_Q6_K: { GGML_ASSERT(start % QK_K == 0); - block_q6_K * block = (block_q6_K*)dst + start / QK_K; - result = ggml_quantize_q6_K(src + start, block, n, n, hist); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); } break; case GGML_TYPE_IQ2_XXS: { From a836c8f534ab789b02da149fbdaf7735500bff74 Mon Sep 17 00:00:00 2001 From: David Pflug Date: Sun, 14 Jan 2024 10:46:00 -0500 Subject: [PATCH 064/131] llama : fix missing quotes (#4937) --- llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 63f37ecdb8511..7af38718c4130 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7099,7 +7099,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & } #ifdef PRETOKENIZERDEBUG - LLAMA_LOG_WARN(TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); + LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif llm_tokenizer_spm tokenizer(vocab); llama_escape_whitespace(raw_text); @@ -7120,7 +7120,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); #ifdef PRETOKENIZERDEBUG - LLAMA_LOG_WARN(TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); + LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif llm_tokenizer_bpe tokenizer(vocab); tokenizer.tokenize(raw_text, output); From 4a3156de2fac9a8ee4279de7804d4e352dcfe121 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Mon, 15 Jan 2024 07:48:06 +0200 Subject: [PATCH 065/131] CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938) Co-authored-by: Iwan Kawrakow --- ggml-cuda.cu | 77 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index bd3814c72b407..a870718a745cb 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1105,6 
+1105,61 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in #endif // GGML_CUDA_F16 } +template +static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) { + + const int i = blockIdx.x; + + // assume 32 threads + const int tid = threadIdx.x; + const int il = tid/8; + const int ir = tid%8; + const int ib = 8*i + ir; + if (ib >= nb32) { + return; + } + + dst_t * y = yy + 256*i + 32*ir + 4*il; + + const block_q4_0 * x = (const block_q4_0 *)vx + ib; + const float d = __half2float(x->d); + const float dm = -8*d; + + const uint8_t * q = x->qs + 4*il; + + for (int l = 0; l < 4; ++l) { + y[l+ 0] = d * (q[l] & 0xF) + dm; + y[l+16] = d * (q[l] >> 4) + dm; + } +} + +template +static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) { + + const int i = blockIdx.x; + + // assume 32 threads + const int tid = threadIdx.x; + const int il = tid/8; + const int ir = tid%8; + const int ib = 8*i + ir; + if (ib >= nb32) { + return; + } + + dst_t * y = yy + 256*i + 32*ir + 4*il; + + const block_q4_1 * x = (const block_q4_1 *)vx + ib; + const float2 d = __half22float2(x->dm); + + const uint8_t * q = x->qs + 4*il; + + for (int l = 0; l < 4; ++l) { + y[l+ 0] = d.x * (q[l] & 0xF) + d.y; + y[l+16] = d.x * (q[l] >> 4) + d.y; + } +} + //================================== k-quants template @@ -6253,6 +6308,20 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu #endif } +template +static void dequantize_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb32 = k / 32; + const int nb = (k + 255) / 256; + dequantize_block_q4_0<<>>(vx, y, nb32); +} + +template +static void dequantize_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb32 = k / 32; + const int nb = (k + 255) / 256; + dequantize_block_q4_1<<>>(vx, y, nb32); +} + template static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; @@ -6301,9 +6370,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { int id; switch (type) { case GGML_TYPE_Q4_0: - return dequantize_block_cuda; + return dequantize_q4_0_cuda; case GGML_TYPE_Q4_1: - return dequantize_block_cuda; + return dequantize_q4_1_cuda; case GGML_TYPE_Q5_0: return dequantize_block_cuda; case GGML_TYPE_Q5_1: @@ -6338,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: - return dequantize_block_cuda; + return dequantize_q4_0_cuda; case GGML_TYPE_Q4_1: - return dequantize_block_cuda; + return dequantize_q4_1_cuda; case GGML_TYPE_Q5_0: return dequantize_block_cuda; case GGML_TYPE_Q5_1: From 2faaef39799c97a53bec3898141478700da25757 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Mon, 15 Jan 2024 10:09:38 +0200 Subject: [PATCH 066/131] llama : check for 256 divisibility for IQ2_XS, IQ2_XXS (#4950) Co-authored-by: Iwan Kawrakow --- llama.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 7af38718c4130..f9718060d5ea9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8559,7 +8559,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty //} bool convert_incompatible_tensor = false; if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == 
GGML_TYPE_Q4_K || - new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { + new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || + new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { @@ -8571,6 +8572,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty } if (convert_incompatible_tensor) { switch (new_type) { + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break; case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break; case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; From ddb008d845cd50bb090bf051f570130524042936 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 15 Jan 2024 13:27:00 +0200 Subject: [PATCH 067/131] cuda : fix dequantize kernel names (#4938) --- ggml-cuda.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index a870718a745cb..c3e14bc96ec38 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -6309,14 +6309,14 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu } template -static void dequantize_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { +static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb32 = k / 32; const int nb = (k + 255) / 256; dequantize_block_q4_0<<>>(vx, y, nb32); } template -static void dequantize_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { +static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb32 = k / 32; const int nb = (k + 255) / 256; dequantize_block_q4_1<<>>(vx, y, nb32); @@ -6370,9 +6370,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { int id; switch (type) { case GGML_TYPE_Q4_0: - return dequantize_q4_0_cuda; + return dequantize_row_q4_0_cuda; case GGML_TYPE_Q4_1: - return dequantize_q4_1_cuda; + return dequantize_row_q4_1_cuda; case GGML_TYPE_Q5_0: return dequantize_block_cuda; case GGML_TYPE_Q5_1: @@ -6407,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: - return dequantize_q4_0_cuda; + return dequantize_row_q4_0_cuda; case GGML_TYPE_Q4_1: - return dequantize_q4_1_cuda; + return dequantize_row_q4_1_cuda; case GGML_TYPE_Q5_0: return dequantize_block_cuda; case GGML_TYPE_Q5_1: From d9aa4ffa6e0296d42f1f676dd85de97c8491eb73 Mon Sep 17 00:00:00 2001 From: "Victor Z. 
Peng" Date: Mon, 15 Jan 2024 04:41:46 -0800 Subject: [PATCH 068/131] awq-py : fix typo in awq-py/README.md (#4947) --- awq-py/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awq-py/README.md b/awq-py/README.md index 59354f4e329a2..16e68d027e239 100644 --- a/awq-py/README.md +++ b/awq-py/README.md @@ -43,7 +43,7 @@ Example for llama model # For llama7b and llama2 models python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf # For mistral and mpt models -python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf +python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf ``` ## Quantize From 4483396751c79dea540808b9cb9238245d06da2b Mon Sep 17 00:00:00 2001 From: David Friehs Date: Mon, 15 Jan 2024 14:06:52 +0100 Subject: [PATCH 069/131] llama : apply classifier-free guidance to logits directly (#4951) --- common/sampling.cpp | 9 ++++---- llama.cpp | 56 ++++++++++++++++++++++++++++++--------------- llama.h | 17 ++++++++++---- 3 files changed, 55 insertions(+), 27 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 8e45909f1faf2..dd1ffeb1b8083 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -190,6 +190,11 @@ static llama_token llama_sampling_sample_impl( logits[it->first] += it->second; } + if (ctx_cfg) { + float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx); + llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale); + } + cur.clear(); for (llama_token token_id = 0; token_id < n_vocab; token_id++) { @@ -198,10 +203,6 @@ static llama_token llama_sampling_sample_impl( llama_token_data_array cur_p = { cur.data(), cur.size(), false }; - if (ctx_cfg) { - llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale); - } - // apply penalties const auto& penalty_tokens = params.use_penalty_prompt_tokens ? 
params.penalty_prompt_tokens : prev; const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n); diff --git a/llama.cpp b/llama.cpp index f9718060d5ea9..46c4d11c88873 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7898,39 +7898,59 @@ static void llama_log_softmax(float * array, size_t size) { } } +void llama_sample_apply_guidance( + struct llama_context * ctx, + float * logits, + float * logits_guidance, + float scale) { + GGML_ASSERT(ctx); + + const auto t_start_sample_us = ggml_time_us(); + const auto n_vocab = llama_n_vocab(llama_get_model(ctx)); + + llama_log_softmax(logits, n_vocab); + llama_log_softmax(logits_guidance, n_vocab); + + for (int i = 0; i < n_vocab; ++i) { + auto & l = logits[i]; + const auto & g = logits_guidance[i]; + + l = scale * (l - g) + g; + } + + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; +} + void llama_sample_classifier_free_guidance( struct llama_context * ctx, llama_token_data_array * candidates, struct llama_context * guidance_ctx, float scale) { - int64_t t_start_sample_us = ggml_time_us(); - GGML_ASSERT(ctx); + int64_t t_start_sample_us; - auto n_vocab = llama_n_vocab(llama_get_model(ctx)); + t_start_sample_us = ggml_time_us(); + const size_t n_vocab = llama_n_vocab(llama_get_model(ctx)); - GGML_ASSERT(n_vocab == (int)candidates->size); + GGML_ASSERT(n_vocab == candidates->size); GGML_ASSERT(!candidates->sorted); - std::vector logits_base; - logits_base.reserve(candidates->size); - for (size_t i = 0; i < candidates->size; ++i) { - logits_base.push_back(candidates->data[i].logit); + std::vector logits_base(n_vocab); + for (size_t i = 0; i < n_vocab; ++i) { + logits_base[i] = candidates->data[i].logit; } - llama_log_softmax(logits_base.data(), candidates->size); - float* logits_guidance = llama_get_logits(guidance_ctx); - llama_log_softmax(logits_guidance, n_vocab); + float * logits_guidance = llama_get_logits(guidance_ctx); - for (int i = 0; i < n_vocab; ++i) { - float logit_guidance = logits_guidance[i]; - float logit_base = logits_base[i]; - candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance; - } + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale); + t_start_sample_us = ggml_time_us(); - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + for (size_t i = 0; i < n_vocab; ++i) { + candidates->data[i].logit = logits_base[i]; } + + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; } llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) { diff --git a/llama.h b/llama.h index 79c8335b66bdf..a570b0d6968fb 100644 --- a/llama.h +++ b/llama.h @@ -714,14 +714,21 @@ extern "C" { float penalty_present); /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 - /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. - /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. - /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. 
- LLAMA_API void llama_sample_classifier_free_guidance( + /// @param logits Logits extracted from the original generation context. + /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. + /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. + LLAMA_API void llama_sample_apply_guidance( + struct llama_context * ctx, + float * logits, + float * logits_guidance, + float scale); + + LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance( struct llama_context * ctx, llama_token_data_array * candidates, struct llama_context * guidance_ctx, - float scale); + float scale), + "use llama_sample_apply_guidance() instead"); /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. LLAMA_API void llama_sample_softmax( From 3e5ca7931c68152e4ec18d126e9c832dd84914c8 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Mon, 15 Jan 2024 20:40:48 +0200 Subject: [PATCH 070/131] pass cpu-architecture arguments only to host code (C;C++) (#4943) --- CMakeLists.txt | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2741568ed3430..7bd64096626a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -594,6 +594,13 @@ if (NOT MSVC) endif() endif() +function(add_compile_option_cpp ARG) + # Adds a compile option to C/C++ only, but not for Cuda. + # Use, e.g., for CPU-architecture flags. + add_compile_options($<$:${ARG}>) + add_compile_options($<$:${ARG}>) +endfunction() + if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64")) message(STATUS "ARM detected") if (MSVC) @@ -628,8 +635,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE include(cmake/FindSIMD.cmake) endif () if (LLAMA_AVX512) - add_compile_options($<$:/arch:AVX512>) - add_compile_options($<$:/arch:AVX512>) + add_compile_option_cpp(/arch:AVX512) # MSVC has no compile-time flags enabling specific # AVX512 extensions, neither it defines the # macros corresponding to the extensions. 
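For context on the hunk above: C/C++ sources normally select AVX-512 code paths by testing exactly these predefined macros, so the manual `add_compile_definitions` calls in this diff are what keep MSVC builds from silently falling back to the scalar path even when `/arch:AVX512` is passed. A minimal, generic C sketch of that pattern (illustrative only; the function below is an assumption for demonstration and is not taken from this patch):

```c
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VNNI__)
#include <immintrin.h>
// AVX-512 VNNI path: compiled only when the macros are defined,
// which is why the CMake changes define them explicitly for MSVC.
static int32_t dot_u8_i8(const uint8_t * a, const int8_t * b) {
    __m512i acc = _mm512_setzero_si512();
    __m512i va  = _mm512_loadu_si512((const __m512i *) a);
    __m512i vb  = _mm512_loadu_si512((const __m512i *) b);
    acc = _mm512_dpbusd_epi32(acc, va, vb);   // u8 x i8 dot product, accumulate into i32 lanes
    return _mm512_reduce_add_epi32(acc);
}
#else
// Portable fallback used when the extensions are not advertised.
static int32_t dot_u8_i8(const uint8_t * a, const int8_t * b) {
    int32_t sum = 0;
    for (int i = 0; i < 64; ++i) sum += (int32_t) a[i] * b[i];
    return sum;
}
#endif
```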
@@ -643,37 +649,35 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE add_compile_definitions($<$:__AVX512VNNI__>) endif() elseif (LLAMA_AVX2) - add_compile_options($<$:/arch:AVX2>) - add_compile_options($<$:/arch:AVX2>) + add_compile_option_cpp(/arch:AVX2) elseif (LLAMA_AVX) - add_compile_options($<$:/arch:AVX>) - add_compile_options($<$:/arch:AVX>) + add_compile_option_cpp(/arch:AVX) endif() else() if (LLAMA_NATIVE) - add_compile_options(-march=native) + add_compile_option_cpp(-march=native) endif() if (LLAMA_F16C) - add_compile_options(-mf16c) + add_compile_option_cpp(-mf16c) endif() if (LLAMA_FMA) - add_compile_options(-mfma) + add_compile_option_cpp(-mfma) endif() if (LLAMA_AVX) - add_compile_options(-mavx) + add_compile_option_cpp(-mavx) endif() if (LLAMA_AVX2) - add_compile_options(-mavx2) + add_compile_option_cpp(-mavx2) endif() if (LLAMA_AVX512) - add_compile_options(-mavx512f) - add_compile_options(-mavx512bw) + add_compile_option_cpp(-mavx512f) + add_compile_option_cpp(-mavx512bw) endif() if (LLAMA_AVX512_VBMI) - add_compile_options(-mavx512vbmi) + add_compile_option_cpp(-mavx512vbmi) endif() if (LLAMA_AVX512_VNNI) - add_compile_options(-mavx512vnni) + add_compile_option_cpp(-mavx512vnni) endif() endif() elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") From e0324285a569d0583cf2f4a07a2402221ee25f58 Mon Sep 17 00:00:00 2001 From: stduhpf Date: Tue, 16 Jan 2024 12:04:32 +0100 Subject: [PATCH 071/131] speculative : threading options (#4959) * speculative: expose draft threading * fix usage format * accept -td and -tbd args * speculative: revert default behavior when -td is unspecified * fix trailing whitespace --- common/common.cpp | 22 ++++++++++++++++++++++ common/common.h | 2 ++ examples/speculative/speculative.cpp | 4 ++++ 3 files changed, 28 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index c11006bcb9175..2b0865fff0e62 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -167,6 +167,24 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads_batch <= 0) { params.n_threads_batch = std::thread::hardware_concurrency(); } + } else if (arg == "-td" || arg == "--threads-draft") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads_draft = std::stoi(argv[i]); + if (params.n_threads_draft <= 0) { + params.n_threads_draft = std::thread::hardware_concurrency(); + } + } else if (arg == "-tbd" || arg == "--threads-batch-draft") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads_batch_draft = std::stoi(argv[i]); + if (params.n_threads_batch_draft <= 0) { + params.n_threads_batch_draft = std::thread::hardware_concurrency(); + } } else if (arg == "-p" || arg == "--prompt") { if (++i >= argc) { invalid_param = true; @@ -845,6 +863,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); printf(" -tb N, --threads-batch N\n"); printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); + printf(" -td N, --threads-draft N"); + printf(" number of threads to use during generation (default: same as --threads)"); + printf(" -tbd N, --threads-batch-draft N\n"); + printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n"); printf(" -p PROMPT, --prompt PROMPT\n"); printf(" prompt to start generation with (default: empty)\n"); 
printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); diff --git a/common/common.h b/common/common.h index 096468243d88c..1f43e6282f48d 100644 --- a/common/common.h +++ b/common/common.h @@ -46,7 +46,9 @@ struct gpt_params { uint32_t seed = -1; // RNG seed int32_t n_threads = get_num_physical_cores(); + int32_t n_threads_draft = -1; int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) + int32_t n_threads_batch_draft = -1; int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 20f1fb5bfcd99..7b3af01f339a9 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -65,6 +65,10 @@ int main(int argc, char ** argv) { // load the draft model params.model = params.model_draft; params.n_gpu_layers = params.n_gpu_layers_draft; + if (params.n_threads_draft > 0) { + params.n_threads = params.n_threads_draft; + } + params.n_threads_batch = params.n_threads_batch_draft; std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); { From d75c232e1da56f19ac4d2530dadbe0ab3a11fde5 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 16 Jan 2024 12:14:19 +0100 Subject: [PATCH 072/131] finetune : use LLAMA_FILE_MAGIC_GGLA (#4961) This commit replaces the magic number LLAMA_FILE_MAGIC_LORA used in finetune.cpp with LLAMA_FILE_MAGIC_GGLA defined in llama.h. Signed-off-by: Daniel Bevenius --- examples/finetune/finetune.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index eaca42fc1c356..a6620fd73ca18 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1138,9 +1138,8 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor return tn_buf.data(); }; - uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla' // write_magic - file.write_u32(LLAMA_FILE_MAGIC_LORA); // magic + file.write_u32(LLAMA_FILE_MAGIC_GGLA); // magic file.write_u32(1); // version // write_hparams file.write_u32(lora->hparams.lora_r); From a0b3ac8c48b66206b9c5921ce57bd5c0ea6557c3 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Tue, 16 Jan 2024 03:16:33 -0800 Subject: [PATCH 073/131] ggml : introduce GGML_CALL function annotation (#4850) This change makes it possible to build ggml-cuda.cu and ggml-metal.m as independent dynamic shared objects, that may be conditionally linked at runtime in a multiplatform binary. It introduces a GGML_CALL annotation that documents which functions have a cyclic call relationship, between the application code and GPU modules. 
This change does nothing, unless the build defines -DGGML_MULTIPLATFORM which causes back-references and function pointers to conform to MS ABI which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms --- ggml-backend-impl.h | 60 +++++++++++----------- ggml-backend.c | 80 ++++++++++++++--------------- ggml-backend.h | 50 +++++++++--------- ggml-cuda.cu | 121 ++++++++++++++++++++++---------------------- ggml-cuda.h | 32 ++++++------ ggml-metal.h | 4 +- ggml-metal.m | 42 +++++++-------- ggml.c | 32 ++++++------ ggml.h | 58 ++++++++++++--------- 9 files changed, 244 insertions(+), 235 deletions(-) diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index 1db32901fe6c7..1397828d9ac71 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -16,14 +16,14 @@ extern "C" { typedef void * ggml_backend_buffer_type_context_t; struct ggml_backend_buffer_type_i { - const char * (*get_name) (ggml_backend_buffer_type_t buft); - ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); - size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment - size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding - bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend + const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft); + ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); + size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment + size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding + bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend // check if tensor data is in host memory // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) - bool (*is_host) (ggml_backend_buffer_type_t buft); + bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft); }; struct ggml_backend_buffer_type { @@ -35,15 +35,15 @@ extern "C" { typedef void * ggml_backend_buffer_context_t; struct ggml_backend_buffer_i { - const char * (*get_name) (ggml_backend_buffer_t buffer); - void (*free_buffer)(ggml_backend_buffer_t buffer); - void * (*get_base) (ggml_backend_buffer_t buffer); - void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer - void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); - void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras + const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer); + void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer); + void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer); + void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + void (*GGML_CALL set_tensor) 
(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer + void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value); + void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras }; struct ggml_backend_buffer { @@ -54,7 +54,7 @@ extern "C" { enum ggml_backend_buffer_usage usage; }; - ggml_backend_buffer_t ggml_backend_buffer_init( + GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init( ggml_backend_buffer_type_t buft, struct ggml_backend_buffer_i iface, ggml_backend_buffer_context_t context, @@ -70,31 +70,31 @@ extern "C" { typedef void * ggml_backend_context_t; struct ggml_backend_i { - const char * (*get_name)(ggml_backend_t backend); + const char * (*GGML_CALL get_name)(ggml_backend_t backend); - void (*free)(ggml_backend_t backend); + void (*GGML_CALL free)(ggml_backend_t backend); // buffer allocation - ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend); + ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend); // (optional) asynchronous tensor data access - void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - bool (*cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst); + void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst); // (optional) complete all pending operations - void (*synchronize)(ggml_backend_t backend); + void (*GGML_CALL synchronize)(ggml_backend_t backend); // compute graph with a plan - ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); - void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); + void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph without a plan (async) - bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); + bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); // check if the backend supports an operation - bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); }; struct ggml_backend { @@ -107,9 +107,9 @@ extern "C" { // Backend registry // - typedef ggml_backend_t 
(*ggml_backend_init_fn)(const char * params, void * user_data); + typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data); - void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data); + GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data); #ifdef __cplusplus } diff --git a/ggml-backend.c b/ggml-backend.c index 505dbba476253..f5424fb904117 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -19,7 +19,7 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { return buft->iface.get_name(buft); } -ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { return buft->iface.alloc_buffer(buft, size); } @@ -27,7 +27,7 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) { return buft->iface.get_alignment(buft); } -size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { +GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { // get_alloc_size is optional, defaults to ggml_nbytes if (buft->iface.get_alloc_size) { return buft->iface.get_alloc_size(buft, tensor); @@ -48,7 +48,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) { // backend buffer -ggml_backend_buffer_t ggml_backend_buffer_init( +GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init( ggml_backend_buffer_type_t buft, struct ggml_backend_buffer_i iface, ggml_backend_buffer_context_t context, @@ -95,7 +95,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { return base; } -void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { +GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { // init_tensor is optional if (buffer->iface.init_tensor) { buffer->iface.init_tensor(buffer, tensor); @@ -191,7 +191,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten } } -void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); @@ -201,7 +201,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size); } -void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); @@ -318,9 +318,9 @@ struct ggml_backend_reg { static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG]; static size_t ggml_backend_registry_count = 0; -static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data); +GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data); -static void ggml_backend_registry_init(void) { +GGML_CALL static void ggml_backend_registry_init(void) { static bool initialized = false; if (initialized) { @@ -333,18 +333,18 @@ static void ggml_backend_registry_init(void) { // add forward decls here to avoid including the backend headers #ifdef GGML_USE_CUBLAS - extern void ggml_backend_cuda_reg_devices(void); + extern GGML_CALL void ggml_backend_cuda_reg_devices(void); ggml_backend_cuda_reg_devices(); #endif #ifdef GGML_USE_METAL - extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); - extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); + extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); + extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL); #endif } -void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { +GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG); size_t id = ggml_backend_registry_count; @@ -439,33 +439,33 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) { // backend CPU -static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) { +GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) { return "CPU"; GGML_UNUSED(buffer); } -static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { +GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { return (void *)buffer->context; } -static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { +GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { free(buffer->context); } -static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); GGML_UNUSED(buffer); } -static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { memcpy(data, (const char *)tensor->data + offset, size); GGML_UNUSED(buffer); } -static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { +GGML_CALL static bool 
ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); return true; @@ -475,7 +475,7 @@ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con GGML_UNUSED(buffer); } -static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { memset(buffer->context, value, buffer->size); } @@ -506,13 +506,13 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 -static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { +GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "CPU"; GGML_UNUSED(buft); } -static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC? @@ -521,25 +521,25 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size); } -static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return TENSOR_ALIGNMENT; GGML_UNUSED(buft); } -static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { +GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { return ggml_backend_is_cpu(backend); GGML_UNUSED(buft); } -static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { +GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; GGML_UNUSED(buft); } -ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { +GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { /* .iface = */ { /* .get_name = */ ggml_backend_cpu_buffer_type_get_name, @@ -561,23 +561,23 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { #include -static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { +GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "CPU_HBM"; GGML_UNUSED(buft); } -static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) { +GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) { return "CPU_HBM"; GGML_UNUSED(buf); } -static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { +GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { hbw_free(buffer->context); } -static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_CALL static ggml_backend_buffer_t 
ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { //void * ptr = hbw_malloc(size); void * ptr; int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); @@ -617,20 +617,20 @@ struct ggml_backend_cpu_context { size_t work_size; }; -static const char * ggml_backend_cpu_name(ggml_backend_t backend) { +GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) { return "CPU"; GGML_UNUSED(backend); } -static void ggml_backend_cpu_free(ggml_backend_t backend) { +GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; free(cpu_ctx->work_data); free(cpu_ctx); free(backend); } -static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) { +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_cpu_buffer_type(); GGML_UNUSED(backend); @@ -641,7 +641,7 @@ struct ggml_backend_plan_cpu { struct ggml_cgraph cgraph; }; -static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { +GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); @@ -656,7 +656,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend return cpu_plan; } -static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; free(cpu_plan->cplan.work_data); @@ -665,7 +665,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen GGML_UNUSED(backend); } -static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); @@ -673,7 +673,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac GGML_UNUSED(backend); } -static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); @@ -690,7 +690,7 @@ static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c return true; } -static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { +GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { switch (op->op) { case GGML_OP_MUL_MAT: return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; @@ -732,7 +732,7 @@ 
ggml_backend_t ggml_backend_cpu_init(void) { return cpu_backend; } -bool ggml_backend_is_cpu(ggml_backend_t backend) { +GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) { return backend && backend->iface.get_name == ggml_backend_cpu_name; } @@ -743,11 +743,11 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } -ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { +GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size); } -static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) { +GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) { return ggml_backend_cpu_init(); GGML_UNUSED(params); diff --git a/ggml-backend.h b/ggml-backend.h index 4eb244af1d3e7..12b4b4ab74935 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -17,12 +17,12 @@ extern "C" { // // buffer type - GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); - GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); - GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); - GGML_API size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); - GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); - GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); + GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); + GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); + GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); + GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); + GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); + GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); // buffer enum ggml_backend_buffer_usage { @@ -30,18 +30,18 @@ extern "C" { GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, }; - GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); - GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); - GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); - GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); + GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); + GGML_API 
void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); + GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); + GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); + GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); + GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); // // Backend @@ -58,8 +58,8 @@ extern "C" { GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); GGML_API void ggml_backend_synchronize(ggml_backend_t backend); @@ -80,13 +80,13 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_cpu_init(void); - GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend); - GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); + GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); + GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); // Create a backend buffer from an existing pointer - GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); + GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); - GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); + GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); #ifdef GGML_USE_CPU_HBM GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); @@ -183,7 +183,7 @@ extern "C" { GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); - typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); + typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); // Compare the output of two backends GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 
c3e14bc96ec38..568c411afd3ee 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7615,11 +7615,11 @@ struct cuda_pool_alloc { static bool g_cublas_loaded = false; -bool ggml_cublas_loaded(void) { +GGML_CALL bool ggml_cublas_loaded(void) { return g_cublas_loaded; } -void ggml_init_cublas() { +GGML_CALL void ggml_init_cublas() { static bool initialized = false; if (!initialized) { @@ -7707,7 +7707,7 @@ void ggml_init_cublas() { } } -void * ggml_cuda_host_malloc(size_t size) { +GGML_CALL void * ggml_cuda_host_malloc(size_t size) { if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { return nullptr; } @@ -7725,7 +7725,7 @@ void * ggml_cuda_host_malloc(size_t size) { return ptr; } -void ggml_cuda_host_free(void * ptr) { +GGML_CALL void ggml_cuda_host_free(void * ptr) { CUDA_CHECK(cudaFreeHost(ptr)); } @@ -9242,7 +9242,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); } -bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { +GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { if (!g_cublas_loaded) return false; const int64_t ne10 = src1->ne[0]; @@ -10013,7 +10013,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); } -static void ggml_cuda_set_main_device(const int main_device) { +GGML_CALL static void ggml_cuda_set_main_device(const int main_device) { if (main_device >= g_device_count) { fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", main_device, g_device_count, g_main_device); @@ -10028,7 +10028,7 @@ static void ggml_cuda_set_main_device(const int main_device) { } } -bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { if (!g_cublas_loaded) return false; ggml_cuda_func_t func; @@ -10186,7 +10186,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ return true; } -int ggml_cuda_get_device_count() { +GGML_CALL int ggml_cuda_get_device_count() { int device_count; if (cudaGetDeviceCount(&device_count) != cudaSuccess) { return 0; @@ -10194,7 +10194,7 @@ int ggml_cuda_get_device_count() { return device_count; } -void ggml_cuda_get_device_description(int device, char * description, size_t description_size) { +GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); snprintf(description, description_size, "%s", prop.name); @@ -10244,27 +10244,27 @@ struct ggml_backend_cuda_buffer_context { } }; -static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { +GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; return ctx->name.c_str(); } -static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { +GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name; } -static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { 
+GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; CUDA_CHECK(cudaFree(ctx->dev_ptr)); delete ctx; } -static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { +GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; return ctx->dev_ptr; } -static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; if (tensor->view_src != NULL && tensor->view_offs == 0) { @@ -10296,7 +10296,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g } } -static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; @@ -10307,7 +10307,7 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg CUDA_CHECK(cudaDeviceSynchronize()); } -static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; @@ -10318,7 +10318,7 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co CUDA_CHECK(cudaDeviceSynchronize()); } -static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { +GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { if (ggml_backend_buffer_is_cuda(src->buffer)) { ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context; ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context; @@ -10335,7 +10335,7 @@ static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, co return false; } -static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); @@ -10357,19 +10357,18 @@ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { }; // cuda buffer type - struct ggml_backend_cuda_buffer_type_context { int device; std::string name; }; -static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) { +GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) { 
ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; return ctx->name.c_str(); } -static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; ggml_cuda_set_device(buft_ctx->device); @@ -10388,13 +10387,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size); } -static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 128; UNUSED(buft); } -static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { +GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { int64_t row_low = 0; int64_t row_high = ggml_nrows(tensor); int64_t nrows_split = row_high - row_low; @@ -10414,7 +10413,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t UNUSED(buft); } -static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { +GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { if (!ggml_backend_is_cuda(backend)) { return false; } @@ -10434,7 +10433,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { /* .is_host = */ NULL, }; -ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { +GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { // FIXME: this is not thread safe if (device >= ggml_backend_cuda_get_device_count()) { return nullptr; @@ -10479,7 +10478,7 @@ struct ggml_backend_cuda_split_buffer_context { std::vector tensor_extras; }; -static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) { +GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) { return GGML_CUDA_NAME "_Split"; UNUSED(buffer); @@ -10490,19 +10489,19 @@ static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_ // return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name; //} -static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { +GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; delete ctx; } -static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { +GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced return (void *)0x1000; UNUSED(buffer); } -static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { GGML_ASSERT(tensor->view_src == nullptr); 
// views of split tensors are not supported ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; @@ -10552,7 +10551,7 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf tensor->extra = extra; } -static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -10586,7 +10585,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff } } -static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -10620,7 +10619,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff } } -static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { UNUSED(buffer); UNUSED(value); } @@ -10639,13 +10638,13 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { // cuda split buffer type -static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) { +GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) { return GGML_CUDA_NAME "_Split"; UNUSED(buft); } -static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point // instead, we allocate them for each tensor separately in init_tensor // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, @@ -10655,13 +10654,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size); } -static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 128; UNUSED(buft); } -static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { +GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; size_t total_size = 0; @@ -10688,13 +10687,13 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_bu return 
total_size; } -static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { +GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { return ggml_backend_is_cuda(backend); UNUSED(buft); } -static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { +GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return false; UNUSED(buft); @@ -10709,7 +10708,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, }; -ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { +GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { // FIXME: this is not thread safe static std::map, struct ggml_backend_buffer_type> buft_map; @@ -10745,23 +10744,23 @@ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * ten // host buffer type -static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) { +GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) { return GGML_CUDA_NAME "_Host"; UNUSED(buft); } -static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) { +GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) { return GGML_CUDA_NAME "_Host"; UNUSED(buffer); } -static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { +GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_cuda_host_free(buffer->context); } -static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { void * ptr = ggml_cuda_host_malloc(size); if (ptr == nullptr) { @@ -10777,7 +10776,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm return buffer; } -ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { +GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { /* .iface = */ { /* .get_name = */ ggml_backend_cuda_host_buffer_type_name, @@ -10795,26 +10794,26 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { // backend -static const char * ggml_backend_cuda_name(ggml_backend_t backend) { +GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; return cuda_ctx->name.c_str(); } -static void ggml_backend_cuda_free(ggml_backend_t backend) { +GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; delete cuda_ctx; delete backend; } -static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; return 
ggml_backend_cuda_buffer_type(cuda_ctx->device); } -static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); @@ -10823,7 +10822,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0])); } -static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); @@ -10832,7 +10831,7 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0])); } -static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { +GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) { @@ -10843,7 +10842,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggm return false; } -static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { +GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0])); @@ -10851,7 +10850,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { UNUSED(backend); } -static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_cuda_set_main_device(cuda_ctx->device); @@ -10890,7 +10889,7 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph return true; } -static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { +GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -11016,7 +11015,7 @@ static ggml_backend_i ggml_backend_cuda_interface = { /* .supports_op = */ ggml_backend_cuda_supports_op, }; -ggml_backend_t ggml_backend_cuda_init(int device) { +GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) { ggml_init_cublas(); // TODO: remove from ggml.c if (device < 0 || device 
>= ggml_cuda_get_device_count()) { @@ -11040,35 +11039,35 @@ ggml_backend_t ggml_backend_cuda_init(int device) { return cuda_backend; } -bool ggml_backend_is_cuda(ggml_backend_t backend) { +GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) { return backend && backend->iface.get_name == ggml_backend_cuda_name; } -int ggml_backend_cuda_get_device_count() { +GGML_CALL int ggml_backend_cuda_get_device_count() { return ggml_cuda_get_device_count(); } -void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) { +GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) { ggml_cuda_get_device_description(device, description, description_size); } -void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) { +GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) { ggml_cuda_set_device(device); CUDA_CHECK(cudaMemGetInfo(free, total)); } // backend registry -static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) { +GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) { ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data); return cuda_backend; UNUSED(params); } -extern "C" int ggml_backend_cuda_reg_devices(); +extern "C" GGML_CALL int ggml_backend_cuda_reg_devices(); -int ggml_backend_cuda_reg_devices() { +GGML_CALL int ggml_backend_cuda_reg_devices() { int device_count = ggml_cuda_get_device_count(); //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization for (int i = 0; i < device_count; i++) { diff --git a/ggml-cuda.h b/ggml-cuda.h index d19cbf3fdd04b..b1ebd61d7fb66 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -18,34 +18,34 @@ extern "C" { #define GGML_CUDA_MAX_DEVICES 16 // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`. -GGML_API void ggml_init_cublas(void); +GGML_API GGML_CALL void ggml_init_cublas(void); // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`. 
-GGML_API bool ggml_cublas_loaded(void); +GGML_API GGML_CALL bool ggml_cublas_loaded(void); -GGML_API void * ggml_cuda_host_malloc(size_t size); -GGML_API void ggml_cuda_host_free(void * ptr); +GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size); +GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr); -GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); +GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); -GGML_API int ggml_cuda_get_device_count(void); -GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size); +GGML_API GGML_CALL int ggml_cuda_get_device_count(void); +GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size); // backend API -GGML_API ggml_backend_t ggml_backend_cuda_init(int device); +GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device); -GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend); +GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend); -GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); // split tensor buffer that splits matrices by rows across multiple devices -GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU -GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); -GGML_API int ggml_backend_cuda_get_device_count(void); -GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); -GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); +GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void); +GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); +GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); #ifdef __cplusplus } diff --git a/ggml-metal.h b/ggml-metal.h index cd5e2995f66f6..8b0bfc5f10329 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -47,11 +47,11 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void); GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); -GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size); +GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size); GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb); -GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); // helper to check if the device supports a specific family // ideally, the user code should be doing these 
checks diff --git a/ggml-metal.m b/ggml-metal.m index 2ca726055f9ea..867f2fd48cbd2 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -2294,13 +2294,13 @@ static void ggml_backend_metal_free_device(void) { } } -static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) { +GGML_CALL static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) { return "Metal"; UNUSED(buffer); } -static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { +GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; for (int i = 0; i < ctx->n_buffers; i++) { @@ -2315,25 +2315,25 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) free(ctx); } -static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { +GGML_CALL static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; return ctx->all_data; } -static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); UNUSED(buffer); } -static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { memcpy(data, (const char *)tensor->data + offset, size); UNUSED(buffer); } -static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { +GGML_CALL static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); return true; @@ -2343,7 +2343,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c UNUSED(buffer); } -static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +GGML_CALL static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; memset(ctx->all_data, value, ctx->all_size); @@ -2363,13 +2363,13 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_ // default buffer type -static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) { +GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "Metal"; UNUSED(buft); } -static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct 
ggml_backend_metal_buffer_context)); const size_t size_page = sysconf(_SC_PAGESIZE); @@ -2421,24 +2421,24 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); } -static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 32; UNUSED(buft); } -static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { +GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend); UNUSED(buft); } -static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { +GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; UNUSED(buft); } -ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { +GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = { /* .iface = */ { /* .get_name = */ ggml_backend_metal_buffer_type_get_name, @@ -2456,7 +2456,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { // buffer from ptr -ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { +GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); ctx->all_data = data; @@ -2543,31 +2543,31 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz // backend -static const char * ggml_backend_metal_name(ggml_backend_t backend) { +GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) { return "Metal"; UNUSED(backend); } -static void ggml_backend_metal_free(ggml_backend_t backend) { +GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) { struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; ggml_metal_free(ctx); free(backend); } -static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) { +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_metal_buffer_type(); UNUSED(backend); } -static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; return ggml_metal_graph_compute(metal_ctx, cgraph); } -static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { +GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; return ggml_metal_supports_op(metal_ctx, op); @@ -2630,9 +2630,9 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) { return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)]; } -ggml_backend_t ggml_backend_reg_metal_init(const 
char * params, void * user_data); // silence warning +GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning -ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) { +GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) { return ggml_backend_metal_init(); GGML_UNUSED(params); diff --git a/ggml.c b/ggml.c index ef5888ab21538..5779f32d297e3 100644 --- a/ggml.c +++ b/ggml.c @@ -1990,19 +1990,19 @@ void ggml_print_objects(const struct ggml_context * ctx) { GGML_PRINT("%s: --- end ---\n", __func__); } -int64_t ggml_nelements(const struct ggml_tensor * tensor) { +GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -int64_t ggml_nrows(const struct ggml_tensor * tensor) { +GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -size_t ggml_nbytes(const struct ggml_tensor * tensor) { +GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) { size_t nbytes; size_t blck_size = ggml_blck_size(tensor->type); if (blck_size == 1) { @@ -2025,15 +2025,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); } -int ggml_blck_size(enum ggml_type type) { +GGML_CALL int ggml_blck_size(enum ggml_type type) { return type_traits[type].blck_size; } -size_t ggml_type_size(enum ggml_type type) { +GGML_CALL size_t ggml_type_size(enum ggml_type type) { return type_traits[type].type_size; } -size_t ggml_row_size(enum ggml_type type, int64_t ne) { +GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) { assert(ne % ggml_blck_size(type) == 0); return ggml_type_size(type)*ne/ggml_blck_size(type); } @@ -2042,15 +2042,15 @@ double ggml_type_sizef(enum ggml_type type) { return ((double)(type_traits[type].type_size))/type_traits[type].blck_size; } -const char * ggml_type_name(enum ggml_type type) { +GGML_CALL const char * ggml_type_name(enum ggml_type type) { return type_traits[type].type_name; } -bool ggml_is_quantized(enum ggml_type type) { +GGML_CALL bool ggml_is_quantized(enum ggml_type type) { return type_traits[type].is_quantized; } -const char * ggml_op_name(enum ggml_op op) { +GGML_CALL const char * ggml_op_name(enum ggml_op op) { return GGML_OP_NAME[op]; } @@ -2062,7 +2062,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) { return GGML_UNARY_OP_NAME[op]; } -const char * ggml_op_desc(const struct ggml_tensor * t) { +GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) { if (t->op == GGML_OP_UNARY) { enum ggml_unary_op uop = ggml_get_unary_op(t); return ggml_unary_op_name(uop); @@ -2072,7 +2072,7 @@ const char * ggml_op_desc(const struct ggml_tensor * t) { } } -size_t ggml_element_size(const struct ggml_tensor * tensor) { +GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) { return ggml_type_size(tensor->type); } @@ -2154,11 +2154,11 @@ size_t ggml_tensor_overhead(void) { return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE; } -bool ggml_is_transposed(const struct ggml_tensor * tensor) { +GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } -bool ggml_is_contiguous(const struct ggml_tensor * tensor) { +GGML_CALL bool 
ggml_is_contiguous(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return @@ -2177,7 +2177,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -bool ggml_is_permuted(const struct ggml_tensor * tensor) { +GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3]; @@ -3079,7 +3079,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) { return (float *)(tensor->data); } -enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { +GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { GGML_ASSERT(tensor->op == GGML_OP_UNARY); return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0); } @@ -11653,7 +11653,7 @@ static void ggml_rope_cache_init( } } -void ggml_rope_yarn_corr_dims( +GGML_CALL void ggml_rope_yarn_corr_dims( int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] ) { // start and end correction dims diff --git a/ggml.h b/ggml.h index 1187074f7f174..837c52e68c90c 100644 --- a/ggml.h +++ b/ggml.h @@ -187,6 +187,16 @@ # define GGML_API #endif +#ifdef GGML_MULTIPLATFORM +# if defined(_WIN32) +# define GGML_CALL +# else +# define GGML_CALL __attribute__((__ms_abi__)) +# endif +#else +# define GGML_CALL +#endif + // TODO: support for clang #ifdef __GNUC__ # define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) @@ -649,41 +659,41 @@ extern "C" { GGML_API void ggml_print_object (const struct ggml_object * obj); GGML_API void ggml_print_objects(const struct ggml_context * ctx); - GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); - GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); - GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); - GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN + GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor); + GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor); + GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN - GGML_API int ggml_blck_size(enum ggml_type type); - GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block - GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row + GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type); + GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block + GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row GGML_DEPRECATED( GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float "use ggml_row_size() instead"); - GGML_API const char * ggml_type_name(enum ggml_type type); - GGML_API const char * ggml_op_name (enum ggml_op op); - GGML_API const char * ggml_op_symbol(enum ggml_op op); + GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type); + GGML_API GGML_CALL const char * ggml_op_name 
(enum ggml_op op); + GGML_API const char * ggml_op_symbol(enum ggml_op op); - GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op); - GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name + GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op); + GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name - GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); + GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor); - GGML_API bool ggml_is_quantized(enum ggml_type type); + GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type); // TODO: temporary until model loading of ggml examples is refactored GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); - GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); - GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); - GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); - GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars + GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor); + GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor); + GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); + GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); @@ -770,7 +780,7 @@ extern "C" { GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); - GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); + GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); @@ -1413,7 +1423,7 @@ extern "C" { float beta_slow); // compute correction dims for YaRN RoPE scaling - void ggml_rope_yarn_corr_dims( + GGML_CALL void ggml_rope_yarn_corr_dims( int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); // xPos RoPE, in-place, returns view(a) From 122ed4840cc6d209df6043e027f9f8a03aee01da Mon Sep 17 00:00:00 2001 From: Maximilian Winter Date: Tue, 16 Jan 2024 13:10:48 +0100 Subject: [PATCH 074/131] examples : fix and improv docs for the grammar generator (#4909) * Create pydantic-models-to-grammar.py * Added some comments for usage * Refactored Grammar Generator Added example and usage instruction. * Update pydantic_models_to_grammar.py * Update pydantic-models-to-grammar-examples.py * Renamed module and imported it. * Update pydantic-models-to-grammar.py * Renamed file and fixed grammar generator issue. * Fixed some issues and bugs of the grammar generator. 
Imporved Documentation * Update pydantic_models_to_grammar.py --- examples/pydantic_models_to_grammar.py | 835 +++++++++++++++---------- 1 file changed, 498 insertions(+), 337 deletions(-) diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py index 41b98fdc1fcb4..848c1c367d701 100644 --- a/examples/pydantic_models_to_grammar.py +++ b/examples/pydantic_models_to_grammar.py @@ -4,6 +4,7 @@ from inspect import isclass, getdoc from types import NoneType +from docstring_parser import parse from pydantic import BaseModel, create_model, Field from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias from enum import Enum @@ -25,9 +26,10 @@ class PydanticDataType(Enum): ENUM (str): Represents an enum data type. CUSTOM_CLASS (str): Represents a custom class data type. """ + STRING = "string" TRIPLE_QUOTED_STRING = "triple_quoted_string" - MARKDOWN_STRING = "markdown_string" + MARKDOWN_CODE_BLOCK = "markdown_code_block" BOOLEAN = "boolean" INTEGER = "integer" FLOAT = "float" @@ -78,10 +80,10 @@ def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str: def format_model_and_field_name(model_name: str) -> str: - parts = re.findall('[A-Z][^A-Z]*', model_name) + parts = re.findall("[A-Z][^A-Z]*", model_name) if not parts: # Check if the list is empty return model_name.lower().replace("_", "-") - return '-'.join(part.lower().replace("_", "-") for part in parts) + return "-".join(part.lower().replace("_", "-") for part in parts) def generate_list_rule(element_type): @@ -93,29 +95,31 @@ def generate_list_rule(element_type): """ rule_name = f"{map_pydantic_type_to_gbnf(element_type)}-list" element_rule = map_pydantic_type_to_gbnf(element_type) - list_rule = fr'{rule_name} ::= "[" {element_rule} ("," {element_rule})* "]"' + list_rule = rf'{rule_name} ::= "[" {element_rule} ("," {element_rule})* "]"' return list_rule def get_members_structure(cls, rule_name): if issubclass(cls, Enum): # Handle Enum types - members = [f'\"\\\"{member.value}\\\"\"' for name, member in cls.__members__.items()] + members = [f'"\\"{member.value}\\""' for name, member in cls.__members__.items()] return f"{cls.__name__.lower()} ::= " + " | ".join(members) if cls.__annotations__ and cls.__annotations__ != {}: result = f'{rule_name} ::= "{{"' type_list_rules = [] # Modify this comprehension - members = [f' \"\\\"{name}\\\"\" ":" {map_pydantic_type_to_gbnf(param_type)}' - for name, param_type in cls.__annotations__.items() - if name != 'self'] + members = [ + f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}' + for name, param_type in cls.__annotations__.items() + if name != "self" + ] result += '"," '.join(members) result += ' "}"' return result, type_list_rules elif rule_name == "custom-class-any": - result = f'{rule_name} ::= ' - result += 'value' + result = f"{rule_name} ::= " + result += "value" type_list_rules = [] return result, type_list_rules else: @@ -124,9 +128,11 @@ def get_members_structure(cls, rule_name): result = f'{rule_name} ::= "{{"' type_list_rules = [] # Modify this comprehension too - members = [f' \"\\\"{name}\\\"\" ":" {map_pydantic_type_to_gbnf(param.annotation)}' - for name, param in parameters.items() - if name != 'self' and param.annotation != inspect.Parameter.empty] + members = [ + f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param.annotation)}' + for name, param in parameters.items() + if name != "self" and param.annotation != inspect.Parameter.empty + ] result += '", "'.join(members) result += ' "}"' @@ 
-141,8 +147,8 @@ def regex_to_gbnf(regex_pattern: str) -> str: gbnf_rule = regex_pattern # Translate common regex components to GBNF - gbnf_rule = gbnf_rule.replace('\\d', '[0-9]') - gbnf_rule = gbnf_rule.replace('\\s', '[ \t\n]') + gbnf_rule = gbnf_rule.replace("\\d", "[0-9]") + gbnf_rule = gbnf_rule.replace("\\s", "[ \t\n]") # Handle quantifiers and other regex syntax that is similar in GBNF # (e.g., '*', '+', '?', character classes) @@ -158,12 +164,12 @@ def generate_gbnf_integer_rules(max_digit=None, min_digit=None): Generates GBNF (Generalized Backus-Naur Form) rules for integers based on the given maximum and minimum digits. Parameters: - max_digit (int): The maximum number of digits for the integer. Default is None. - min_digit (int): The minimum number of digits for the integer. Default is None. + max_digit (int): The maximum number of digits for the integer. Default is None. + min_digit (int): The minimum number of digits for the integer. Default is None. Returns: - integer_rule (str): The identifier for the integer rule generated. - additional_rules (list): A list of additional rules generated based on the given maximum and minimum digits. + integer_rule (str): The identifier for the integer rule generated. + additional_rules (list): A list of additional rules generated based on the given maximum and minimum digits. """ additional_rules = [] @@ -178,21 +184,21 @@ def generate_gbnf_integer_rules(max_digit=None, min_digit=None): # Handling Integer Rules if max_digit is not None or min_digit is not None: # Start with an empty rule part - integer_rule_part = '' + integer_rule_part = "" # Add mandatory digits as per min_digit if min_digit is not None: - integer_rule_part += '[0-9] ' * min_digit + integer_rule_part += "[0-9] " * min_digit # Add optional digits up to max_digit if max_digit is not None: optional_digits = max_digit - (min_digit if min_digit is not None else 0) - integer_rule_part += ''.join(['[0-9]? ' for _ in range(optional_digits)]) + integer_rule_part += "".join(["[0-9]? 
" for _ in range(optional_digits)]) # Trim the rule part and append it to additional rules integer_rule_part = integer_rule_part.strip() if integer_rule_part: - additional_rules.append(f'{integer_rule} ::= {integer_rule_part}') + additional_rules.append(f"{integer_rule} ::= {integer_rule_part}") return integer_rule, additional_rules @@ -224,21 +230,26 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None additional_rules = [] # Define the integer part rule - integer_part_rule = "integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + ( + integer_part_rule = ( + "integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + ( f"-min{min_digit}" if min_digit is not None else "") + ) # Define the fractional part rule based on precision constraints fractional_part_rule = "fractional-part" - fractional_rule_part = '' + fractional_rule_part = "" if max_precision is not None or min_precision is not None: fractional_part_rule += (f"-max{max_precision}" if max_precision is not None else "") + ( - f"-min{min_precision}" if min_precision is not None else "") + f"-min{min_precision}" if min_precision is not None else "" + ) # Minimum number of digits - fractional_rule_part = '[0-9]' * (min_precision if min_precision is not None else 1) + fractional_rule_part = "[0-9]" * (min_precision if min_precision is not None else 1) # Optional additional digits - fractional_rule_part += ''.join([' [0-9]?'] * ( - (max_precision - (min_precision if min_precision is not None else 1)) if max_precision is not None else 0)) - additional_rules.append(f'{fractional_part_rule} ::= {fractional_rule_part}') + fractional_rule_part += "".join( + [" [0-9]?"] * ((max_precision - ( + min_precision if min_precision is not None else 1)) if max_precision is not None else 0) + ) + additional_rules.append(f"{fractional_part_rule} ::= {fractional_rule_part}") # Define the float rule float_rule = f"float-{max_digit if max_digit is not None else 'X'}-{min_digit if min_digit is not None else 'X'}-{max_precision if max_precision is not None else 'X'}-{min_precision if min_precision is not None else 'X'}" @@ -246,20 +257,19 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None # Generating the integer part rule definition, if necessary if max_digit is not None or min_digit is not None: - integer_rule_part = '[0-9]' + integer_rule_part = "[0-9]" if min_digit is not None and min_digit > 1: - integer_rule_part += ' [0-9]' * (min_digit - 1) + integer_rule_part += " [0-9]" * (min_digit - 1) if max_digit is not None: - integer_rule_part += ''.join([' [0-9]?'] * (max_digit - (min_digit if min_digit is not None else 1))) - additional_rules.append(f'{integer_part_rule} ::= {integer_rule_part.strip()}') + integer_rule_part += "".join([" [0-9]?"] * (max_digit - (min_digit if min_digit is not None else 1))) + additional_rules.append(f"{integer_part_rule} ::= {integer_rule_part.strip()}") return float_rule, additional_rules -def generate_gbnf_rule_for_type(model_name, field_name, - field_type, is_optional, processed_models, created_rules, - field_info=None) -> \ - Tuple[str, list]: +def generate_gbnf_rule_for_type( + model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None +) -> Tuple[str, list]: """ Generate GBNF rule for a given field type. 
@@ -282,20 +292,19 @@ def generate_gbnf_rule_for_type(model_name, field_name, if isclass(field_type) and issubclass(field_type, BaseModel): nested_model_name = format_model_and_field_name(field_type.__name__) - nested_model_rules = generate_gbnf_grammar(field_type, processed_models, created_rules) + nested_model_rules, _ = generate_gbnf_grammar(field_type, processed_models, created_rules) rules.extend(nested_model_rules) gbnf_type, rules = nested_model_name, rules elif isclass(field_type) and issubclass(field_type, Enum): - enum_values = [f'\"\\\"{e.value}\\\"\"' for e in field_type] # Adding escaped quotes + enum_values = [f'"\\"{e.value}\\""' for e in field_type] # Adding escaped quotes enum_rule = f"{model_name}-{field_name} ::= {' | '.join(enum_values)}" rules.append(enum_rule) gbnf_type, rules = model_name + "-" + field_name, rules - elif get_origin(field_type) == list or field_type == list: # Array + elif get_origin(field_type) == list: # Array element_type = get_args(field_type)[0] - element_rule_name, additional_rules = generate_gbnf_rule_for_type(model_name, - f"{field_name}-element", - element_type, is_optional, processed_models, - created_rules) + element_rule_name, additional_rules = generate_gbnf_rule_for_type( + model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules + ) rules.extend(additional_rules) array_rule = f"""{model_name}-{field_name} ::= "[" ws {element_rule_name} ("," ws {element_rule_name})* "]" """ rules.append(array_rule) @@ -303,10 +312,9 @@ def generate_gbnf_rule_for_type(model_name, field_name, elif get_origin(field_type) == set or field_type == set: # Array element_type = get_args(field_type)[0] - element_rule_name, additional_rules = generate_gbnf_rule_for_type(model_name, - f"{field_name}-element", - element_type, is_optional, processed_models, - created_rules) + element_rule_name, additional_rules = generate_gbnf_rule_for_type( + model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules + ) rules.extend(additional_rules) array_rule = f"""{model_name}-{field_name} ::= "[" ws {element_rule_name} ("," ws {element_rule_name})* "]" """ rules.append(array_rule) @@ -318,15 +326,13 @@ def generate_gbnf_rule_for_type(model_name, field_name, elif gbnf_type.startswith("custom-dict-"): key_type, value_type = get_args(field_type) - additional_key_type, additional_key_rules = generate_gbnf_rule_for_type(model_name, - f"{field_name}-key-type", - key_type, is_optional, processed_models, - created_rules) - additional_value_type, additional_value_rules = generate_gbnf_rule_for_type(model_name, - f"{field_name}-value-type", - value_type, is_optional, - processed_models, created_rules) - gbnf_type = fr'{gbnf_type} ::= "{{" ( {additional_key_type} ":" {additional_value_type} ("," {additional_key_type} ":" {additional_value_type})* )? "}}" ' + additional_key_type, additional_key_rules = generate_gbnf_rule_for_type( + model_name, f"{field_name}-key-type", key_type, is_optional, processed_models, created_rules + ) + additional_value_type, additional_value_rules = generate_gbnf_rule_for_type( + model_name, f"{field_name}-value-type", value_type, is_optional, processed_models, created_rules + ) + gbnf_type = rf'{gbnf_type} ::= "{{" ( {additional_key_type} ": " {additional_value_type} ("," "\n" ws {additional_key_type} ":" {additional_value_type})* )? 
"}}" ' rules.extend(additional_key_rules) rules.extend(additional_value_rules) @@ -336,19 +342,16 @@ def generate_gbnf_rule_for_type(model_name, field_name, for union_type in union_types: if isinstance(union_type, _GenericAlias): - union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(model_name, - field_name, union_type, - False, - processed_models, created_rules) + union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type( + model_name, field_name, union_type, False, processed_models, created_rules + ) union_rules.append(union_gbnf_type) rules.extend(union_rules_list) - elif not issubclass(union_type, NoneType): - union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(model_name, - field_name, union_type, - False, - processed_models, created_rules) + union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type( + model_name, field_name, union_type, False, processed_models, created_rules + ) union_rules.append(union_gbnf_type) rules.extend(union_rules_list) @@ -363,45 +366,58 @@ def generate_gbnf_rule_for_type(model_name, field_name, else: gbnf_type = f"{model_name}-{field_name}-union" elif isclass(field_type) and issubclass(field_type, str): - if field_info and hasattr(field_info, 'json_schema_extra') and field_info.json_schema_extra is not None: - - triple_quoted_string = field_info.json_schema_extra.get('triple_quoted_string', False) - markdown_string = field_info.json_schema_extra.get('markdown_string', False) + if field_info and hasattr(field_info, "json_schema_extra") and field_info.json_schema_extra is not None: + triple_quoted_string = field_info.json_schema_extra.get("triple_quoted_string", False) + markdown_string = field_info.json_schema_extra.get("markdown_code_block", False) gbnf_type = PydanticDataType.TRIPLE_QUOTED_STRING.value if triple_quoted_string else PydanticDataType.STRING.value - gbnf_type = PydanticDataType.MARKDOWN_STRING.value if markdown_string else gbnf_type + gbnf_type = PydanticDataType.MARKDOWN_CODE_BLOCK.value if markdown_string else gbnf_type - elif field_info and hasattr(field_info, 'pattern'): + elif field_info and hasattr(field_info, "pattern"): # Convert regex pattern to grammar rule regex_pattern = field_info.regex.pattern gbnf_type = f"pattern-{field_name} ::= {regex_to_gbnf(regex_pattern)}" else: gbnf_type = PydanticDataType.STRING.value - elif isclass(field_type) and issubclass(field_type, float) and field_info and hasattr(field_info, - 'json_schema_extra') and field_info.json_schema_extra is not None: + elif ( + isclass(field_type) + and issubclass(field_type, float) + and field_info + and hasattr(field_info, "json_schema_extra") + and field_info.json_schema_extra is not None + ): # Retrieve precision attributes for floats - max_precision = field_info.json_schema_extra.get('max_precision') if field_info and hasattr(field_info, - 'json_schema_extra') else None - min_precision = field_info.json_schema_extra.get('min_precision') if field_info and hasattr(field_info, - 'json_schema_extra') else None - max_digits = field_info.json_schema_extra.get('max_digit') if field_info and hasattr(field_info, - 'json_schema_extra') else None - min_digits = field_info.json_schema_extra.get('min_digit') if field_info and hasattr(field_info, - 'json_schema_extra') else None + max_precision = ( + field_info.json_schema_extra.get("max_precision") if field_info and hasattr(field_info, + "json_schema_extra") else None + ) + min_precision = ( + field_info.json_schema_extra.get("min_precision") if field_info and hasattr(field_info, + 
"json_schema_extra") else None + ) + max_digits = field_info.json_schema_extra.get("max_digit") if field_info and hasattr(field_info, + "json_schema_extra") else None + min_digits = field_info.json_schema_extra.get("min_digit") if field_info and hasattr(field_info, + "json_schema_extra") else None # Generate GBNF rule for float with given attributes - gbnf_type, rules = generate_gbnf_float_rules(max_digit=max_digits, min_digit=min_digits, - max_precision=max_precision, - min_precision=min_precision) - - elif isclass(field_type) and issubclass(field_type, int) and field_info and hasattr(field_info, - 'json_schema_extra') and field_info.json_schema_extra is not None: + gbnf_type, rules = generate_gbnf_float_rules( + max_digit=max_digits, min_digit=min_digits, max_precision=max_precision, min_precision=min_precision + ) + + elif ( + isclass(field_type) + and issubclass(field_type, int) + and field_info + and hasattr(field_info, "json_schema_extra") + and field_info.json_schema_extra is not None + ): # Retrieve digit attributes for integers - max_digits = field_info.json_schema_extra.get('max_digit') if field_info and hasattr(field_info, - 'json_schema_extra') else None - min_digits = field_info.json_schema_extra.get('min_digit') if field_info and hasattr(field_info, - 'json_schema_extra') else None + max_digits = field_info.json_schema_extra.get("max_digit") if field_info and hasattr(field_info, + "json_schema_extra") else None + min_digits = field_info.json_schema_extra.get("min_digit") if field_info and hasattr(field_info, + "json_schema_extra") else None # Generate GBNF rule for integer with given attributes gbnf_type, rules = generate_gbnf_integer_rules(max_digit=max_digits, min_digit=min_digits) @@ -443,13 +459,13 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created if not issubclass(model, BaseModel): # For non-Pydantic classes, generate model_fields from __annotations__ or __init__ - if hasattr(model, '__annotations__') and model.__annotations__: + if hasattr(model, "__annotations__") and model.__annotations__: model_fields = {name: (typ, ...) 
for name, typ in model.__annotations__.items()} else: init_signature = inspect.signature(model.__init__) parameters = init_signature.parameters - model_fields = {name: (param.annotation, param.default) for name, param in parameters.items() - if name != 'self'} + model_fields = {name: (param.annotation, param.default) for name, param in parameters.items() if + name != "self"} else: # For Pydantic models, use model_fields and check for ellipsis (required fields) model_fields = model.__annotations__ @@ -469,51 +485,55 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created field_type = field_info field_info = model.model_fields[field_name] is_optional = field_info.is_required is False and get_origin(field_type) is Optional - rule_name, additional_rules = generate_gbnf_rule_for_type(model_name, - format_model_and_field_name(field_name), - field_type, is_optional, - processed_models, created_rules, field_info) - look_for_markdown_code_block = True if rule_name == "markdown_string" else False + rule_name, additional_rules = generate_gbnf_rule_for_type( + model_name, format_model_and_field_name(field_name), field_type, is_optional, processed_models, + created_rules, field_info + ) + look_for_markdown_code_block = True if rule_name == "markdown_code_block" else False look_for_triple_quoted_string = True if rule_name == "triple_quoted_string" else False if not look_for_markdown_code_block and not look_for_triple_quoted_string: if rule_name not in created_rules: created_rules[rule_name] = additional_rules - model_rule_parts.append(f' ws \"\\\"{field_name}\\\"\" ": " {rule_name}') # Adding escaped quotes + model_rule_parts.append(f' ws "\\"{field_name}\\"" ":" ws {rule_name}') # Adding escaped quotes nested_rules.extend(additional_rules) else: - has_triple_quoted_string = look_for_markdown_code_block - has_markdown_code_block = look_for_triple_quoted_string + has_triple_quoted_string = look_for_triple_quoted_string + has_markdown_code_block = look_for_markdown_code_block fields_joined = r' "," "\n" '.join(model_rule_parts) - model_rule = fr'{model_name} ::= "{{" "\n" {fields_joined} "\n" ws "}}"' - - if look_for_markdown_code_block or look_for_triple_quoted_string: - model_rule += ' ws "}"' + model_rule = rf'{model_name} ::= "{{" "\n" {fields_joined} "\n" ws "}}"' + has_special_string = False if has_triple_quoted_string: + model_rule += '"\\n" ws "}"' model_rule += '"\\n" triple-quoted-string' + has_special_string = True if has_markdown_code_block: + model_rule += '"\\n" ws "}"' model_rule += '"\\n" markdown-code-block' + has_special_string = True all_rules = [model_rule] + nested_rules - return all_rules, has_markdown_code_block, has_triple_quoted_string + return all_rules, has_special_string -def generate_gbnf_grammar_from_pydantic_models(models: List[Type[BaseModel]], outer_object_name: str = None, - outer_object_content: str = None, list_of_outputs: bool = False) -> str: +def generate_gbnf_grammar_from_pydantic_models( + models: List[Type[BaseModel]], outer_object_name: str = None, outer_object_content: str = None, + list_of_outputs: bool = False +) -> str: """ Generate GBNF Grammar from Pydantic Models. This method takes a list of Pydantic models and uses them to generate a GBNF grammar string. The generated grammar string can be used for parsing and validating data using the generated * grammar. - Parameters: - models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from. - outer_object_name (str): Outer object name for the GBNF grammar. 
If None, no outer object will be generated. Eg. "function" for function calling. - outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. - list_of_outputs (str, optional): Allows a list of output objects + Args: + models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from. + outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. + outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. + list_of_outputs (str, optional): Allows a list of output objects Returns: - str: The generated GBNF grammar string. + str: The generated GBNF grammar string. Examples: models = [UserModel, PostModel] @@ -527,52 +547,53 @@ def generate_gbnf_grammar_from_pydantic_models(models: List[Type[BaseModel]], ou all_rules = [] created_rules = {} if outer_object_name is None: - for model in models: - model_rules, _, _ = generate_gbnf_grammar(model, - processed_models, created_rules) + model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules) all_rules.extend(model_rules) if list_of_outputs: - root_rule = r'root ::= ws "[" grammar-models ("," grammar-models)* "]"' + "\n" + root_rule = r'root ::= (" "| "\n") "[" ws grammar-models ("," ws grammar-models)* ws "]"' + "\n" else: - root_rule = r'root ::= ws grammar-models' + "\n" + root_rule = r'root ::= (" "| "\n") grammar-models' + "\n" root_rule += "grammar-models ::= " + " | ".join( [format_model_and_field_name(model.__name__) for model in models]) all_rules.insert(0, root_rule) return "\n".join(all_rules) elif outer_object_name is not None: if list_of_outputs: - root_rule = fr'root ::= ws "[" {format_model_and_field_name(outer_object_name)} ("," {format_model_and_field_name(outer_object_name)})* "]"' + "\n" + root_rule = ( + rf'root ::= (" "| "\n") "[" ws {format_model_and_field_name(outer_object_name)} ("," ws {format_model_and_field_name(outer_object_name)})* ws "]"' + + "\n" + ) else: root_rule = f"root ::= {format_model_and_field_name(outer_object_name)}\n" - model_rule = fr'{format_model_and_field_name(outer_object_name)} ::= ws "{{" ws "\"{outer_object_name}\"" ": " grammar-models' + model_rule = ( + rf'{format_model_and_field_name(outer_object_name)} ::= (" "| "\n") "{{" ws "\"{outer_object_name}\"" ":" ws grammar-models' + ) fields_joined = " | ".join( - [fr'{format_model_and_field_name(model.__name__)}-grammar-model' for model in models]) + [rf"{format_model_and_field_name(model.__name__)}-grammar-model" for model in models]) - grammar_model_rules = f'\ngrammar-models ::= {fields_joined}' + grammar_model_rules = f"\ngrammar-models ::= {fields_joined}" mod_rules = [] for model in models: - mod_rule = fr'{format_model_and_field_name(model.__name__)}-grammar-model ::= ws' - mod_rule += fr'"\"{format_model_and_field_name(model.__name__)}\"" "," ws "\"{outer_object_content}\"" ws ":" ws {format_model_and_field_name(model.__name__)}' + '\n' + mod_rule = rf"{format_model_and_field_name(model.__name__)}-grammar-model ::= " + mod_rule += ( + rf'"\"{model.__name__}\"" "," ws "\"{outer_object_content}\"" ":" ws {format_model_and_field_name(model.__name__)}' + "\n" + ) mod_rules.append(mod_rule) grammar_model_rules += "\n" + "\n".join(mod_rules) - look_for_markdown_code_block = False - look_for_triple_quoted_string = False + for model in models: - model_rules, markdown_block, 
triple_quoted_string = generate_gbnf_grammar(model, - processed_models, created_rules) - all_rules.extend(model_rules) - if markdown_block: - look_for_markdown_code_block = True + model_rules, has_special_string = generate_gbnf_grammar(model, processed_models, + created_rules) - if triple_quoted_string: - look_for_triple_quoted_string = True + if not has_special_string: + model_rules[0] += r'"\n" ws "}"' + + all_rules.extend(model_rules) - if not look_for_markdown_code_block and not look_for_triple_quoted_string: - model_rule += ' ws "}"' all_rules.insert(0, root_rule + model_rule + grammar_model_rules) return "\n".join(all_rules) @@ -582,10 +603,10 @@ def get_primitive_grammar(grammar): Returns the needed GBNF primitive grammar for a given GBNF grammar string. Args: - grammar (str): The string containing the GBNF grammar. + grammar (str): The string containing the GBNF grammar. Returns: - str: GBNF primitive grammar string. + str: GBNF primitive grammar string. """ type_list = [] if "string-list" in grammar: @@ -611,7 +632,7 @@ def get_primitive_grammar(grammar): any_block = "" if "custom-class-any" in grammar: - any_block = ''' + any_block = """ value ::= object | array | string | number | boolean | null object ::= @@ -626,7 +647,7 @@ def get_primitive_grammar(grammar): ("," ws value)* )? "]" ws -number ::= integer | float''' +number ::= integer | float""" markdown_code_block_grammar = "" if "markdown-code-block" in grammar: @@ -641,47 +662,140 @@ def get_primitive_grammar(grammar): triple-quoted-string ::= triple-quotes triple-quoted-string-content triple-quotes triple-quoted-string-content ::= ( [^'] | "'" [^'] | "'" "'" [^'] )* triple-quotes ::= "'''" """ - return "\n" + '\n'.join(additional_grammar) + any_block + primitive_grammar + markdown_code_block_grammar - + return "\n" + "\n".join(additional_grammar) + any_block + primitive_grammar + markdown_code_block_grammar -def generate_field_markdown(field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1) -> str: - indent = ' ' * depth - field_markdown = f"{indent}- **{field_name}** (`{field_type.__name__}`): " - - # Extracting field description from Pydantic Field using __model_fields__ - field_info = model.model_fields.get(field_name) - field_description = field_info.description if field_info and field_info.description else "No description available." - field_markdown += field_description + '\n' - - # Handling nested BaseModel fields - if isclass(field_type) and issubclass(field_type, BaseModel): - field_markdown += f"{indent} - Details:\n" - for name, type_ in field_type.__annotations__.items(): - field_markdown += generate_field_markdown(name, type_, field_type, depth + 2) +def generate_markdown_documentation( + pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields", + documentation_with_field_description=True +) -> str: + """ + Generate markdown documentation for a list of Pydantic models. - return field_markdown + Args: + pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes. + model_prefix (str): Prefix for the model section. + fields_prefix (str): Prefix for the fields section. + documentation_with_field_description (bool): Include field descriptions in the documentation. + Returns: + str: Generated text documentation. 
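As a usage illustration for the `generate_markdown_documentation` helper added here, a hedged sketch follows. The `Book` model, its fields, and the import path are assumptions made for the example; the keyword arguments follow the signature shown in this hunk.

```python
from pydantic import BaseModel, Field

# assumed import path; the helper lives in the pydantic-to-GBNF example module
from pydantic_models_to_grammar import generate_markdown_documentation


class Book(BaseModel):
    """A single book entry."""
    title: str = Field(..., description="Title of the book.")
    pages: int = Field(..., description="Number of pages.")


# model_prefix / fields_prefix control the section labels of the generated text
docs = generate_markdown_documentation(
    [Book], model_prefix="Output Model", fields_prefix="Output Fields"
)
print(docs)
```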
+ """ + documentation = "" + pyd_models = [(model, True) for model in pydantic_models] + for model, add_prefix in pyd_models: + if add_prefix: + documentation += f"{model_prefix}: {model.__name__}\n" + else: + documentation += f"Model: {model.__name__}\n" -def generate_markdown_report(pydantic_models: List[Type[BaseModel]]) -> str: - markdown = "" - for model in pydantic_models: - markdown += f"### {format_model_and_field_name(model.__name__)}\n" + # Handling multi-line model description with proper indentation - # Check if the model's docstring is different from BaseModel's docstring class_doc = getdoc(model) base_class_doc = getdoc(BaseModel) - class_description = class_doc if class_doc and class_doc != base_class_doc else "No specific description available." - - markdown += f"{class_description}\n\n" - markdown += "#### Fields\n" + class_description = class_doc if class_doc and class_doc != base_class_doc else "" + if class_description != "": + documentation += " Description: " + documentation += format_multiline_description(class_description, 0) + "\n" + if add_prefix: + # Indenting the fields section + documentation += f" {fields_prefix}:\n" + else: + documentation += f" Fields:\n" if isclass(model) and issubclass(model, BaseModel): for name, field_type in model.__annotations__.items(): - markdown += generate_field_markdown(format_model_and_field_name(name), field_type, model) - markdown += "\n" + # if name == "markdown_code_block": + # continue + if get_origin(field_type) == list: + element_type = get_args(field_type)[0] + if isclass(element_type) and issubclass(element_type, BaseModel): + pyd_models.append((element_type, False)) + if get_origin(field_type) == Union: + element_types = get_args(field_type) + for element_type in element_types: + if isclass(element_type) and issubclass(element_type, BaseModel): + pyd_models.append((element_type, False)) + documentation += generate_field_markdown( + name, field_type, model, documentation_with_field_description=documentation_with_field_description + ) + documentation += "\n" + + if hasattr(model, "Config") and hasattr(model.Config, + "json_schema_extra") and "example" in model.Config.json_schema_extra: + documentation += f" Expected Example Output for {format_model_and_field_name(model.__name__)}:\n" + json_example = json.dumps(model.Config.json_schema_extra["example"]) + documentation += format_multiline_description(json_example, 2) + "\n" - return markdown + return documentation + + +def generate_field_markdown( + field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1, + documentation_with_field_description=True +) -> str: + """ + Generate markdown documentation for a Pydantic model field. + + Args: + field_name (str): Name of the field. + field_type (Type[Any]): Type of the field. + model (Type[BaseModel]): Pydantic model class. + depth (int): Indentation depth in the documentation. + documentation_with_field_description (bool): Include field descriptions in the documentation. + + Returns: + str: Generated text documentation for the field. 
+ """ + indent = " " * depth + + field_info = model.model_fields.get(field_name) + field_description = field_info.description if field_info and field_info.description else "" + + if get_origin(field_type) == list: + element_type = get_args(field_type)[0] + field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)} of {format_model_and_field_name(element_type.__name__)})" + if field_description != "": + field_text += ":\n" + else: + field_text += "\n" + elif get_origin(field_type) == Union: + element_types = get_args(field_type) + types = [] + for element_type in element_types: + types.append(format_model_and_field_name(element_type.__name__)) + field_text = f"{indent}{field_name} ({' or '.join(types)})" + if field_description != "": + field_text += ":\n" + else: + field_text += "\n" + else: + field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)})" + if field_description != "": + field_text += ":\n" + else: + field_text += "\n" + + if not documentation_with_field_description: + return field_text + + if field_description != "": + field_text += f" Description: " + field_description + "\n" + + # Check for and include field-specific examples if available + if hasattr(model, "Config") and hasattr(model.Config, + "json_schema_extra") and "example" in model.Config.json_schema_extra: + field_example = model.Config.json_schema_extra["example"].get(field_name) + if field_example is not None: + example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example + field_text += f"{indent} Example: {example_text}\n" + + if isclass(field_type) and issubclass(field_type, BaseModel): + field_text += f"{indent} Details:\n" + for name, type_ in field_type.__annotations__.items(): + field_text += generate_field_markdown(name, type_, field_type, depth + 2) + + return field_text def format_json_example(example: dict, depth: int) -> str: @@ -689,42 +803,44 @@ def format_json_example(example: dict, depth: int) -> str: Format a JSON example into a readable string with indentation. Args: - example (dict): JSON example to be formatted. - depth (int): Indentation depth. + example (dict): JSON example to be formatted. + depth (int): Indentation depth. Returns: - str: Formatted JSON example string. + str: Formatted JSON example string. """ - indent = ' ' * depth - formatted_example = '{\n' + indent = " " * depth + formatted_example = "{\n" for key, value in example.items(): value_text = f"'{value}'" if isinstance(value, str) else value formatted_example += f"{indent}{key}: {value_text},\n" - formatted_example = formatted_example.rstrip(',\n') + '\n' + indent + '}' + formatted_example = formatted_example.rstrip(",\n") + "\n" + indent + "}" return formatted_example -def generate_text_documentation(pydantic_models: List[Type[BaseModel]], model_prefix="Model", - fields_prefix="Fields", documentation_with_field_description=True) -> str: +def generate_text_documentation( + pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields", + documentation_with_field_description=True +) -> str: """ Generate text documentation for a list of Pydantic models. Args: - pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes. - model_prefix (str): Prefix for the model section. - fields_prefix (str): Prefix for the fields section. - documentation_with_field_description (bool): Include field descriptions in the documentation. + pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes. 
+ model_prefix (str): Prefix for the model section. + fields_prefix (str): Prefix for the fields section. + documentation_with_field_description (bool): Include field descriptions in the documentation. Returns: - str: Generated text documentation. + str: Generated text documentation. """ documentation = "" pyd_models = [(model, True) for model in pydantic_models] for model, add_prefix in pyd_models: if add_prefix: - documentation += f"{model_prefix}: {format_model_and_field_name(model.__name__)}\n" + documentation += f"{model_prefix}: {model.__name__}\n" else: - documentation += f"Model: {format_model_and_field_name(model.__name__)}\n" + documentation += f"Model: {model.__name__}\n" # Handling multi-line model description with proper indentation @@ -735,12 +851,8 @@ def generate_text_documentation(pydantic_models: List[Type[BaseModel]], model_pr documentation += " Description: " documentation += "\n" + format_multiline_description(class_description, 2) + "\n" - if add_prefix: - # Indenting the fields section - documentation += f" {fields_prefix}:\n" - else: - documentation += f" Fields:\n" if isclass(model) and issubclass(model, BaseModel): + documentation_fields = "" for name, field_type in model.__annotations__.items(): # if name == "markdown_code_block": # continue @@ -753,35 +865,43 @@ def generate_text_documentation(pydantic_models: List[Type[BaseModel]], model_pr for element_type in element_types: if isclass(element_type) and issubclass(element_type, BaseModel): pyd_models.append((element_type, False)) - documentation += generate_field_text(name, field_type, model, - documentation_with_field_description=documentation_with_field_description) + documentation_fields += generate_field_text( + name, field_type, model, documentation_with_field_description=documentation_with_field_description + ) + if documentation_fields != "": + if add_prefix: + documentation += f" {fields_prefix}:\n{documentation_fields}" + else: + documentation += f" Fields:\n{documentation_fields}" documentation += "\n" - if hasattr(model, 'Config') and hasattr(model.Config, - 'json_schema_extra') and 'example' in model.Config.json_schema_extra: + if hasattr(model, "Config") and hasattr(model.Config, + "json_schema_extra") and "example" in model.Config.json_schema_extra: documentation += f" Expected Example Output for {format_model_and_field_name(model.__name__)}:\n" - json_example = json.dumps(model.Config.json_schema_extra['example']) + json_example = json.dumps(model.Config.json_schema_extra["example"]) documentation += format_multiline_description(json_example, 2) + "\n" return documentation -def generate_field_text(field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1, - documentation_with_field_description=True) -> str: +def generate_field_text( + field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1, + documentation_with_field_description=True +) -> str: """ Generate text documentation for a Pydantic model field. Args: - field_name (str): Name of the field. - field_type (Type[Any]): Type of the field. - model (Type[BaseModel]): Pydantic model class. - depth (int): Indentation depth in the documentation. - documentation_with_field_description (bool): Include field descriptions in the documentation. + field_name (str): Name of the field. + field_type (Type[Any]): Type of the field. + model (Type[BaseModel]): Pydantic model class. + depth (int): Indentation depth in the documentation. + documentation_with_field_description (bool): Include field descriptions in the documentation. 
Returns: - str: Generated text documentation for the field. + str: Generated text documentation for the field. """ - indent = ' ' * depth + indent = " " * depth field_info = model.model_fields.get(field_name) field_description = field_info.description if field_info and field_info.description else "" @@ -817,9 +937,9 @@ def generate_field_text(field_name: str, field_type: Type[Any], model: Type[Base field_text += f"{indent} Description: " + field_description + "\n" # Check for and include field-specific examples if available - if hasattr(model, 'Config') and hasattr(model.Config, - 'json_schema_extra') and 'example' in model.Config.json_schema_extra: - field_example = model.Config.json_schema_extra['example'].get(field_name) + if hasattr(model, "Config") and hasattr(model.Config, + "json_schema_extra") and "example" in model.Config.json_schema_extra: + field_example = model.Config.json_schema_extra["example"].get(field_name) if field_example is not None: example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example field_text += f"{indent} Example: {example_text}\n" @@ -837,39 +957,40 @@ def format_multiline_description(description: str, indent_level: int) -> str: Format a multiline description with proper indentation. Args: - description (str): Multiline description. - indent_level (int): Indentation level. + description (str): Multiline description. + indent_level (int): Indentation level. Returns: - str: Formatted multiline description. + str: Formatted multiline description. """ - indent = ' ' * indent_level - return indent + description.replace('\n', '\n' + indent) + indent = " " * indent_level + return indent + description.replace("\n", "\n" + indent) -def save_gbnf_grammar_and_documentation(grammar, documentation, grammar_file_path="./grammar.gbnf", - documentation_file_path="./grammar_documentation.md"): +def save_gbnf_grammar_and_documentation( + grammar, documentation, grammar_file_path="./grammar.gbnf", documentation_file_path="./grammar_documentation.md" +): """ Save GBNF grammar and documentation to specified files. Args: - grammar (str): GBNF grammar string. - documentation (str): Documentation string. - grammar_file_path (str): File path to save the GBNF grammar. - documentation_file_path (str): File path to save the documentation. + grammar (str): GBNF grammar string. + documentation (str): Documentation string. + grammar_file_path (str): File path to save the GBNF grammar. + documentation_file_path (str): File path to save the documentation. Returns: - None + None """ try: - with open(grammar_file_path, 'w') as file: + with open(grammar_file_path, "w") as file: file.write(grammar + get_primitive_grammar(grammar)) print(f"Grammar successfully saved to {grammar_file_path}") except IOError as e: print(f"An error occurred while saving the grammar file: {e}") try: - with open(documentation_file_path, 'w') as file: + with open(documentation_file_path, "w") as file: file.write(documentation) print(f"Documentation successfully saved to {documentation_file_path}") except IOError as e: @@ -881,10 +1002,10 @@ def remove_empty_lines(string): Remove empty lines from a string. Args: - string (str): Input string. + string (str): Input string. Returns: - str: String with empty lines removed. + str: String with empty lines removed. 
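For context, a hedged sketch of how the reformatted `save_gbnf_grammar_and_documentation` above can be exercised. The grammar and documentation strings would normally come from the generators earlier in this file; here they are placeholders, and the import path is an assumption.

```python
# assumed import path for the example module
from pydantic_models_to_grammar import save_gbnf_grammar_and_documentation

grammar = 'root ::= "hello" | "world"'          # placeholder grammar text
documentation = "Model: Greeting\n  Fields:\n"  # placeholder documentation text

# writes the grammar plus its primitive rules to one file and the documentation to another
save_gbnf_grammar_and_documentation(
    grammar,
    documentation,
    grammar_file_path="./grammar.gbnf",
    documentation_file_path="./grammar_documentation.md",
)
```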
""" lines = string.splitlines() non_empty_lines = [line for line in lines if line.strip() != ""] @@ -892,95 +1013,109 @@ def remove_empty_lines(string): return string_no_empty_lines -def generate_and_save_gbnf_grammar_and_documentation(pydantic_model_list, - grammar_file_path="./generated_grammar.gbnf", - documentation_file_path="./generated_grammar_documentation.md", - outer_object_name: str = None, - outer_object_content: str = None, - model_prefix: str = "Output Model", - fields_prefix: str = "Output Fields", - list_of_outputs: bool = False, - documentation_with_field_description=True): +def generate_and_save_gbnf_grammar_and_documentation( + pydantic_model_list, + grammar_file_path="./generated_grammar.gbnf", + documentation_file_path="./generated_grammar_documentation.md", + outer_object_name: str = None, + outer_object_content: str = None, + model_prefix: str = "Output Model", + fields_prefix: str = "Output Fields", + list_of_outputs: bool = False, + documentation_with_field_description=True, +): """ Generate GBNF grammar and documentation, and save them to specified files. Args: - pydantic_model_list: List of Pydantic model classes. - grammar_file_path (str): File path to save the generated GBNF grammar. - documentation_file_path (str): File path to save the generated documentation. - outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. - outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. - model_prefix (str): Prefix for the model section in the documentation. - fields_prefix (str): Prefix for the fields section in the documentation. - list_of_outputs (bool): Whether the output is a list of items. - documentation_with_field_description (bool): Include field descriptions in the documentation. + pydantic_model_list: List of Pydantic model classes. + grammar_file_path (str): File path to save the generated GBNF grammar. + documentation_file_path (str): File path to save the generated documentation. + outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. + outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. + model_prefix (str): Prefix for the model section in the documentation. + fields_prefix (str): Prefix for the fields section in the documentation. + list_of_outputs (bool): Whether the output is a list of items. + documentation_with_field_description (bool): Include field descriptions in the documentation. 
Returns: - None + None """ - documentation = generate_text_documentation(pydantic_model_list, model_prefix, fields_prefix, - documentation_with_field_description=documentation_with_field_description) - grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, - outer_object_content, list_of_outputs) + documentation = generate_markdown_documentation( + pydantic_model_list, model_prefix, fields_prefix, + documentation_with_field_description=documentation_with_field_description + ) + grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, outer_object_content, + list_of_outputs) grammar = remove_empty_lines(grammar) save_gbnf_grammar_and_documentation(grammar, documentation, grammar_file_path, documentation_file_path) -def generate_gbnf_grammar_and_documentation(pydantic_model_list, outer_object_name: str = None, - outer_object_content: str = None, - model_prefix: str = "Output Model", - fields_prefix: str = "Output Fields", list_of_outputs: bool = False, - documentation_with_field_description=True): +def generate_gbnf_grammar_and_documentation( + pydantic_model_list, + outer_object_name: str = None, + outer_object_content: str = None, + model_prefix: str = "Output Model", + fields_prefix: str = "Output Fields", + list_of_outputs: bool = False, + documentation_with_field_description=True, +): """ Generate GBNF grammar and documentation for a list of Pydantic models. Args: - pydantic_model_list: List of Pydantic model classes. - outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. - outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. - model_prefix (str): Prefix for the model section in the documentation. - fields_prefix (str): Prefix for the fields section in the documentation. - list_of_outputs (bool): Whether the output is a list of items. - documentation_with_field_description (bool): Include field descriptions in the documentation. + pydantic_model_list: List of Pydantic model classes. + outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. + outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. + model_prefix (str): Prefix for the model section in the documentation. + fields_prefix (str): Prefix for the fields section in the documentation. + list_of_outputs (bool): Whether the output is a list of items. + documentation_with_field_description (bool): Include field descriptions in the documentation. Returns: - tuple: GBNF grammar string, documentation string. + tuple: GBNF grammar string, documentation string. 
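A hedged usage sketch of the in-memory variant documented here. The `WeatherReport` model and the import path are assumptions for the example; the call mirrors the documented signature.

```python
from pydantic import BaseModel, Field

from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation  # assumed import path


class WeatherReport(BaseModel):
    """A minimal weather report."""
    location: str = Field(..., description="City the report refers to.")
    temperature: int = Field(..., description="Temperature in degrees Celsius.")


grammar, documentation = generate_gbnf_grammar_and_documentation(
    [WeatherReport],
    model_prefix="Output Model",
    fields_prefix="Output Fields",
)
print(grammar)        # GBNF text constraining generation to a WeatherReport JSON object
print(documentation)  # prompt-ready description of the model and its fields
```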
""" - documentation = generate_text_documentation(copy(pydantic_model_list), model_prefix, fields_prefix, - documentation_with_field_description=documentation_with_field_description) - grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, - outer_object_content, list_of_outputs) + documentation = generate_markdown_documentation( + copy(pydantic_model_list), model_prefix, fields_prefix, + documentation_with_field_description=documentation_with_field_description + ) + grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, outer_object_content, + list_of_outputs) grammar = remove_empty_lines(grammar + get_primitive_grammar(grammar)) return grammar, documentation -def generate_gbnf_grammar_and_documentation_from_dictionaries(dictionaries: List[dict], - outer_object_name: str = None, - outer_object_content: str = None, - model_prefix: str = "Output Model", - fields_prefix: str = "Output Fields", - list_of_outputs: bool = False, - documentation_with_field_description=True): +def generate_gbnf_grammar_and_documentation_from_dictionaries( + dictionaries: List[dict], + outer_object_name: str = None, + outer_object_content: str = None, + model_prefix: str = "Output Model", + fields_prefix: str = "Output Fields", + list_of_outputs: bool = False, + documentation_with_field_description=True, +): """ Generate GBNF grammar and documentation from a list of dictionaries. Args: - dictionaries (List[dict]): List of dictionaries representing Pydantic models. - outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. - outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. - model_prefix (str): Prefix for the model section in the documentation. - fields_prefix (str): Prefix for the fields section in the documentation. - list_of_outputs (bool): Whether the output is a list of items. - documentation_with_field_description (bool): Include field descriptions in the documentation. + dictionaries (List[dict]): List of dictionaries representing Pydantic models. + outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. + outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. + model_prefix (str): Prefix for the model section in the documentation. + fields_prefix (str): Prefix for the fields section in the documentation. + list_of_outputs (bool): Whether the output is a list of items. + documentation_with_field_description (bool): Include field descriptions in the documentation. Returns: - tuple: GBNF grammar string, documentation string. + tuple: GBNF grammar string, documentation string. 
""" pydantic_model_list = create_dynamic_models_from_dictionaries(dictionaries) - documentation = generate_text_documentation(copy(pydantic_model_list), model_prefix, fields_prefix, - documentation_with_field_description=documentation_with_field_description) - grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, - outer_object_content, list_of_outputs) + documentation = generate_markdown_documentation( + copy(pydantic_model_list), model_prefix, fields_prefix, + documentation_with_field_description=documentation_with_field_description + ) + grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, outer_object_content, + list_of_outputs) grammar = remove_empty_lines(grammar + get_primitive_grammar(grammar)) return grammar, documentation @@ -990,41 +1125,61 @@ def create_dynamic_model_from_function(func: Callable): Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method. Args: - func (Callable): A function with type hints from which to create the model. + func (Callable): A function with type hints from which to create the model. Returns: - A dynamic Pydantic model class with the provided function as a 'run' method. + A dynamic Pydantic model class with the provided function as a 'run' method. """ - # Extracting type hints from the provided function - type_hints = get_type_hints(func) - type_hints.pop('return', None) - # Handling default values and annotations - dynamic_fields = {} - defaults = getattr(func, '__defaults__', ()) or () - defaults_index = len(type_hints) - len(defaults) + # Get the signature of the function + sig = inspect.signature(func) - for index, (name, typ) in enumerate(type_hints.items()): - if index >= defaults_index: - default_value = defaults[index - defaults_index] - dynamic_fields[name] = (typ, default_value) - else: - dynamic_fields[name] = (typ, ...) + # Parse the docstring + docstring = parse(func.__doc__) + dynamic_fields = {} + param_docs = [] + for param in sig.parameters.values(): + # Exclude 'self' parameter + if param.name == "self": + continue + + # Assert that the parameter has a type annotation + if param.annotation == inspect.Parameter.empty: + raise TypeError(f"Parameter '{param.name}' in function '{func.__name__}' lacks a type annotation") + + # Find the parameter's description in the docstring + param_doc = next((d for d in docstring.params if d.arg_name == param.name), None) + + # Assert that the parameter has a description + if not param_doc or not param_doc.description: + raise ValueError( + f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring") + + # Add parameter details to the schema + param_doc = next((d for d in docstring.params if d.arg_name == param.name), None) + param_docs.append((param.name, param_doc)) + if param.default == inspect.Parameter.empty: + default_value = ... 
+ else: + default_value = param.default + dynamic_fields[param.name] = ( + param.annotation if param.annotation != inspect.Parameter.empty else str, default_value) # Creating the dynamic model - dynamicModel = create_model(f'{func.__name__}', **dynamic_fields) + dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) + + for param_doc in param_docs: + dynamic_model.model_fields[param_doc[0]].description = param_doc[1].description - dynamicModel.__doc__ = getdoc(func) + dynamic_model.__doc__ = docstring.short_description - # Wrapping the original function to handle instance 'self' def run_method_wrapper(self): - func_args = {name: getattr(self, name) for name in type_hints} + func_args = {name: getattr(self, name) for name, _ in dynamic_fields.items()} return func(**func_args) # Adding the wrapped function as a 'run' method - setattr(dynamicModel, 'run', run_method_wrapper) - - return dynamicModel + setattr(dynamic_model, "run", run_method_wrapper) + return dynamic_model def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable): @@ -1032,11 +1187,11 @@ def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable): Add a 'run' method to a dynamic Pydantic model, using the provided function. Args: - - model (Type[BaseModel]): Dynamic Pydantic model class. - - func (Callable): Function to be added as a 'run' method to the model. + model (Type[BaseModel]): Dynamic Pydantic model class. + func (Callable): Function to be added as a 'run' method to the model. Returns: - - Type[BaseModel]: Pydantic model class with the added 'run' method. + Type[BaseModel]: Pydantic model class with the added 'run' method. """ def run_method_wrapper(self): @@ -1044,7 +1199,7 @@ def run_method_wrapper(self): return func(**func_args) # Adding the wrapped function as a 'run' method - setattr(model, 'run', run_method_wrapper) + setattr(model, "run", run_method_wrapper) return model @@ -1054,15 +1209,15 @@ def create_dynamic_models_from_dictionaries(dictionaries: List[dict]): Create a list of dynamic Pydantic model classes from a list of dictionaries. Args: - - dictionaries (List[dict]): List of dictionaries representing model structures. + dictionaries (List[dict]): List of dictionaries representing model structures. Returns: - - List[Type[BaseModel]]: List of generated dynamic Pydantic model classes. + List[Type[BaseModel]]: List of generated dynamic Pydantic model classes. """ dynamic_models = [] for func in dictionaries: model_name = format_model_and_field_name(func.get("name", "")) - dyn_model = convert_dictionary_to_to_pydantic_model(func, model_name) + dyn_model = convert_dictionary_to_pydantic_model(func, model_name) dynamic_models.append(dyn_model) return dynamic_models @@ -1080,12 +1235,12 @@ def map_grammar_names_to_pydantic_model_class(pydantic_model_list): def json_schema_to_python_types(schema): type_map = { - 'any': Any, - 'string': str, - 'number': float, - 'integer': int, - 'boolean': bool, - 'array': list, + "any": Any, + "string": str, + "number": float, + "integer": int, + "boolean": bool, + "array": list, } return type_map[schema] @@ -1094,58 +1249,64 @@ def list_to_enum(enum_name, values): return Enum(enum_name, {value: value for value in values}) -def convert_dictionary_to_to_pydantic_model(dictionary: dict, model_name: str = 'CustomModel') -> Type[BaseModel]: +def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "CustomModel") -> Type[BaseModel]: """ Convert a dictionary to a Pydantic model class. 
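Relating to the reworked `create_dynamic_model_from_function` in the hunk above, which now requires every parameter to carry a type annotation and a docstring description: a hedged sketch of a function that satisfies those requirements. The `multiply` function and the import path are invented for the example.

```python
from pydantic_models_to_grammar import create_dynamic_model_from_function  # assumed import path


def multiply(a: float, b: float = 2.0) -> float:
    """
    Multiply two numbers.

    Args:
        a (float): First factor.
        b (float): Second factor, defaults to 2.0.
    """
    return a * b


MultiplyModel = create_dynamic_model_from_function(multiply)
print(MultiplyModel.model_fields["a"].description)  # description lifted from the docstring
print(MultiplyModel(a=3.0).run())                   # calls multiply(a=3.0, b=2.0) -> 6.0
```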
Args: - - dictionary (dict): Dictionary representing the model structure. - - model_name (str): Name of the generated Pydantic model. + dictionary (dict): Dictionary representing the model structure. + model_name (str): Name of the generated Pydantic model. Returns: - - Type[BaseModel]: Generated Pydantic model class. + Type[BaseModel]: Generated Pydantic model class. """ fields = {} if "properties" in dictionary: for field_name, field_data in dictionary.get("properties", {}).items(): - if field_data == 'object': - submodel = convert_dictionary_to_to_pydantic_model(dictionary, f'{model_name}_{field_name}') + if field_data == "object": + submodel = convert_dictionary_to_pydantic_model(dictionary, f"{model_name}_{field_name}") fields[field_name] = (submodel, ...) else: - field_type = field_data.get('type', 'str') + field_type = field_data.get("type", "str") if field_data.get("enum", []): fields[field_name] = (list_to_enum(field_name, field_data.get("enum", [])), ...) - if field_type == "array": + elif field_type == "array": items = field_data.get("items", {}) if items != {}: array = {"properties": items} - array_type = convert_dictionary_to_to_pydantic_model(array, f'{model_name}_{field_name}_items') + array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items") fields[field_name] = (List[array_type], ...) else: fields[field_name] = (list, ...) - elif field_type == 'object': - submodel = convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}_{field_name}') + elif field_type == "object": + submodel = convert_dictionary_to_pydantic_model(field_data, f"{model_name}_{field_name}") fields[field_name] = (submodel, ...) + elif field_type == "required": + required = field_data.get("enum", []) + for key, field in fields.items(): + if key not in required: + fields[key] = (Optional[fields[key][0]], ...) else: field_type = json_schema_to_python_types(field_type) fields[field_name] = (field_type, ...) if "function" in dictionary: - for field_name, field_data in dictionary.get("function", {}).items(): if field_name == "name": model_name = field_data elif field_name == "description": fields["__doc__"] = field_data elif field_name == "parameters": - return convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}') + return convert_dictionary_to_pydantic_model(field_data, f"{model_name}") + if "parameters" in dictionary: field_data = {"function": dictionary} - return convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}') - + return convert_dictionary_to_pydantic_model(field_data, f"{model_name}") + if "required" in dictionary: + required = dictionary.get("required", []) + for key, field in fields.items(): + if key not in required: + fields[key] = (Optional[fields[key][0]], ...) 
custom_model = create_model(model_name, **fields) return custom_model - - - From 7c8d3abd1a17c28fc56b1a4814bc4b29f91d7454 Mon Sep 17 00:00:00 2001 From: Alex Azarov Date: Tue, 16 Jan 2024 14:33:02 +0100 Subject: [PATCH 075/131] metal : log `recommendedMaxWorkingSetSize` on iOS 16+ (#4936) * metal: Log `recommendedMaxWorkingSetSize` on iOS 16+ * Only log on iOS and macOS, ignoring tvOS and other platforms * Check for Xcode version before using recommendedMaxWorkingSetSize --------- Co-authored-by: Georgi Gerganov --- ggml-metal.m | 58 ++++++++++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 867f2fd48cbd2..44134d1d92494 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -369,8 +369,12 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx->support_simdgroup_reduction ? "true" : "false"); GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false"); GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); -#if TARGET_OS_OSX - GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6); + +#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) + if (@available(macOS 10.12, iOS 16.0, *)) { + GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6); + } +#elif TARGET_OS_OSX if (ctx->device.maxTransferRate != 0) { GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6); } else { @@ -2369,6 +2373,25 @@ GGML_CALL static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buff UNUSED(buft); } +static void ggml_backend_metal_log_allocated_size(id device) { +#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) + if (@available(macOS 10.12, iOS 16.0, *)) { + GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", + device.currentAllocatedSize / 1024.0 / 1024.0, + device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + + if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { + GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); + } else { + GGML_METAL_LOG_INFO("\n"); + } + } else { + GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0); + } +#endif + UNUSED(device); +} + GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); @@ -2401,22 +2424,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff } GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0); - - -#if TARGET_OS_OSX - GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", - device.currentAllocatedSize / 1024.0 / 1024.0, - device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); - - if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { - GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); - } else { - GGML_METAL_LOG_INFO("\n"); - } -#else - GGML_METAL_LOG_INFO(", (%8.2f)\n", 
device.currentAllocatedSize / 1024.0 / 1024.0); -#endif - + ggml_backend_metal_log_allocated_size(device); return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); } @@ -2524,19 +2532,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, } } -#if TARGET_OS_OSX - GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", - device.currentAllocatedSize / 1024.0 / 1024.0, - device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); - - if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { - GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); - } else { - GGML_METAL_LOG_INFO("\n"); - } -#else - GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0); -#endif + ggml_backend_metal_log_allocated_size(device); return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size); } From 3a48d558a69c88ac17efcaa5900cd9eb19596ac4 Mon Sep 17 00:00:00 2001 From: Alex Azarov Date: Tue, 16 Jan 2024 14:41:27 +0100 Subject: [PATCH 076/131] metal : replace loop of dispatch_async with dispatch_apply (#4934) * Replace loop of dispatch_async with dispatch_apply * Update ggml-metal.m --------- Co-authored-by: Georgi Gerganov --- ggml-metal.m | 2796 +++++++++++++++++++++++++------------------------- 1 file changed, 1396 insertions(+), 1400 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 44134d1d92494..c21dc465ae50c 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -737,1521 +737,1517 @@ static bool ggml_metal_graph_compute( ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc]; } - for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; - - dispatch_async(ctx->d_queue, ^{ - size_t offs_src0 = 0; - size_t offs_src1 = 0; - size_t offs_dst = 0; - - id command_buffer = ctx->command_buffers[cb_idx]; - id encoder = ctx->command_encoders[cb_idx]; - - const int node_start = (cb_idx + 0) * n_nodes_per_cb; - const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes); - - for (int ind = node_start; ind < node_end; ++ind) { - const int i = ind; - - if (i == -1) { - [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers]; - continue; - } - - //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); - - struct ggml_tensor * src0 = gf->nodes[i]->src[0]; - struct ggml_tensor * src1 = gf->nodes[i]->src[1]; - struct ggml_tensor * dst = gf->nodes[i]; - - switch (dst->op) { - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_TRANSPOSE: - case GGML_OP_PERMUTE: - { - // noop -> next node - } continue; - default: - { - } break; - } - - if (!ggml_metal_supports_op(ctx, dst)) { - GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); - GGML_ASSERT(!"unsupported op"); - } - -#ifndef GGML_METAL_NDEBUG - [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]]; -#endif - - const int64_t ne00 = src0 ? src0->ne[0] : 0; - const int64_t ne01 = src0 ? src0->ne[1] : 0; - const int64_t ne02 = src0 ? src0->ne[2] : 0; - const int64_t ne03 = src0 ? src0->ne[3] : 0; - - const uint64_t nb00 = src0 ? src0->nb[0] : 0; - const uint64_t nb01 = src0 ? src0->nb[1] : 0; - const uint64_t nb02 = src0 ? src0->nb[2] : 0; - const uint64_t nb03 = src0 ? 
src0->nb[3] : 0; - - const int64_t ne10 = src1 ? src1->ne[0] : 0; - const int64_t ne11 = src1 ? src1->ne[1] : 0; - const int64_t ne12 = src1 ? src1->ne[2] : 0; - const int64_t ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); - - const uint64_t nb10 = src1 ? src1->nb[0] : 0; - const uint64_t nb11 = src1 ? src1->nb[1] : 0; - const uint64_t nb12 = src1 ? src1->nb[2] : 0; - const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13); - - const int64_t ne0 = dst ? dst->ne[0] : 0; - const int64_t ne1 = dst ? dst->ne[1] : 0; - const int64_t ne2 = dst ? dst->ne[2] : 0; - const int64_t ne3 = dst ? dst->ne[3] : 0; - - const uint64_t nb0 = dst ? dst->nb[0] : 0; - const uint64_t nb1 = dst ? dst->nb[1] : 0; - const uint64_t nb2 = dst ? dst->nb[2] : 0; - const uint64_t nb3 = dst ? dst->nb[3] : 0; - - const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; - const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; - const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; - - id id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil; - id id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; - id id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; - - //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); - //if (src0) { - // GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, - // ggml_is_contiguous(src0), src0->name); - //} - //if (src1) { - // GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, - // ggml_is_contiguous(src1), src1->name); - //} - //if (dst) { - // GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, - // dst->name); - //} - - switch (dst->op) { - case GGML_OP_CONCAT: - { - const int64_t nb = ne00; - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; - [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; - [encoder 
setBytes:&nb3 length:sizeof(nb3) atIndex:26]; - [encoder setBytes:&nb length:sizeof(nb) atIndex:27]; - - const int nth = MIN(1024, ne0); - - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ADD: - case GGML_OP_MUL: - case GGML_OP_DIV: - { - const size_t offs = 0; - - bool bcast_row = false; - - int64_t nb = ne00; + const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; + dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) { + const int cb_idx = iter; - id pipeline = nil; + size_t offs_src0 = 0; + size_t offs_src1 = 0; + size_t offs_dst = 0; - if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { - GGML_ASSERT(ggml_is_contiguous(src0)); + id command_buffer = ctx->command_buffers[cb_idx]; + id encoder = ctx->command_encoders[cb_idx]; - // src1 is a row - GGML_ASSERT(ne11 == 1); + const int node_start = (cb_idx + 0) * n_nodes_per_cb; + const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes); - nb = ne00 / 4; - switch (dst->op) { - case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break; - case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break; - case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break; - default: GGML_ASSERT(false); - } + for (int ind = node_start; ind < node_end; ++ind) { + const int i = ind; - bcast_row = true; - } else { - switch (dst->op) { - case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break; - case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break; - case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break; - default: GGML_ASSERT(false); - } - } + if (i == -1) { + [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers]; + continue; + } - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; - [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; - [encoder setBytes:&nb3 length:sizeof(nb3) 
atIndex:26]; - [encoder setBytes:&offs length:sizeof(offs) atIndex:27]; - [encoder setBytes:&nb length:sizeof(nb) atIndex:28]; - - if (bcast_row) { - const int64_t n = ggml_nelements(dst)/4; + //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + + struct ggml_tensor * src0 = gf->nodes[i]->src[0]; + struct ggml_tensor * src1 = gf->nodes[i]->src[1]; + struct ggml_tensor * dst = gf->nodes[i]; + + switch (dst->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + { + // noop -> next node + } continue; + default: + { + } break; + } - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } else { - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); + if (!ggml_metal_supports_op(ctx, dst)) { + GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); + GGML_ASSERT(!"unsupported op"); + } - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } - } break; - case GGML_OP_ACC: - { - GGML_ASSERT(src0t == GGML_TYPE_F32); - GGML_ASSERT(src1t == GGML_TYPE_F32); - GGML_ASSERT(dstt == GGML_TYPE_F32); +#ifndef GGML_METAL_NDEBUG + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]]; +#endif + const int64_t ne00 = src0 ? src0->ne[0] : 0; + const int64_t ne01 = src0 ? src0->ne[1] : 0; + const int64_t ne02 = src0 ? src0->ne[2] : 0; + const int64_t ne03 = src0 ? src0->ne[3] : 0; + + const uint64_t nb00 = src0 ? src0->nb[0] : 0; + const uint64_t nb01 = src0 ? src0->nb[1] : 0; + const uint64_t nb02 = src0 ? src0->nb[2] : 0; + const uint64_t nb03 = src0 ? src0->nb[3] : 0; + + const int64_t ne10 = src1 ? src1->ne[0] : 0; + const int64_t ne11 = src1 ? src1->ne[1] : 0; + const int64_t ne12 = src1 ? src1->ne[2] : 0; + const int64_t ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); + + const uint64_t nb10 = src1 ? src1->nb[0] : 0; + const uint64_t nb11 = src1 ? src1->nb[1] : 0; + const uint64_t nb12 = src1 ? src1->nb[2] : 0; + const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13); + + const int64_t ne0 = dst ? dst->ne[0] : 0; + const int64_t ne1 = dst ? dst->ne[1] : 0; + const int64_t ne2 = dst ? dst->ne[2] : 0; + const int64_t ne3 = dst ? dst->ne[3] : 0; + + const uint64_t nb0 = dst ? dst->nb[0] : 0; + const uint64_t nb1 = dst ? dst->nb[1] : 0; + const uint64_t nb2 = dst ? dst->nb[2] : 0; + const uint64_t nb3 = dst ? dst->nb[3] : 0; + + const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; + const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; + const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; + + id id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil; + id id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; + id id_dst = dst ? 
ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; + + //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); + //if (src0) { + // GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + // ggml_is_contiguous(src0), src0->name); + //} + //if (src1) { + // GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + // ggml_is_contiguous(src1), src1->name); + //} + //if (dst) { + // GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + // dst->name); + //} + + switch (dst->op) { + case GGML_OP_CONCAT: + { + const int64_t nb = ne00; + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:27]; + + const int nth = MIN(1024, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_DIV: + { + const size_t offs = 0; + + bool bcast_row = false; + + int64_t nb = ne00; + + id pipeline = nil; + + if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(src1)); - - const size_t pnb1 = ((int32_t *) dst->op_params)[0]; - const size_t pnb2 = ((int32_t *) dst->op_params)[1]; - const size_t pnb3 = ((int32_t *) dst->op_params)[2]; - const size_t offs = ((int32_t *) dst->op_params)[3]; - - const bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - - if (!inplace) { - // run a separete kernel to cpy src->dst - // not sure how to avoid this - // TODO: make a simpler cpy_bytes kernel - const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; + // src1 is a row + GGML_ASSERT(ne11 == 1); - [encoder 
setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + nb = ne00 / 4; + switch (dst->op) { + case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break; + case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break; + case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break; + default: GGML_ASSERT(false); } - const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; - [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8]; - [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9]; - [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; - [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; - [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24]; - [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25]; - [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26]; - [encoder setBytes:&offs length:sizeof(offs) atIndex:27]; - - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); - - [encoder 
dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_SCALE: - { - GGML_ASSERT(ggml_is_contiguous(src0)); - - const float scale = *(const float *) dst->op_params; - - int64_t n = ggml_nelements(dst); - - id pipeline = nil; - - if (n % 4 == 0) { - n /= 4; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline; + bcast_row = true; + } else { + switch (dst->op) { + case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break; + case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break; + case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break; + default: GGML_ASSERT(false); } + } - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; + [encoder setBytes:&offs length:sizeof(offs) atIndex:27]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:28]; + + if (bcast_row) { + const int64_t n = ggml_nelements(dst)/4; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_UNARY: - switch (ggml_get_unary_op(gf->nodes[i])) { - case GGML_UNARY_OP_TANH: - { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_UNARY_OP_RELU: - { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline; - - [encoder 
setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_UNARY_OP_GELU: - { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - GGML_ASSERT(n % 4 == 0); - - [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_UNARY_OP_GELU_QUICK: - { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - GGML_ASSERT(n % 4 == 0); - - [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_UNARY_OP_SILU: - { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - GGML_ASSERT(n % 4 == 0); + } else { + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); - [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - default: - { - GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ASSERT(false); - } - } break; - case GGML_OP_SQR: - { - GGML_ASSERT(ggml_is_contiguous(src0)); + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } + } break; + case GGML_OP_ACC: + { + GGML_ASSERT(src0t == GGML_TYPE_F32); + GGML_ASSERT(src1t == GGML_TYPE_F32); + GGML_ASSERT(dstt == GGML_TYPE_F32); - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline; + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + const size_t pnb1 = ((int32_t *) dst->op_params)[0]; + const size_t pnb2 = ((int32_t *) dst->op_params)[1]; + const size_t pnb3 = ((int32_t *) dst->op_params)[2]; + const size_t offs = ((int32_t *) dst->op_params)[3]; - const int64_t n = ggml_nelements(dst); + const bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_SUM_ROWS: - { - GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); + if (!inplace) { + // run a separete kernel to cpy src->dst + // not sure how to avoid this + // TODO: make a simpler cpy_bytes kernel - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline; + const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - 
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; - [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:18]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:19]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:20]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:21]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:22]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:23]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:24]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:25]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_SOFT_MAX: - { - int nth = 32; // SIMD width - - id pipeline = nil; - - if (ne00%4 == 0) { - while (nth < ne00/4 && nth < 256) { - nth *= 2; - } - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_4].pipeline; - } else { - while (nth < ne00 && nth < 1024) { - nth *= 2; - } - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline; - } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - const float scale = ((float *) dst->op_params)[0]; + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - if (id_src1) { - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - } else { - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - } - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder 
setBytes:&scale length:sizeof(scale) atIndex:6]; - [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_DIAG_MASK_INF: - { - const int n_past = ((int32_t *)(dst->op_params))[0]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } - id pipeline = nil; + const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8]; + [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9]; + [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; + [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24]; + [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25]; + [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26]; + [encoder setBytes:&offs length:sizeof(offs) atIndex:27]; + + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); + + [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_SCALE: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + + const float scale = *(const float *) dst->op_params; + + int64_t n = ggml_nelements(dst); + + id pipeline = nil; + + if (n % 4 == 0) { + n /= 4; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline; + } - if (ne00%8 == 0) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline; - } + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&n_past length:sizeof(int) atIndex:4]; + [encoder 
dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(gf->nodes[i])) { + case GGML_UNARY_OP_TANH: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline; - if (ne00%8 == 0) { - [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } - else { - [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } - } break; - case GGML_OP_MUL_MAT: - { - GGML_ASSERT(ne00 == ne10); + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - // TODO: assert that dim2 and dim3 are contiguous - GGML_ASSERT(ne12 % ne02 == 0); - GGML_ASSERT(ne13 % ne03 == 0); + const int64_t n = ggml_nelements(dst); - const uint r2 = ne12/ne02; - const uint r3 = ne13/ne03; + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_RELU: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline; - // find the break-even point where the matrix-matrix kernel becomes more efficient compared - // to the matrix-vector kernel - int ne11_mm_min = 1; + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; -#if 0 - // the numbers below are measured on M2 Ultra for 7B and 13B models - // these numbers do not translate to other devices or model sizes - // TODO: need to find a better approach - if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) { - switch (src0t) { - case GGML_TYPE_F16: ne11_mm_min = 2; break; - case GGML_TYPE_Q8_0: ne11_mm_min = 7; break; - case GGML_TYPE_Q2_K: ne11_mm_min = 15; break; - case GGML_TYPE_Q3_K: ne11_mm_min = 7; break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: ne11_mm_min = 15; break; - case GGML_TYPE_Q4_K: ne11_mm_min = 11; break; - case GGML_TYPE_Q5_0: // not tested yet - case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet - case GGML_TYPE_Q5_K: ne11_mm_min = 7; break; - case GGML_TYPE_Q6_K: ne11_mm_min = 7; break; - default: ne11_mm_min = 1; break; - } - } -#endif + const int64_t n = ggml_nelements(dst); - // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs - // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel - if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && - !ggml_is_transposed(src0) && - !ggml_is_transposed(src1) && - src1t == GGML_TYPE_F32 && - ne00 % 32 == 0 && ne00 >= 64 && - (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) { - //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); - - id pipeline = nil; - - switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32 ].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32 ].pipeline; break; - case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32 ].pipeline; break; - case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32 ].pipeline; break; - case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32 ].pipeline; break; - case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32 ].pipeline; break; - 
case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32 ].pipeline; break; - case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32 ].pipeline; break; - case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32 ].pipeline; break; - case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32 ].pipeline; break; - case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32 ].pipeline; break; - case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32 ].pipeline; break; - case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break; - case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break; - default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); - } + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_GELU: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12]; - [encoder setBytes:&r2 length:sizeof(r2) atIndex:13]; - [encoder setBytes:&r3 length:sizeof(r3) atIndex:14]; - [encoder setThreadgroupMemoryLength:8192 atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; - } else { - int nth0 = 32; - int nth1 = 1; - int nrows = 1; - //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); - - id pipeline = nil; - - // use custom matrix x vector kernel - switch (src0t) { - case GGML_TYPE_F32: - { - GGML_ASSERT(src1t == GGML_TYPE_F32); - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline; - nrows = 4; - } break; - case GGML_TYPE_F16: - { - nth0 = 32; - nth1 = 1; - if (src1t == GGML_TYPE_F32) { - if (ne11 * ne12 < 4) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline; - } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline; - nrows = ne11; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline; - nrows = 4; - } - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline; - nrows = 4; - } - } break; - case GGML_TYPE_Q4_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline; - } break; - case GGML_TYPE_Q4_1: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline; - } break; - case GGML_TYPE_Q5_0: - { - nth0 = 8; - nth1 = 8; - pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline; - } break; - case GGML_TYPE_Q5_1: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline; - } break; - case GGML_TYPE_Q8_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline; - } break; - case GGML_TYPE_Q2_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline; - } break; - case GGML_TYPE_Q3_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline; - } break; - case GGML_TYPE_Q4_K: - { - nth0 = 4; //1; - nth1 = 8; //32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline; - } break; - case GGML_TYPE_Q5_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline; - } break; - case GGML_TYPE_Q6_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline; - } break; - case GGML_TYPE_IQ2_XXS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline; - } break; - case GGML_TYPE_IQ2_XS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline; - } break; - default: - { - GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); - GGML_ASSERT(false && "not implemented"); - } - }; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - if (ggml_is_quantized(src0t)) { - GGML_ASSERT(ne00 >= nth0*nth1); - } + const int64_t n = ggml_nelements(dst); + GGML_ASSERT(n % 4 == 0); - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16]; - [encoder setBytes:&r2 length:sizeof(r2) atIndex:17]; - [encoder setBytes:&r3 length:sizeof(r3) atIndex:18]; - - if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || - src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || - src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { - const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 
256*8+128 : 512*8+128; - [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q3_K) { -#ifdef GGML_QKK_64 - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; -#else - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; -#endif - } - else if (src0t == GGML_TYPE_Q5_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q6_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } else { - const int64_t ny = (ne11 + nrows - 1)/nrows; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - } - } break; - case GGML_OP_MUL_MAT_ID: - { - //GGML_ASSERT(ne00 == ne10); - //GGML_ASSERT(ne03 == ne13); - - GGML_ASSERT(src0t == GGML_TYPE_I32); - - const int n_as = ((int32_t *) dst->op_params)[1]; - - // TODO: make this more general - GGML_ASSERT(n_as <= 8); - - // max size of the src1ids array in the kernel stack - GGML_ASSERT(ne11 <= 512); - - struct ggml_tensor * src2 = gf->nodes[i]->src[2]; - - const int64_t ne20 = src2 ? src2->ne[0] : 0; - const int64_t ne21 = src2 ? src2->ne[1] : 0; - const int64_t ne22 = src2 ? src2->ne[2] : 0; - const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23); - - const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20); - const uint64_t nb21 = src2 ? src2->nb[1] : 0; - const uint64_t nb22 = src2 ? src2->nb[2] : 0; - const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23); - - const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t); - - GGML_ASSERT(!ggml_is_transposed(src2)); - GGML_ASSERT(!ggml_is_transposed(src1)); - - GGML_ASSERT(src1t == GGML_TYPE_F32); - - const uint r2 = ne12/ne22; - const uint r3 = ne13/ne23; - - // find the break-even point where the matrix-matrix kernel becomes more efficient compared - // to the matrix-vector kernel - int ne11_mm_min = n_as; - - const int idx = ((int32_t *) dst->op_params)[0]; - - // batch size - GGML_ASSERT(ne01 == ne11); - - // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs - // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel - // !!! - // TODO: for now, always use mat-vec kernels until we figure out how to improve the - // indirect matrix multiplication - // !!! 
- if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && - ne20 % 32 == 0 && ne20 >= 64 && - ne11 > ne11_mm_min) { - - id pipeline = nil; - - switch (src2->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break; - case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break; - case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32 ].pipeline; break; - case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32 ].pipeline; break; - case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32 ].pipeline; break; - case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32 ].pipeline; break; - case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32 ].pipeline; break; - case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32 ].pipeline; break; - case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32 ].pipeline; break; - case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32 ].pipeline; break; - case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32 ].pipeline; break; - case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break; - case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; - default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); - } + [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_GELU_QUICK: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3]; - [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; - [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:5]; - [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6]; - [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:7]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:8]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:9]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; - [encoder setBytes:&r2 length:sizeof(r2) atIndex:16]; - [encoder setBytes:&r3 length:sizeof(r3) atIndex:17]; - [encoder setBytes:&idx length:sizeof(idx) atIndex:18]; - // TODO: how to make this an array? 
read Metal docs - for (int j = 0; j < 8; ++j) { - // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8 - struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; - - size_t offs_src_cur = 0; - id id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); - - [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j]; - } - - [encoder setThreadgroupMemoryLength:8192 atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; - } else { - int nth0 = 32; - int nth1 = 1; - int nrows = 1; - //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); - - id pipeline = nil; - - // use custom matrix x vector kernel - switch (src2t) { - case GGML_TYPE_F32: - { - GGML_ASSERT(src1t == GGML_TYPE_F32); - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline; - } break; - case GGML_TYPE_F16: - { - GGML_ASSERT(src1t == GGML_TYPE_F32); - nth0 = 32; - nth1 = 1; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline; - } break; - case GGML_TYPE_Q4_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline; - } break; - case GGML_TYPE_Q4_1: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline; - } break; - case GGML_TYPE_Q5_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline; - } break; - case GGML_TYPE_Q5_1: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline; - } break; - case GGML_TYPE_Q8_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline; - } break; - case GGML_TYPE_Q2_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline; - } break; - case GGML_TYPE_Q3_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline; - } break; - case GGML_TYPE_Q4_K: - { - nth0 = 4; //1; - nth1 = 8; //32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline; - } break; - case GGML_TYPE_Q5_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline; - } break; - case GGML_TYPE_Q6_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline; - } break; - case GGML_TYPE_IQ2_XXS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline; - } break; - case GGML_TYPE_IQ2_XS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline; - } break; - default: - { - GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); - GGML_ASSERT(false && "not implemented"); - } - }; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - if (ggml_is_quantized(src2t)) { - GGML_ASSERT(ne20 >= nth0*nth1); - } + const int64_t n = ggml_nelements(dst); + GGML_ASSERT(n % 4 == 0); - const int64_t _ne1 = 1; // kernels needs a reference in constant memory + [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_SILU: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline; [encoder 
setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3]; - [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; - [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5]; - [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6]; - [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7]; - [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8]; - [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10]; - [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:17]; - [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:19]; - [encoder setBytes:&r2 length:sizeof(r2) atIndex:20]; - [encoder setBytes:&r3 length:sizeof(r3) atIndex:21]; - [encoder setBytes:&idx length:sizeof(idx) atIndex:22]; - // TODO: how to make this an array? read Metal docs - for (int j = 0; j < 8; ++j) { - // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8 - struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; - - size_t offs_src_cur = 0; - id id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); - - [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j]; - } - - if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 || - src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 || - src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) { - const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 
256*8+128 : 512*8+128; - [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src2t == GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src2t == GGML_TYPE_Q3_K) { -#ifdef GGML_QKK_64 - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; -#else - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; -#endif - } - else if (src2t == GGML_TYPE_Q5_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src2t == GGML_TYPE_Q6_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } else { - const int64_t ny = (_ne1 + nrows - 1)/nrows; - [encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - } - } break; - case GGML_OP_GET_ROWS: - { - id pipeline = nil; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32 ].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16 ].pipeline; break; - case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0 ].pipeline; break; - case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1 ].pipeline; break; - case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0 ].pipeline; break; - case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1 ].pipeline; break; - case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0 ].pipeline; break; - case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K ].pipeline; break; - case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K ].pipeline; break; - case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K ].pipeline; break; - case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K ].pipeline; break; - case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K ].pipeline; break; - case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break; - case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break; - case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; - default: GGML_ASSERT(false && "not implemented"); - } + const int64_t n = ggml_nelements(dst); + GGML_ASSERT(n % 4 == 0); - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5]; - [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6]; - [encoder 
setBytes:&nb10 length:sizeof( int64_t) atIndex:7]; - [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:10]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; - } break; - case GGML_OP_RMS_NORM: - { - GGML_ASSERT(ne00 % 4 == 0); - - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - - int nth = 32; // SIMD width - - while (nth < ne00/4 && nth < 1024) { + [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + default: + { + GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_ASSERT(false); + } + } break; + case GGML_OP_SQR: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SUM_ROWS: + { + GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:18]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:19]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:20]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:21]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:22]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:23]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:24]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:25]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SOFT_MAX: + { + int nth = 32; // SIMD width + + id pipeline = nil; + + if (ne00%4 == 0) { + while (nth < ne00/4 && nth < 256) { nth *= 2; } + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_4].pipeline; + } else { + while (nth < ne00 && nth < 1024) { + nth *= 2; + } + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline; + } - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline; - - [encoder 
setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; - [encoder setBytes:&eps length:sizeof( float) atIndex:4]; - [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; - - const int64_t nrows = ggml_nrows(src0); - - [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_GROUP_NORM: - { - GGML_ASSERT(ne00 % 4 == 0); - - //float eps; - //memcpy(&eps, dst->op_params, sizeof(float)); - - const float eps = 1e-6f; // TODO: temporarily hardcoded - - const int32_t n_groups = ((int32_t *) dst->op_params)[0]; - - int nth = 32; // SIMD width - - //while (nth < ne00/4 && nth < 1024) { - // nth *= 2; - //} - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:5]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8]; - [encoder setBytes:&eps length:sizeof( float) atIndex:9]; - [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_NORM: - { - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - - const int nth = MIN(256, ne00); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; - [encoder setBytes:&eps length:sizeof( float) atIndex:4]; - [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0]; + const float scale = ((float *) dst->op_params)[0]; - const int64_t nrows = ggml_nrows(src0); + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + if (id_src1) { + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + } else { + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + } + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&scale length:sizeof(scale) atIndex:6]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_DIAG_MASK_INF: + { + const int n_past = ((int32_t *)(dst->op_params))[0]; + + id pipeline = nil; + + if (ne00%8 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline; + } else { + pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline; + } - [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ALIBI: - { - GGML_ASSERT((src0t == GGML_TYPE_F32)); + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&n_past length:sizeof(int) atIndex:4]; - const int nth = MIN(1024, ne00); + if (ne00%8 == 0) { + [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } + else { + [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } + } break; + case GGML_OP_MUL_MAT: + { + GGML_ASSERT(ne00 == ne10); - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_head = ((int32_t *) dst->op_params)[1]; - float max_bias; - memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + // TODO: assert that dim2 and dim3 are contiguous + GGML_ASSERT(ne12 % ne02 == 0); + GGML_ASSERT(ne13 % ne03 == 0); - const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); - const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + const uint r2 = ne12/ne02; + const uint r3 = ne13/ne03; - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline; + // find the break-even point where the matrix-matrix kernel becomes more efficient compared + // to the matrix-vector kernel + int ne11_mm_min = 1; - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&m0 length:sizeof( float) atIndex:18]; - [encoder setBytes:&m1 length:sizeof( float) atIndex:19]; - [encoder setBytes:&n_heads_log2_floor length:sizeof(int) atIndex:20]; +#if 0 + // the numbers below are measured on M2 Ultra for 7B and 13B models + // these numbers do not translate to other devices or model sizes + // TODO: need to find a better approach + if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) { + switch (src0t) { + case GGML_TYPE_F16: ne11_mm_min = 2; break; + case GGML_TYPE_Q8_0: ne11_mm_min = 7; break; + case GGML_TYPE_Q2_K: ne11_mm_min = 15; break; + case GGML_TYPE_Q3_K: ne11_mm_min = 7; break; + case 
GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: ne11_mm_min = 15; break; + case GGML_TYPE_Q4_K: ne11_mm_min = 11; break; + case GGML_TYPE_Q5_0: // not tested yet + case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet + case GGML_TYPE_Q5_K: ne11_mm_min = 7; break; + case GGML_TYPE_Q6_K: ne11_mm_min = 7; break; + default: ne11_mm_min = 1; break; + } + } +#endif - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ROPE: - { - GGML_ASSERT(ne10 == ne02); - - const int nth = MIN(1024, ne00); - - const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal - const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; - - float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs + // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel + if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && + !ggml_is_transposed(src0) && + !ggml_is_transposed(src1) && + src1t == GGML_TYPE_F32 && + ne00 % 32 == 0 && ne00 >= 64 && + (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) { + //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); id pipeline = nil; switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F16].pipeline; break; - default: GGML_ASSERT(false); - }; + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32 ].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32 ].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32 ].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32 ].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32 ].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32 ].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32 ].pipeline; break; + case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32 ].pipeline; break; + case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32 ].pipeline; break; + case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32 ].pipeline; break; + case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32 ].pipeline; break; + case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32 ].pipeline; break; + case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break; + case GGML_TYPE_IQ2_XS: pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break; + default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); + } [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:19]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; - [encoder setBytes:&mode length:sizeof( int) atIndex:21]; - [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22]; - [encoder setBytes:&freq_base length:sizeof( float) atIndex:23]; - [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24]; - [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25]; - [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26]; - [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27]; - [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_IM2COL: - { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F16); - - const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; - const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; - - const int32_t N = src1->ne[is_2D ? 3 : 2]; - const int32_t IC = src1->ne[is_2D ? 2 : 1]; - const int32_t IH = is_2D ? src1->ne[1] : 1; - const int32_t IW = src1->ne[0]; - - const int32_t KH = is_2D ? src0->ne[1] : 1; - const int32_t KW = src0->ne[0]; - - const int32_t OH = is_2D ? dst->ne[2] : 1; - const int32_t OW = dst->ne[1]; - - const int32_t CHW = IC * KH * KW; - - const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; - const int32_t ofs1 = src1->nb[is_2D ? 
2 : 1] / 4; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:13]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:14]; + [encoder setThreadgroupMemoryLength:8192 atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + } else { + int nth0 = 32; + int nth1 = 1; + int nrows = 1; + //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); id pipeline = nil; - switch (src0->type) { - case GGML_TYPE_F32: GGML_ASSERT(false && "not implemented"); break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break; - default: GGML_ASSERT(false); + // use custom matrix x vector kernel + switch (src0t) { + case GGML_TYPE_F32: + { + GGML_ASSERT(src1t == GGML_TYPE_F32); + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline; + nrows = 4; + } break; + case GGML_TYPE_F16: + { + nth0 = 32; + nth1 = 1; + if (src1t == GGML_TYPE_F32) { + if (ne11 * ne12 < 4) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline; + } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline; + nrows = ne11; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline; + nrows = 4; + } + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline; + nrows = 4; + } + } break; + case GGML_TYPE_Q4_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline; + } break; + case GGML_TYPE_Q4_1: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline; + } break; + case GGML_TYPE_Q5_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline; + } break; + case GGML_TYPE_Q5_1: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline; + } break; + case GGML_TYPE_Q8_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline; + } break; + case GGML_TYPE_Q2_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline; + } break; + case GGML_TYPE_Q3_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline; + } break; + case GGML_TYPE_Q4_K: + { + nth0 = 4; //1; + nth1 = 8; //32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline; + } break; + case GGML_TYPE_Q5_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline; + } break; + case GGML_TYPE_Q6_K: + { + nth0 = 2; + 
nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline; + } break; + case GGML_TYPE_IQ2_XXS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline; + } break; + case GGML_TYPE_IQ2_XS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline; + } break; + default: + { + GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); + GGML_ASSERT(false && "not implemented"); + } }; - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ofs0 length:sizeof( int32_t) atIndex:2]; - [encoder setBytes:&ofs1 length:sizeof( int32_t) atIndex:3]; - [encoder setBytes:&IW length:sizeof( int32_t) atIndex:4]; - [encoder setBytes:&IH length:sizeof( int32_t) atIndex:5]; - [encoder setBytes:&CHW length:sizeof( int32_t) atIndex:6]; - [encoder setBytes:&s0 length:sizeof( int32_t) atIndex:7]; - [encoder setBytes:&s1 length:sizeof( int32_t) atIndex:8]; - [encoder setBytes:&p0 length:sizeof( int32_t) atIndex:9]; - [encoder setBytes:&p1 length:sizeof( int32_t) atIndex:10]; - [encoder setBytes:&d0 length:sizeof( int32_t) atIndex:11]; - [encoder setBytes:&d1 length:sizeof( int32_t) atIndex:12]; - - [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)]; - } break; - case GGML_OP_UPSCALE: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - - const int sf = dst->op_params[0]; - - const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline; + if (ggml_is_quantized(src0t)) { + GGML_ASSERT(ne00 >= nth0*nth1); + } [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; - [encoder setBytes:&sf length:sizeof(sf) atIndex:18]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15]; + 
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:17]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:18]; + + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || + src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || + src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { + const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q3_K) { +#ifdef GGML_QKK_64 + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; +#else + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; +#endif + } + else if (src0t == GGML_TYPE_Q5_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q6_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else { + const int64_t ny = (ne11 + nrows - 1)/nrows; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + } + } break; + case GGML_OP_MUL_MAT_ID: + { + //GGML_ASSERT(ne00 == ne10); + //GGML_ASSERT(ne03 == ne13); - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); + GGML_ASSERT(src0t == GGML_TYPE_I32); - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_PAD: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); + const int n_as = ((int32_t *) dst->op_params)[1]; - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline; + // TODO: make this more general + GGML_ASSERT(n_as <= 8); - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; + // max size of the src1ids array in the 
kernel stack + GGML_ASSERT(ne11 <= 512); - const int nth = MIN(1024, ne0); + struct ggml_tensor * src2 = gf->nodes[i]->src[2]; - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ARGSORT: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_I32); + const int64_t ne20 = src2 ? src2->ne[0] : 0; + const int64_t ne21 = src2 ? src2->ne[1] : 0; + const int64_t ne22 = src2 ? src2->ne[2] : 0; + const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23); - const int nrows = ggml_nrows(src0); + const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20); + const uint64_t nb21 = src2 ? src2->nb[1] : 0; + const uint64_t nb22 = src2 ? src2->nb[2] : 0; + const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23); - enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; + const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t); - id pipeline = nil; + GGML_ASSERT(!ggml_is_transposed(src2)); + GGML_ASSERT(!ggml_is_transposed(src1)); - switch (order) { - case GGML_SORT_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; - case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; - default: GGML_ASSERT(false); - }; + GGML_ASSERT(src1t == GGML_TYPE_F32); - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + const uint r2 = ne12/ne22; + const uint r3 = ne13/ne23; - [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)]; - } break; - case GGML_OP_LEAKY_RELU: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); + // find the break-even point where the matrix-matrix kernel becomes more efficient compared + // to the matrix-vector kernel + int ne11_mm_min = n_as; - float slope; - memcpy(&slope, dst->op_params, sizeof(float)); + const int idx = ((int32_t *) dst->op_params)[0]; - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline; + // batch size + GGML_ASSERT(ne01 == ne11); - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&slope length:sizeof(slope) atIndex:2]; + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs + // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel + // !!! + // TODO: for now, always use mat-vec kernels until we figure out how to improve the + // indirect matrix multiplication + // !!! 
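
Both the batched MUL_MAT path earlier in this diff and the MUL_MAT_ID path here pick between the tiled matrix-matrix kernel and the matrix-vector kernels with a short predicate: simdgroup-matrix support (the MTLGPUFamilyApple7 check), non-transposed operands, an F32 src1, a row size that is a multiple of 32 and at least 64, and a batch size above the per-type break-even point ne11_mm_min. The following is a condensed C++ sketch of that heuristic, not the actual ggml-metal code; the device capability is passed in as a plain flag instead of being queried from Metal.

```cpp
#include <cstdint>

// Condensed sketch of the MUL_MAT dispatch heuristic shown in this diff
// (illustration only). supports_simdgroup_mm stands in for
// [ctx->device supportsFamily:MTLGPUFamilyApple7].
static bool prefer_mat_mat_kernel(bool supports_simdgroup_mm,
                                  bool src0_transposed, bool src1_transposed,
                                  bool src1_is_f32, bool src0_is_quantized,
                                  int64_t ne00, int64_t ne11, int64_t ne12,
                                  int ne11_mm_min) {
    return supports_simdgroup_mm &&
           !src0_transposed && !src1_transposed &&
           src1_is_f32 &&
           ne00 % 32 == 0 && ne00 >= 64 &&
           (ne11 > ne11_mm_min || (src0_is_quantized && ne12 > 1));
}
```

For MUL_MAT_ID the check uses ne20 in place of ne00 and sets ne11_mm_min to n_as; the `!!!` note just above signals that the indirect case currently leans on the matrix-vector kernels.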
+ if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && + ne20 % 32 == 0 && ne20 >= 64 && + ne11 > ne11_mm_min) { + + id pipeline = nil; - const int64_t n = ggml_nelements(dst); + switch (src2->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32 ].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32 ].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32 ].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32 ].pipeline; break; + case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32 ].pipeline; break; + case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32 ].pipeline; break; + case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32 ].pipeline; break; + case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32 ].pipeline; break; + case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32 ].pipeline; break; + case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break; + case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; + default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); + } - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_DUP: - case GGML_OP_CPY: - case GGML_OP_CONT: - { - GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3]; + [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; + [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:5]; + [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6]; + [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:7]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:8]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:9]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:16]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:17]; + [encoder setBytes:&idx length:sizeof(idx) atIndex:18]; + // TODO: how to make this an array? 
read Metal docs + for (int j = 0; j < 8; ++j) { + // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8 + struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; + + size_t offs_src_cur = 0; + id id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); + + [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j]; + } + + [encoder setThreadgroupMemoryLength:8192 atIndex:0]; - int nth = MIN(1024, ne00/ggml_blck_size(src0->type)); + [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + } else { + int nth0 = 32; + int nth1 = 1; + int nrows = 1; + //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); id pipeline = nil; - switch (src0t) { + // use custom matrix x vector kernel + switch (src2t) { case GGML_TYPE_F32: { - GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0); - - switch (dstt) { - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline; break; - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; break; - case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break; - case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break; - case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break; - //case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break; - //case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break; - default: GGML_ASSERT(false && "not implemented"); - }; + GGML_ASSERT(src1t == GGML_TYPE_F32); + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline; } break; case GGML_TYPE_F16: { - switch (dstt) { - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break; - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break; - default: GGML_ASSERT(false && "not implemented"); - }; + GGML_ASSERT(src1t == GGML_TYPE_F32); + nth0 = 32; + nth1 = 1; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline; + } break; + case GGML_TYPE_Q4_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline; + } break; + case GGML_TYPE_Q4_1: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline; + } break; + case GGML_TYPE_Q5_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline; + } break; + case GGML_TYPE_Q5_1: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline; + } break; + case GGML_TYPE_Q8_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline; + } break; + case GGML_TYPE_Q2_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline; + } break; + case GGML_TYPE_Q3_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline; } break; - default: GGML_ASSERT(false && "not implemented"); + case GGML_TYPE_Q4_K: + { + nth0 = 4; //1; + nth1 = 8; //32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline; + } break; + case GGML_TYPE_Q5_K: + { + nth0 = 2; 
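
The loop just above binds a fixed block of eight buffer slots even when n_as is smaller, cycling through the experts with j % n_as so that no kernel argument is ever left uninitialized (matching the n_as <= 8 assertion earlier in this case). A standalone C++ sketch of the same pattern, with made-up types and IDs in place of Metal buffers:

```cpp
#include <array>
#include <cstdio>
#include <vector>

// Illustrative only: fill a fixed-size argument table from a variable number
// of experts, repeating experts modulo n_as so every slot is bound
// (same trick as dst->src[2 + (j % n_as)] above).
constexpr int kMaxExperts = 8; // mirrors the n_as <= 8 assertion

static std::array<int, kMaxExperts> bind_expert_slots(const std::vector<int> & expert_ids) {
    std::array<int, kMaxExperts> slots{};
    const int n_as = (int) expert_ids.size();
    for (int j = 0; j < kMaxExperts; ++j) {
        slots[j] = expert_ids[j % n_as];
    }
    return slots;
}

int main() {
    const auto slots = bind_expert_slots({10, 11, 12}); // n_as = 3
    for (int s : slots) { printf("%d ", s); }           // prints: 10 11 12 10 11 12 10 11
    printf("\n");
}
```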
+ nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline; + } break; + case GGML_TYPE_Q6_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline; + } break; + case GGML_TYPE_IQ2_XXS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline; + } break; + case GGML_TYPE_IQ2_XS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline; + } break; + default: + { + GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); + GGML_ASSERT(false && "not implemented"); + } + }; + + if (ggml_is_quantized(src2t)) { + GGML_ASSERT(ne20 >= nth0*nth1); } + const int64_t _ne1 = 1; // kernels needs a reference in constant memory + [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3]; + [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; + [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5]; + [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6]; + [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7]; + [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8]; + [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10]; + [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:17]; + [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:19]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:20]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:21]; + [encoder setBytes:&idx length:sizeof(idx) atIndex:22]; + // TODO: how to make this an array? 
read Metal docs + for (int j = 0; j < 8; ++j) { + // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8 + struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; + + size_t offs_src_cur = 0; + id id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); + + [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j]; + } - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - default: - { - GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ASSERT(false); + if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 || + src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 || + src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) { + const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src2t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src2t == GGML_TYPE_Q3_K) { +#ifdef GGML_QKK_64 + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; +#else + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; +#endif + } + else if (src2t == GGML_TYPE_Q5_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src2t == GGML_TYPE_Q6_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else { + const int64_t ny = (_ne1 + nrows - 1)/nrows; + [encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + } + } break; + case GGML_OP_GET_ROWS: + { + id pipeline = nil; + + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32 ].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16 ].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0 ].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1 ].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0 ].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1 ].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0 ].pipeline; break; + case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K ].pipeline; break; + case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K ].pipeline; break; + case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K ].pipeline; break; + case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K 
].pipeline; break; + case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K ].pipeline; break; + case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break; + case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break; + case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; + default: GGML_ASSERT(false && "not implemented"); } - } -#ifndef GGML_METAL_NDEBUG - [encoder popDebugGroup]; -#endif - } + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5]; + [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7]; + [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:10]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; + } break; + case GGML_OP_RMS_NORM: + { + GGML_ASSERT(ne00 % 4 == 0); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + int nth = 32; // SIMD width + + while (nth < ne00/4 && nth < 1024) { + nth *= 2; + } + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; + + const int64_t nrows = ggml_nrows(src0); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_GROUP_NORM: + { + GGML_ASSERT(ne00 % 4 == 0); + + //float eps; + //memcpy(&eps, dst->op_params, sizeof(float)); + + const float eps = 1e-6f; // TODO: temporarily hardcoded + + const int32_t n_groups = ((int32_t *) dst->op_params)[0]; + + int nth = 32; // SIMD width + + //while (nth < ne00/4 && nth < 1024) { + // nth *= 2; + //} + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:5]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8]; + [encoder setBytes:&eps length:sizeof( float) atIndex:9]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_NORM: + { + float eps; 
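
The GGML_OP_RMS_NORM case above reads eps from op_params, grows the threadgroup size in powers of two up to 1024, and launches one threadgroup per row. As a rough CPU reference for what that kernel computes per row (as I understand it, y = x / sqrt(mean(x^2) + eps); the Metal version parallelizes the reduction using the 32-float threadgroup buffer set above), here is a hedged C++ sketch:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Reference row-wise RMS norm: y = x / sqrt(mean(x^2) + eps).
// CPU sketch only; the Metal kernel does the sum with SIMD-group reductions
// and reads eps from dst->op_params.
static void rms_norm_row(const float * x, float * y, int64_t ne00, float eps) {
    double sum = 0.0;
    for (int64_t i = 0; i < ne00; ++i) {
        sum += (double) x[i] * x[i];
    }
    const float scale = 1.0f / std::sqrt((float)(sum / ne00) + eps);
    for (int64_t i = 0; i < ne00; ++i) {
        y[i] = x[i] * scale;
    }
}

int main() {
    std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f};
    std::vector<float> y(x.size());
    rms_norm_row(x.data(), y.data(), (int64_t) x.size(), 1e-5f);
    for (float v : y) { printf("%f ", v); }
    printf("\n");
}
```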
+ memcpy(&eps, dst->op_params, sizeof(float)); + + const int nth = MIN(256, ne00); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0]; + + const int64_t nrows = ggml_nrows(src0); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ALIBI: + { + GGML_ASSERT((src0t == GGML_TYPE_F32)); + + const int nth = MIN(1024, ne00); + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&m0 length:sizeof( float) atIndex:18]; + [encoder setBytes:&m1 length:sizeof( float) atIndex:19]; + [encoder setBytes:&n_heads_log2_floor length:sizeof(int) atIndex:20]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ROPE: + { + GGML_ASSERT(ne10 == ne02); + + const int nth = MIN(1024, ne00); + + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) 
dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + + id pipeline = nil; + + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F16].pipeline; break; + default: GGML_ASSERT(false); + }; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:19]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; + [encoder setBytes:&mode length:sizeof( int) atIndex:21]; + [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22]; + [encoder setBytes:&freq_base length:sizeof( float) atIndex:23]; + [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24]; + [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25]; + [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26]; + [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27]; + [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_IM2COL: + { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16); + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; + + const int32_t N = src1->ne[is_2D ? 3 : 2]; + const int32_t IC = src1->ne[is_2D ? 2 : 1]; + const int32_t IH = is_2D ? src1->ne[1] : 1; + const int32_t IW = src1->ne[0]; + + const int32_t KH = is_2D ? src0->ne[1] : 1; + const int32_t KW = src0->ne[0]; + + const int32_t OH = is_2D ? dst->ne[2] : 1; + const int32_t OW = dst->ne[1]; + + const int32_t CHW = IC * KH * KW; + + const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; + const int32_t ofs1 = src1->nb[is_2D ? 
2 : 1] / 4; + + id pipeline = nil; + + switch (src0->type) { + case GGML_TYPE_F32: GGML_ASSERT(false && "not implemented"); break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break; + default: GGML_ASSERT(false); + }; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ofs0 length:sizeof( int32_t) atIndex:2]; + [encoder setBytes:&ofs1 length:sizeof( int32_t) atIndex:3]; + [encoder setBytes:&IW length:sizeof( int32_t) atIndex:4]; + [encoder setBytes:&IH length:sizeof( int32_t) atIndex:5]; + [encoder setBytes:&CHW length:sizeof( int32_t) atIndex:6]; + [encoder setBytes:&s0 length:sizeof( int32_t) atIndex:7]; + [encoder setBytes:&s1 length:sizeof( int32_t) atIndex:8]; + [encoder setBytes:&p0 length:sizeof( int32_t) atIndex:9]; + [encoder setBytes:&p1 length:sizeof( int32_t) atIndex:10]; + [encoder setBytes:&d0 length:sizeof( int32_t) atIndex:11]; + [encoder setBytes:&d1 length:sizeof( int32_t) atIndex:12]; + + [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)]; + } break; + case GGML_OP_UPSCALE: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + const int sf = dst->op_params[0]; + + const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; + [encoder setBytes:&sf length:sizeof(sf) atIndex:18]; + + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_PAD: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof(ne0) 
atIndex:10]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; + + const int nth = MIN(1024, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ARGSORT: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_I32); + + const int nrows = ggml_nrows(src0); + + enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; + + id pipeline = nil; + + switch (order) { + case GGML_SORT_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; + case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; + default: GGML_ASSERT(false); + }; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + + [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)]; + } break; + case GGML_OP_LEAKY_RELU: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + float slope; + memcpy(&slope, dst->op_params, sizeof(float)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&slope length:sizeof(slope) atIndex:2]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_DUP: + case GGML_OP_CPY: + case GGML_OP_CONT: + { + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + + int nth = MIN(1024, ne00/ggml_blck_size(src0->type)); + + id pipeline = nil; + + switch (src0t) { + case GGML_TYPE_F32: + { + GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0); + + switch (dstt) { + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline; break; + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break; + //case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break; + //case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break; + default: GGML_ASSERT(false && "not implemented"); + }; + } break; + case GGML_TYPE_F16: + { + switch (dstt) { + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break; + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break; + default: GGML_ASSERT(false && "not implemented"); + }; + } break; + default: GGML_ASSERT(false && "not implemented"); + } - if (encoder != nil) { - [encoder endEncoding]; - encoder 
= nil; + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + default: + { + GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_ASSERT(false); + } } - [command_buffer commit]; - }); - } +#ifndef GGML_METAL_NDEBUG + [encoder popDebugGroup]; +#endif + } + + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } - // wait for all threads to finish - dispatch_barrier_sync(ctx->d_queue, ^{}); + [command_buffer commit]; + }); // check status of command buffers // needed to detect if the device ran out-of-memory for example (#1881) From 862f5e41ab1fdf12d6f59455aad3f5dd8258f805 Mon Sep 17 00:00:00 2001 From: Neuman Vong Date: Wed, 17 Jan 2024 00:47:34 +1100 Subject: [PATCH 077/131] android : introduce starter project example (#4926) * Introduce starter project for Android Based on examples/llama.swiftui. 
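For orientation, a minimal sketch (not part of the patch itself) of how the app is expected to drive the `Llm` wrapper introduced here, e.g. from a coroutine launched in the ViewModel; the function name, model path and prompt below are illustrative placeholders:

```kotlin
package com.example.llama

import kotlinx.coroutines.flow.collect

// Sketch only: runPrompt, the model path and the prompt are placeholders;
// instance()/load()/send()/unload() are the Llm wrapper added by this patch.
suspend fun runPrompt(
    path: String = "/data/data/com.example.llama/files/phi-2-q4_0.gguf"
): String {
    val llm = Llm.instance()          // process-wide singleton from Llm.kt
    llm.load(path)                    // throws IllegalStateException on failure
    val out = StringBuilder()
    llm.send("Hello, llama!")         // Flow of generated text pieces
        .collect { piece -> out.append(piece) }
    llm.unload()                      // frees the native context, model and batch
    return out.toString()
}
```

Failures from the native layer surface as `IllegalStateException`, which `MainViewModel` catches and appends to the on-screen message log.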
* Add github workflow * Set NDK version * Only build arm64-v8a in CI * Sync bench code * Rename CI prop to skip-armeabi-v7a * Remove unused tests --- .github/workflows/build.yml | 25 ++ examples/llama.android/.gitignore | 33 ++ examples/llama.android/README.md | 0 examples/llama.android/app/.gitignore | 1 + examples/llama.android/app/build.gradle.kts | 91 ++++ examples/llama.android/app/proguard-rules.pro | 21 + .../app/src/main/AndroidManifest.xml | 30 ++ .../app/src/main/cpp/CMakeLists.txt | 50 +++ .../app/src/main/cpp/llama-android.cpp | 394 ++++++++++++++++++ .../java/com/example/llama/Downloadable.kt | 119 ++++++ .../src/main/java/com/example/llama/Llm.kt | 172 ++++++++ .../java/com/example/llama/MainActivity.kt | 154 +++++++ .../java/com/example/llama/MainViewModel.kt | 104 +++++ .../java/com/example/llama/ui/theme/Color.kt | 11 + .../java/com/example/llama/ui/theme/Theme.kt | 70 ++++ .../java/com/example/llama/ui/theme/Type.kt | 34 ++ .../res/drawable/ic_launcher_background.xml | 170 ++++++++ .../res/drawable/ic_launcher_foreground.xml | 30 ++ .../main/res/mipmap-anydpi/ic_launcher.xml | 6 + .../res/mipmap-anydpi/ic_launcher_round.xml | 6 + .../src/main/res/mipmap-hdpi/ic_launcher.webp | Bin 0 -> 1404 bytes .../res/mipmap-hdpi/ic_launcher_round.webp | Bin 0 -> 2898 bytes .../src/main/res/mipmap-mdpi/ic_launcher.webp | Bin 0 -> 982 bytes .../res/mipmap-mdpi/ic_launcher_round.webp | Bin 0 -> 1772 bytes .../main/res/mipmap-xhdpi/ic_launcher.webp | Bin 0 -> 1900 bytes .../res/mipmap-xhdpi/ic_launcher_round.webp | Bin 0 -> 3918 bytes .../main/res/mipmap-xxhdpi/ic_launcher.webp | Bin 0 -> 2884 bytes .../res/mipmap-xxhdpi/ic_launcher_round.webp | Bin 0 -> 5914 bytes .../main/res/mipmap-xxxhdpi/ic_launcher.webp | Bin 0 -> 3844 bytes .../res/mipmap-xxxhdpi/ic_launcher_round.webp | Bin 0 -> 7778 bytes .../app/src/main/res/values/colors.xml | 10 + .../app/src/main/res/values/strings.xml | 3 + .../app/src/main/res/values/themes.xml | 5 + .../app/src/main/res/xml/backup_rules.xml | 13 + .../main/res/xml/data_extraction_rules.xml | 19 + examples/llama.android/build.gradle.kts | 5 + examples/llama.android/gradle.properties | 23 + .../gradle/wrapper/gradle-wrapper.jar | Bin 0 -> 59203 bytes .../gradle/wrapper/gradle-wrapper.properties | 6 + examples/llama.android/gradlew | 185 ++++++++ examples/llama.android/settings.gradle.kts | 17 + 41 files changed, 1807 insertions(+) create mode 100644 examples/llama.android/.gitignore create mode 100644 examples/llama.android/README.md create mode 100644 examples/llama.android/app/.gitignore create mode 100644 examples/llama.android/app/build.gradle.kts create mode 100644 examples/llama.android/app/proguard-rules.pro create mode 100644 examples/llama.android/app/src/main/AndroidManifest.xml create mode 100644 examples/llama.android/app/src/main/cpp/CMakeLists.txt create mode 100644 examples/llama.android/app/src/main/cpp/llama-android.cpp create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/Llm.kt create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt create mode 100644 
examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt create mode 100644 examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml create mode 100644 examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml create mode 100644 examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml create mode 100644 examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml create mode 100644 examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp create mode 100644 examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp create mode 100644 examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp create mode 100644 examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp create mode 100644 examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp create mode 100644 examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp create mode 100644 examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp create mode 100644 examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp create mode 100644 examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp create mode 100644 examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp create mode 100644 examples/llama.android/app/src/main/res/values/colors.xml create mode 100644 examples/llama.android/app/src/main/res/values/strings.xml create mode 100644 examples/llama.android/app/src/main/res/values/themes.xml create mode 100644 examples/llama.android/app/src/main/res/xml/backup_rules.xml create mode 100644 examples/llama.android/app/src/main/res/xml/data_extraction_rules.xml create mode 100644 examples/llama.android/build.gradle.kts create mode 100644 examples/llama.android/gradle.properties create mode 100644 examples/llama.android/gradle/wrapper/gradle-wrapper.jar create mode 100644 examples/llama.android/gradle/wrapper/gradle-wrapper.properties create mode 100755 examples/llama.android/gradlew create mode 100644 examples/llama.android/settings.gradle.kts diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0a28a11112251..367df07a7e497 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -515,6 +515,31 @@ jobs: - name: Build Xcode project run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build + android-build: + runs-on: ubuntu-latest + + steps: + - name: Clone + uses: actions/checkout@v3 + + - name: Set up JDK + uses: actions/setup-java@v3 + with: + java-version: 17 + distribution: zulu + + - name: Setup Android SDK + uses: android-actions/setup-android@v3 + with: + log-accepted-android-sdk-licenses: false + + - name: Build + run: | + cd examples/llama.android + + # Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820). 
+ ./gradlew build --no-daemon -Pskip-armeabi-v7a + # freeBSD-latest: # runs-on: macos-12 # steps: diff --git a/examples/llama.android/.gitignore b/examples/llama.android/.gitignore new file mode 100644 index 0000000000000..347e252ef10e9 --- /dev/null +++ b/examples/llama.android/.gitignore @@ -0,0 +1,33 @@ +# Gradle files +.gradle/ +build/ + +# Local configuration file (sdk path, etc) +local.properties + +# Log/OS Files +*.log + +# Android Studio generated files and folders +captures/ +.externalNativeBuild/ +.cxx/ +*.apk +output.json + +# IntelliJ +*.iml +.idea/ +misc.xml +deploymentTargetDropDown.xml +render.experimental.xml + +# Keystore files +*.jks +*.keystore + +# Google Services (e.g. APIs or Firebase) +google-services.json + +# Android Profiling +*.hprof diff --git a/examples/llama.android/README.md b/examples/llama.android/README.md new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/examples/llama.android/app/.gitignore b/examples/llama.android/app/.gitignore new file mode 100644 index 0000000000000..796b96d1c4023 --- /dev/null +++ b/examples/llama.android/app/.gitignore @@ -0,0 +1 @@ +/build diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts new file mode 100644 index 0000000000000..7815a802593ba --- /dev/null +++ b/examples/llama.android/app/build.gradle.kts @@ -0,0 +1,91 @@ +plugins { + id("com.android.application") + id("org.jetbrains.kotlin.android") +} + +android { + namespace = "com.example.llama" + compileSdk = 34 + + ndkVersion = "26.1.10909125" + + defaultConfig { + applicationId = "com.example.llama" + minSdk = 33 + targetSdk = 34 + versionCode = 1 + versionName = "1.0" + + testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" + vectorDrawables { + useSupportLibrary = true + } + ndk { + // Workaround for https://github.com/llvm/llvm-project/issues/65820 + // affecting armeabi-v7a. Skip armeabi-v7a when invoked with + // -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a). 
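+            // Without the property, abiFilters is left untouched and Gradle
+            // builds the default ABI set, armeabi-v7a included.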
+ if (project.hasProperty("skip-armeabi-v7a")) { + abiFilters += listOf("arm64-v8a", "x86_64", "x86") + } + } + externalNativeBuild { + cmake { + cppFlags += listOf() + arguments += listOf() + } + } + } + + buildTypes { + release { + isMinifyEnabled = false + proguardFiles( + getDefaultProguardFile("proguard-android-optimize.txt"), + "proguard-rules.pro" + ) + } + } + compileOptions { + sourceCompatibility = JavaVersion.VERSION_1_8 + targetCompatibility = JavaVersion.VERSION_1_8 + } + kotlinOptions { + jvmTarget = "1.8" + } + buildFeatures { + compose = true + } + composeOptions { + kotlinCompilerExtensionVersion = "1.5.1" + } + packaging { + resources { + excludes += "/META-INF/{AL2.0,LGPL2.1}" + } + } + externalNativeBuild { + cmake { + path = file("src/main/cpp/CMakeLists.txt") + version = "3.22.1" + } + } +} + +dependencies { + + implementation("androidx.core:core-ktx:1.12.0") + implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2") + implementation("androidx.activity:activity-compose:1.8.2") + implementation(platform("androidx.compose:compose-bom:2023.08.00")) + implementation("androidx.compose.ui:ui") + implementation("androidx.compose.ui:ui-graphics") + implementation("androidx.compose.ui:ui-tooling-preview") + implementation("androidx.compose.material3:material3") + testImplementation("junit:junit:4.13.2") + androidTestImplementation("androidx.test.ext:junit:1.1.5") + androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") + androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00")) + androidTestImplementation("androidx.compose.ui:ui-test-junit4") + debugImplementation("androidx.compose.ui:ui-tooling") + debugImplementation("androidx.compose.ui:ui-test-manifest") +} diff --git a/examples/llama.android/app/proguard-rules.pro b/examples/llama.android/app/proguard-rules.pro new file mode 100644 index 0000000000000..f1b424510da51 --- /dev/null +++ b/examples/llama.android/app/proguard-rules.pro @@ -0,0 +1,21 @@ +# Add project specific ProGuard rules here. +# You can control the set of applied configuration files using the +# proguardFiles setting in build.gradle. +# +# For more details, see +# http://developer.android.com/guide/developing/tools/proguard.html + +# If your project uses WebView with JS, uncomment the following +# and specify the fully qualified class name to the JavaScript interface +# class: +#-keepclassmembers class fqcn.of.javascript.interface.for.webview { +# public *; +#} + +# Uncomment this to preserve the line number information for +# debugging stack traces. +#-keepattributes SourceFile,LineNumberTable + +# If you keep the line number information, uncomment this to +# hide the original source file name. +#-renamesourcefileattribute SourceFile diff --git a/examples/llama.android/app/src/main/AndroidManifest.xml b/examples/llama.android/app/src/main/AndroidManifest.xml new file mode 100644 index 0000000000000..41a358a299154 --- /dev/null +++ b/examples/llama.android/app/src/main/AndroidManifest.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + diff --git a/examples/llama.android/app/src/main/cpp/CMakeLists.txt b/examples/llama.android/app/src/main/cpp/CMakeLists.txt new file mode 100644 index 0000000000000..85139329aa082 --- /dev/null +++ b/examples/llama.android/app/src/main/cpp/CMakeLists.txt @@ -0,0 +1,50 @@ + +# For more information about using CMake with Android Studio, read the +# documentation: https://d.android.com/studio/projects/add-native-code.html. 
+# For more examples on how to use CMake, see https://github.com/android/ndk-samples. + +# Sets the minimum CMake version required for this project. +cmake_minimum_required(VERSION 3.22.1) + +# Declares the project name. The project name can be accessed via ${ PROJECT_NAME}, +# Since this is the top level CMakeLists.txt, the project name is also accessible +# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level +# build script scope). +project("llama-android") + +include(FetchContent) +FetchContent_Declare( + llama + GIT_REPOSITORY https://github.com/ggerganov/llama.cpp + GIT_TAG master +) + +# Also provides "common" +FetchContent_MakeAvailable(llama) + +# Creates and names a library, sets it as either STATIC +# or SHARED, and provides the relative paths to its source code. +# You can define multiple libraries, and CMake builds them for you. +# Gradle automatically packages shared libraries with your APK. +# +# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define +# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME} +# is preferred for the same purpose. +# +# In order to load a library into your app from Java/Kotlin, you must call +# System.loadLibrary() and pass the name of the library defined here; +# for GameActivity/NativeActivity derived applications, the same library name must be +# used in the AndroidManifest.xml file. +add_library(${CMAKE_PROJECT_NAME} SHARED + # List C/C++ source files with relative paths to this CMakeLists.txt. + llama-android.cpp) + +# Specifies libraries CMake should link to your target library. You +# can link libraries from various origins, such as libraries defined in this +# build script, prebuilt third-party libraries, or Android system libraries. +target_link_libraries(${CMAKE_PROJECT_NAME} + # List libraries link to the target library + llama + common + android + log) diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/app/src/main/cpp/llama-android.cpp new file mode 100644 index 0000000000000..d5e705dce6ca0 --- /dev/null +++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp @@ -0,0 +1,394 @@ +#include +#include +#include +#include +#include +#include +#include "llama.h" +#include "common/common.h" + +// Write C++ code here. +// +// Do not forget to dynamically load the C++ library into your application. +// +// For instance, +// +// In MainActivity.java: +// static { +// System.loadLibrary("llama-android"); +// } +// +// Or, in MainActivity.kt: +// companion object { +// init { +// System.loadLibrary("llama-android") +// } +// } + +#define TAG "llama-android.cpp" +#define LOGi(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__) +#define LOGe(...) 
__android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__) + +jclass la_int_var; +jmethodID la_int_var_value; +jmethodID la_int_var_inc; + +static void log_callback(ggml_log_level level, const char * fmt, void * data) { + if (level == GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data); + else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data); + else if (level == GGML_LOG_LEVEL_WARN) __android_log_print(ANDROID_LOG_WARN, TAG, fmt, data); + else __android_log_print(ANDROID_LOG_DEFAULT, TAG, fmt, data); +} + +extern "C" +JNIEXPORT jlong JNICALL +Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) { + llama_model_params model_params = llama_model_default_params(); + + auto path_to_model = env->GetStringUTFChars(filename, 0); + LOGi("Loading model from %s", path_to_model); + + auto model = llama_load_model_from_file(path_to_model, model_params); + env->ReleaseStringUTFChars(filename, path_to_model); + + if (!model) { + LOGe("load_model() failed"); + env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), "load_model() failed"); + return 0; + } + + return reinterpret_cast(model); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) { + llama_free_model(reinterpret_cast(model)); +} + +extern "C" +JNIEXPORT jlong JNICALL +Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) { + auto model = reinterpret_cast(jmodel); + + if (!model) { + LOGe("new_context(): model cannot be null"); + env->ThrowNew(env->FindClass("java/lang/IllegalArgumentException"), "Model cannot be null"); + return 0; + } + + int n_threads = std::max(1, std::min(8, (int) sysconf(_SC_NPROCESSORS_ONLN) - 2)); + LOGi("Using %d threads", n_threads); + + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.seed = 1234; + ctx_params.n_ctx = 2048; + ctx_params.n_threads = n_threads; + ctx_params.n_threads_batch = n_threads; + + llama_context * context = llama_new_context_with_model(model, ctx_params); + + if (!context) { + LOGe("llama_new_context_with_model() returned null)"); + env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), + "llama_new_context_with_model() returned null)"); + return 0; + } + + return reinterpret_cast(context); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) { + llama_free(reinterpret_cast(context)); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) { + llama_backend_free(); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) { + llama_log_set(log_callback, NULL); +} + +extern "C" +JNIEXPORT jstring JNICALL +Java_com_example_llama_Llm_bench_1model( + JNIEnv *env, + jobject, + jlong context_pointer, + jlong model_pointer, + jlong batch_pointer, + jint pp, + jint tg, + jint pl, + jint nr + ) { + auto pp_avg = 0.0; + auto tg_avg = 0.0; + auto pp_std = 0.0; + auto tg_std = 0.0; + + const auto context = reinterpret_cast(context_pointer); + const auto model = reinterpret_cast(model_pointer); + const auto batch = reinterpret_cast(batch_pointer); + + const int n_ctx = llama_n_ctx(context); + + LOGi("n_ctx = %d", n_ctx); + + int i, j; + int nri; + for (nri = 0; nri < nr; nri++) { + LOGi("Benchmark prompt processing (pp)"); + + llama_batch_clear(*batch); + + const int n_tokens = pp; + for (i = 0; i < n_tokens; i++) { + 
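+            // Prompt-processing benchmark: append `pp` dummy tokens (id 0) to
+            // sequence 0 at consecutive positions; only the final token requests
+            // logits, which is set right after this loop.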
llama_batch_add(*batch, 0, i, { 0 }, false); + } + + batch->logits[batch->n_tokens - 1] = true; + llama_kv_cache_clear(context); + + const auto t_pp_start = ggml_time_us(); + if (llama_decode(context, *batch) != 0) { + LOGi("llama_decode() failed during prompt processing"); + } + const auto t_pp_end = ggml_time_us(); + + // bench text generation + + LOGi("Benchmark text generation (tg)"); + + llama_kv_cache_clear(context); + const auto t_tg_start = ggml_time_us(); + for (i = 0; i < tg; i++) { + + llama_batch_clear(*batch); + for (j = 0; j < pl; j++) { + llama_batch_add(*batch, 0, i, { j }, true); + } + + LOGi("llama_decode() text generation: %d", i); + if (llama_decode(context, *batch) != 0) { + LOGi("llama_decode() failed during text generation"); + } + } + + const auto t_tg_end = ggml_time_us(); + + llama_kv_cache_clear(context); + + const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0; + const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0; + + const auto speed_pp = double(pp) / t_pp; + const auto speed_tg = double(pl * tg) / t_tg; + + pp_avg += speed_pp; + tg_avg += speed_tg; + + pp_std += speed_pp * speed_pp; + tg_std += speed_tg * speed_tg; + + LOGi("pp %f t/s, tg %f t/s", speed_pp, speed_tg); + } + + pp_avg /= double(nr); + tg_avg /= double(nr); + + if (nr > 1) { + pp_std = sqrt(pp_std / double(nr - 1) - pp_avg * pp_avg * double(nr) / double(nr - 1)); + tg_std = sqrt(tg_std / double(nr - 1) - tg_avg * tg_avg * double(nr) / double(nr - 1)); + } else { + pp_std = 0; + tg_std = 0; + } + + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + + const auto model_size = double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0; + const auto model_n_params = double(llama_model_n_params(model)) / 1e9; + + const auto backend = "(Android)"; // TODO: What should this be? + + std::stringstream result; + result << std::setprecision(2); + result << "| model | size | params | backend | test | t/s |\n"; + result << "| --- | --- | --- | --- | --- | --- |\n"; + result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | pp " << pp << " | " << pp_avg << " ± " << pp_std << " |\n"; + result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | tg " << tg << " | " << tg_avg << " ± " << tg_std << " |\n"; + + return env->NewStringUTF(result.str().c_str()); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) { + llama_batch_free(*reinterpret_cast(batch_pointer)); +} + +extern "C" +JNIEXPORT jlong JNICALL +Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) { + + // Source: Copy of llama.cpp:llama_batch_init but heap-allocated. 
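+    // The batch is allocated with `new` (rather than returned by value as in
+    // llama_batch_init) so its address can be handed back to Kotlin as a jlong
+    // handle and released later through free_batch().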
+ + llama_batch *batch = new llama_batch { + 0, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + 0, + 0, + 0, + }; + + if (embd) { + batch->embd = (float *) malloc(sizeof(float) * n_tokens * embd); + } else { + batch->token = (llama_token *) malloc(sizeof(llama_token) * n_tokens); + } + + batch->pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens); + batch->n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens); + batch->seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens); + for (int i = 0; i < n_tokens; ++i) { + batch->seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max); + } + batch->logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens); + + return reinterpret_cast(batch); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean numa) { + llama_backend_init(numa); +} + +extern "C" +JNIEXPORT jstring JNICALL +Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) { + return env->NewStringUTF(llama_print_system_info()); +} + +extern "C" +JNIEXPORT jint JNICALL +Java_com_example_llama_Llm_completion_1init( + JNIEnv *env, + jobject, + jlong context_pointer, + jlong batch_pointer, + jstring jtext, + jint n_len + ) { + + const auto text = env->GetStringUTFChars(jtext, 0); + const auto context = reinterpret_cast(context_pointer); + const auto batch = reinterpret_cast(batch_pointer); + + const auto tokens_list = llama_tokenize(context, text, 1); + + auto n_ctx = llama_n_ctx(context); + auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); + + LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req); + + if (n_kv_req > n_ctx) { + LOGe("error: n_kv_req > n_ctx, the required KV cache size is not big enough"); + } + + for (auto id : tokens_list) { + LOGi("%s", llama_token_to_piece(context, id).c_str()); + } + + llama_batch_clear(*batch); + + // evaluate the initial prompt + for (auto i = 0; i < tokens_list.size(); i++) { + llama_batch_add(*batch, tokens_list[i], i, { 0 }, false); + } + + // llama_decode will output logits only for the last token of the prompt + batch->logits[batch->n_tokens - 1] = true; + + if (llama_decode(context, *batch) != 0) { + LOGe("llama_decode() failed"); + } + + env->ReleaseStringUTFChars(jtext, text); + + return batch->n_tokens; +} + +extern "C" +JNIEXPORT jstring JNICALL +Java_com_example_llama_Llm_completion_1loop( + JNIEnv * env, + jobject, + jlong context_pointer, + jlong batch_pointer, + jint n_len, + jobject intvar_ncur +) { + const auto context = reinterpret_cast(context_pointer); + const auto batch = reinterpret_cast(batch_pointer); + const auto model = llama_get_model(context); + + if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur); + if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I"); + if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V"); + + auto n_vocab = llama_n_vocab(model); + auto logits = llama_get_logits_ith(context, batch->n_tokens - 1); + + std::vector candidates; + candidates.reserve(n_vocab); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // sample the most likely token + const auto new_token_id = llama_sample_token_greedy(context, &candidates_p); + + const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); + if 
(new_token_id == llama_token_eos(model) || n_cur == n_len) { + return env->NewStringUTF(""); + } + + auto new_token_chars = llama_token_to_piece(context, new_token_id); + LOGi("new_token_chars: `%s`", new_token_chars.c_str()); + auto new_token = env->NewStringUTF(new_token_chars.c_str()); + + llama_batch_clear(*batch); + llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true); + + env->CallVoidMethod(intvar_ncur, la_int_var_inc); + + if (llama_decode(context, *batch) != 0) { + LOGe("llama_decode() returned null"); + } + + return new_token; +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) { + llama_kv_cache_clear(reinterpret_cast(context)); +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt new file mode 100644 index 0000000000000..78c231ae55d8c --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt @@ -0,0 +1,119 @@ +package com.example.llama + +import android.app.DownloadManager +import android.net.Uri +import android.util.Log +import androidx.compose.material3.Button +import androidx.compose.material3.Text +import androidx.compose.runtime.Composable +import androidx.compose.runtime.getValue +import androidx.compose.runtime.mutableDoubleStateOf +import androidx.compose.runtime.mutableStateOf +import androidx.compose.runtime.remember +import androidx.compose.runtime.rememberCoroutineScope +import androidx.compose.runtime.setValue +import androidx.core.database.getLongOrNull +import androidx.core.net.toUri +import kotlinx.coroutines.delay +import kotlinx.coroutines.launch +import java.io.File + +data class Downloadable(val name: String, val source: Uri, val destination: File) { + companion object { + @JvmStatic + private val tag: String? 
= this::class.qualifiedName + + sealed interface State + data object Ready: State + data class Downloading(val id: Long): State + data class Downloaded(val downloadable: Downloadable): State + data class Error(val message: String): State + + @JvmStatic + @Composable + fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) { + var status: State by remember { + mutableStateOf( + if (item.destination.exists()) Downloaded(item) + else Ready + ) + } + var progress by remember { mutableDoubleStateOf(0.0) } + + val coroutineScope = rememberCoroutineScope() + + suspend fun waitForDownload(result: Downloading, item: Downloadable): State { + while (true) { + val cursor = dm.query(DownloadManager.Query().setFilterById(result.id)) + + if (cursor == null) { + Log.e(tag, "dm.query() returned null") + return Error("dm.query() returned null") + } + + if (!cursor.moveToFirst() || cursor.count < 1) { + cursor.close() + Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?") + return Ready + } + + val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR) + val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES) + val sofar = cursor.getLongOrNull(pix) ?: 0 + val total = cursor.getLongOrNull(tix) ?: 1 + cursor.close() + + if (sofar == total) { + return Downloaded(item) + } + + progress = (sofar * 1.0) / total + + delay(1000L) + } + } + + fun onClick() { + when (val s = status) { + is Downloaded -> { + viewModel.load(item.destination.path) + } + + is Downloading -> { + coroutineScope.launch { + status = waitForDownload(s, item) + } + } + + else -> { + item.destination.delete() + + val request = DownloadManager.Request(item.source).apply { + setTitle("Downloading model") + setDescription("Downloading model: ${item.name}") + setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI) + setDestinationUri(item.destination.toUri()) + } + + viewModel.log("Saving ${item.name} to ${item.destination.path}") + Log.i(tag, "Saving ${item.name} to ${item.destination.path}") + + val id = dm.enqueue(request) + status = Downloading(id) + onClick() + } + } + } + + Button(onClick = { onClick() }, enabled = status !is Downloading) { + when (status) { + is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%") + is Downloaded -> Text("Load ${item.name}") + is Ready -> Text("Download ${item.name}") + is Error -> Text("Download ${item.name}") + } + } + } + + } +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt new file mode 100644 index 0000000000000..5f32703724a49 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt @@ -0,0 +1,172 @@ +package com.example.llama + +import android.util.Log +import kotlinx.coroutines.CoroutineDispatcher +import kotlinx.coroutines.asCoroutineDispatcher +import kotlinx.coroutines.flow.Flow +import kotlinx.coroutines.flow.flow +import kotlinx.coroutines.flow.flowOn +import kotlinx.coroutines.withContext +import java.util.concurrent.Executors +import kotlin.concurrent.thread + +class Llm { + private val tag: String? 
= this::class.simpleName + + private val threadLocalState: ThreadLocal = ThreadLocal.withInitial { State.Idle } + + private val runLoop: CoroutineDispatcher = Executors.newSingleThreadExecutor { + thread(start = false, name = "Llm-RunLoop") { + Log.d(tag, "Dedicated thread for native code: ${Thread.currentThread().name}") + + // No-op if called more than once. + System.loadLibrary("llama-android") + + // Set llama log handler to Android + log_to_android() + backend_init(false) + + Log.d(tag, system_info()) + + it.run() + }.apply { + uncaughtExceptionHandler = Thread.UncaughtExceptionHandler { _, exception: Throwable -> + Log.e(tag, "Unhandled exception", exception) + } + } + }.asCoroutineDispatcher() + + private val nlen: Int = 64 + + private external fun log_to_android() + private external fun load_model(filename: String): Long + private external fun free_model(model: Long) + private external fun new_context(model: Long): Long + private external fun free_context(context: Long) + private external fun backend_init(numa: Boolean) + private external fun backend_free() + private external fun free_batch(batch: Long) + private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long + private external fun bench_model( + context: Long, + model: Long, + batch: Long, + pp: Int, + tg: Int, + pl: Int, + nr: Int + ): String + + private external fun system_info(): String + + private external fun completion_init( + context: Long, + batch: Long, + text: String, + nLen: Int + ): Int + + private external fun completion_loop( + context: Long, + batch: Long, + nLen: Int, + ncur: IntVar + ): String + + private external fun kv_cache_clear(context: Long) + + suspend fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1): String { + return withContext(runLoop) { + when (val state = threadLocalState.get()) { + is State.Loaded -> { + Log.d(tag, "bench(): $state") + bench_model(state.context, state.model, state.batch, pp, tg, pl, nr) + } + + else -> throw IllegalStateException("No model loaded") + } + } + } + + suspend fun load(pathToModel: String) { + withContext(runLoop) { + when (threadLocalState.get()) { + is State.Idle -> { + val model = load_model(pathToModel) + if (model == 0L) throw IllegalStateException("load_model() failed") + + val context = new_context(model) + if (context == 0L) throw IllegalStateException("new_context() failed") + + val batch = new_batch(512, 0, 1) + if (batch == 0L) throw IllegalStateException("new_batch() failed") + + Log.i(tag, "Loaded model $pathToModel") + threadLocalState.set(State.Loaded(model, context, batch)) + } + else -> throw IllegalStateException("Model already loaded") + } + } + } + + fun send(message: String): Flow = flow { + when (val state = threadLocalState.get()) { + is State.Loaded -> { + val ncur = IntVar(completion_init(state.context, state.batch, message, nlen)) + while (ncur.value <= nlen) { + val str = completion_loop(state.context, state.batch, nlen, ncur) + if (str.isEmpty()) { + break + } + emit(str) + } + kv_cache_clear(state.context) + } + else -> {} + } + }.flowOn(runLoop) + + /** + * Unloads the model and frees resources. + * + * This is a no-op if there's no model loaded. 
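+     *
+     * Like [load] and [send], this switches to the single-threaded [runLoop]
+     * dispatcher, so the thread-local [State] it clears is the same state that
+     * the native calls were issued against.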
+ */ + suspend fun unload() { + withContext(runLoop) { + when (val state = threadLocalState.get()) { + is State.Loaded -> { + free_context(state.context) + free_model(state.model) + free_batch(state.batch) + + threadLocalState.set(State.Idle) + } + else -> {} + } + } + } + + companion object { + private class IntVar(value: Int) { + @Volatile + var value: Int = value + private set + + fun inc() { + synchronized(this) { + value += 1 + } + } + } + + private sealed interface State { + data object Idle: State + data class Loaded(val model: Long, val context: Long, val batch: Long): State + } + + // Enforce only one instance of Llm. + private val _instance: Llm = Llm() + + fun instance(): Llm = _instance + } +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt new file mode 100644 index 0000000000000..9da04f7d3c32e --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt @@ -0,0 +1,154 @@ +package com.example.llama + +import android.app.ActivityManager +import android.app.DownloadManager +import android.content.ClipData +import android.content.ClipboardManager +import android.net.Uri +import android.os.Bundle +import android.os.StrictMode +import android.os.StrictMode.VmPolicy +import android.text.format.Formatter +import androidx.activity.ComponentActivity +import androidx.activity.compose.setContent +import androidx.activity.viewModels +import androidx.compose.foundation.layout.Box +import androidx.compose.foundation.layout.Column +import androidx.compose.foundation.layout.Row +import androidx.compose.foundation.layout.fillMaxSize +import androidx.compose.foundation.layout.padding +import androidx.compose.foundation.lazy.LazyColumn +import androidx.compose.foundation.lazy.items +import androidx.compose.foundation.lazy.rememberLazyListState +import androidx.compose.material3.Button +import androidx.compose.material3.LocalContentColor +import androidx.compose.material3.MaterialTheme +import androidx.compose.material3.OutlinedTextField +import androidx.compose.material3.Surface +import androidx.compose.material3.Text +import androidx.compose.runtime.Composable +import androidx.compose.ui.Modifier +import androidx.compose.ui.unit.dp +import androidx.core.content.getSystemService +import com.example.llama.ui.theme.LlamaAndroidTheme +import java.io.File + +class MainActivity( + activityManager: ActivityManager? = null, + downloadManager: DownloadManager? = null, + clipboardManager: ClipboardManager? = null, +): ComponentActivity() { + private val tag: String? = this::class.simpleName + + private val activityManager by lazy { activityManager ?: getSystemService()!! } + private val downloadManager by lazy { downloadManager ?: getSystemService()!! } + private val clipboardManager by lazy { clipboardManager ?: getSystemService()!! } + + private val viewModel: MainViewModel by viewModels() + + // Get a MemoryInfo object for the device's current memory status. + private fun availableMemory(): ActivityManager.MemoryInfo { + return ActivityManager.MemoryInfo().also { memoryInfo -> + activityManager.getMemoryInfo(memoryInfo) + } + } + + override fun onCreate(savedInstanceState: Bundle?) 
{ + super.onCreate(savedInstanceState) + + StrictMode.setVmPolicy( + VmPolicy.Builder(StrictMode.getVmPolicy()) + .detectLeakedClosableObjects() + .build() + ) + + val free = Formatter.formatFileSize(this, availableMemory().availMem) + val total = Formatter.formatFileSize(this, availableMemory().totalMem) + + viewModel.log("Current memory: $free / $total") + viewModel.log("Downloads directory: ${getExternalFilesDir(null)}") + + val extFilesDir = getExternalFilesDir(null) + + val models = listOf( + Downloadable( + "Phi-2 7B (Q4_0, 1.6 GiB)", + Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"), + File(extFilesDir, "phi-2-q4_0.gguf"), + ), + Downloadable( + "TinyLlama 1.1B (f16, 2.2 GiB)", + Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"), + File(extFilesDir, "tinyllama-1.1-f16.gguf"), + ), + Downloadable( + "Phi 2 DPO (Q3_K_M, 1.48 GiB)", + Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"), + File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf") + ), + ) + + setContent { + LlamaAndroidTheme { + // A surface container using the 'background' color from the theme + Surface( + modifier = Modifier.fillMaxSize(), + color = MaterialTheme.colorScheme.background + ) { + MainCompose( + viewModel, + clipboardManager, + downloadManager, + models, + ) + } + + } + } + } +} + +@Composable +fun MainCompose( + viewModel: MainViewModel, + clipboard: ClipboardManager, + dm: DownloadManager, + models: List +) { + Column { + val scrollState = rememberLazyListState() + + Box(modifier = Modifier.weight(1f)) { + LazyColumn(state = scrollState) { + items(viewModel.messages) { + Text( + it, + style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current), + modifier = Modifier.padding(16.dp) + ) + } + } + } + OutlinedTextField( + value = viewModel.message, + onValueChange = { viewModel.updateMessage(it) }, + label = { Text("Message") }, + ) + Row { + Button({ viewModel.send() }) { Text("Send") } + Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") } + Button({ viewModel.clear() }) { Text("Clear") } + Button({ + viewModel.messages.joinToString("\n").let { + clipboard.setPrimaryClip(ClipData.newPlainText("", it)) + } + }) { Text("Copy") } + } + + Column { + for (model in models) { + Downloadable.Button(viewModel, dm, model) + } + } + } +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt new file mode 100644 index 0000000000000..be95e22218332 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt @@ -0,0 +1,104 @@ +package com.example.llama + +import android.util.Log +import androidx.compose.runtime.getValue +import androidx.compose.runtime.mutableStateOf +import androidx.compose.runtime.setValue +import androidx.lifecycle.ViewModel +import androidx.lifecycle.viewModelScope +import kotlinx.coroutines.flow.catch +import kotlinx.coroutines.launch + +class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() { + companion object { + @JvmStatic + private val NanosPerSecond = 1_000_000_000.0 + } + + private val tag: String? 
= this::class.simpleName + + var messages by mutableStateOf(listOf("Initializing...")) + private set + + var message by mutableStateOf("") + private set + + override fun onCleared() { + super.onCleared() + + viewModelScope.launch { + try { + llm.unload() + } catch (exc: IllegalStateException) { + messages += exc.message!! + } + } + } + + fun send() { + val text = message + message = "" + + // Add to messages console. + messages += text + messages += "" + + viewModelScope.launch { + llm.send(text) + .catch { + Log.e(tag, "send() failed", it) + messages += it.message!! + } + .collect { messages = messages.dropLast(1) + (messages.last() + it) } + } + } + + fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) { + viewModelScope.launch { + try { + val start = System.nanoTime() + val warmupResult = llm.bench(pp, tg, pl, nr) + val end = System.nanoTime() + + messages += warmupResult + + val warmup = (end - start).toDouble() / NanosPerSecond + messages += "Warm up time: $warmup seconds, please wait..." + + if (warmup > 5.0) { + messages += "Warm up took too long, aborting benchmark" + return@launch + } + + messages += llm.bench(512, 128, 1, 3) + } catch (exc: IllegalStateException) { + Log.e(tag, "bench() failed", exc) + messages += exc.message!! + } + } + } + + fun load(pathToModel: String) { + viewModelScope.launch { + try { + llm.load(pathToModel) + messages += "Loaded $pathToModel" + } catch (exc: IllegalStateException) { + Log.e(tag, "load() failed", exc) + messages += exc.message!! + } + } + } + + fun updateMessage(newMessage: String) { + message = newMessage + } + + fun clear() { + messages = listOf() + } + + fun log(message: String) { + messages += message + } +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt new file mode 100644 index 0000000000000..40c30e8d97077 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt @@ -0,0 +1,11 @@ +package com.example.llama.ui.theme + +import androidx.compose.ui.graphics.Color + +val Purple80 = Color(0xFFD0BCFF) +val PurpleGrey80 = Color(0xFFCCC2DC) +val Pink80 = Color(0xFFEFB8C8) + +val Purple40 = Color(0xFF6650a4) +val PurpleGrey40 = Color(0xFF625b71) +val Pink40 = Color(0xFF7D5260) diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt new file mode 100644 index 0000000000000..e742220a8d719 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt @@ -0,0 +1,70 @@ +package com.example.llama.ui.theme + +import android.app.Activity +import android.os.Build +import androidx.compose.foundation.isSystemInDarkTheme +import androidx.compose.material3.MaterialTheme +import androidx.compose.material3.darkColorScheme +import androidx.compose.material3.dynamicDarkColorScheme +import androidx.compose.material3.dynamicLightColorScheme +import androidx.compose.material3.lightColorScheme +import androidx.compose.runtime.Composable +import androidx.compose.runtime.SideEffect +import androidx.compose.ui.graphics.toArgb +import androidx.compose.ui.platform.LocalContext +import androidx.compose.ui.platform.LocalView +import androidx.core.view.WindowCompat + +private val DarkColorScheme = darkColorScheme( + primary = Purple80, + secondary = PurpleGrey80, + tertiary = Pink80 +) + +private val LightColorScheme = lightColorScheme( + primary = Purple40, + secondary 
= PurpleGrey40, + tertiary = Pink40 + + /* Other default colors to override + background = Color(0xFFFFFBFE), + surface = Color(0xFFFFFBFE), + onPrimary = Color.White, + onSecondary = Color.White, + onTertiary = Color.White, + onBackground = Color(0xFF1C1B1F), + onSurface = Color(0xFF1C1B1F), + */ +) + +@Composable +fun LlamaAndroidTheme( + darkTheme: Boolean = isSystemInDarkTheme(), + // Dynamic color is available on Android 12+ + dynamicColor: Boolean = true, + content: @Composable () -> Unit +) { + val colorScheme = when { + dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> { + val context = LocalContext.current + if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context) + } + + darkTheme -> DarkColorScheme + else -> LightColorScheme + } + val view = LocalView.current + if (!view.isInEditMode) { + SideEffect { + val window = (view.context as Activity).window + window.statusBarColor = colorScheme.primary.toArgb() + WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme + } + } + + MaterialTheme( + colorScheme = colorScheme, + typography = Typography, + content = content + ) +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt new file mode 100644 index 0000000000000..0b87946ca3ab1 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt @@ -0,0 +1,34 @@ +package com.example.llama.ui.theme + +import androidx.compose.material3.Typography +import androidx.compose.ui.text.TextStyle +import androidx.compose.ui.text.font.FontFamily +import androidx.compose.ui.text.font.FontWeight +import androidx.compose.ui.unit.sp + +// Set of Material typography styles to start with +val Typography = Typography( + bodyLarge = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Normal, + fontSize = 16.sp, + lineHeight = 24.sp, + letterSpacing = 0.5.sp + ) + /* Other default text styles to override + titleLarge = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Normal, + fontSize = 22.sp, + lineHeight = 28.sp, + letterSpacing = 0.sp + ), + labelSmall = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Medium, + fontSize = 11.sp, + lineHeight = 16.sp, + letterSpacing = 0.5.sp + ) + */ +) diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml b/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml new file mode 100644 index 0000000000000..07d5da9cbf141 --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml @@ -0,0 +1,170 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml b/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml new file mode 100644 index 0000000000000..7706ab9e6d407 --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + diff --git a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml new file mode 100644 index 0000000000000..b3e26b4c60c27 --- /dev/null +++ b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml @@ -0,0 +1,6 @@ + + + + + + diff --git 
a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml new file mode 100644 index 0000000000000..b3e26b4c60c27 --- /dev/null +++ b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp new file mode 100644 index 0000000000000000000000000000000000000000..c209e78ecd372343283f4157dcfd918ec5165bb3 GIT binary patch literal 1404 zcmV-?1%vuhNk&F=1pok7MM6+kP&il$0000G0000-002h-06|PpNX!5L00Dqw+t%{r zzW2vH!KF=w&cMnnN@{whkTw+#mAh0SV?YL=)3MimFYCWp#fpdtz~8$hD5VPuQgtcN zXl<@<#Cme5f5yr2h%@8TWh?)bSK`O z^Z@d={gn7J{iyxL_y_%J|L>ep{dUxUP8a{byupH&!UNR*OutO~0{*T4q5R6@ApLF! z5{w?Z150gC7#>(VHFJZ-^6O@PYp{t!jH(_Z*nzTK4 zkc{fLE4Q3|mA2`CWQ3{8;gxGizgM!zccbdQoOLZc8hThi-IhN90RFT|zlxh3Ty&VG z?Fe{#9RrRnxzsu|Lg2ddugg7k%>0JeD+{XZ7>Z~{=|M+sh1MF7~ zz>To~`~LVQe1nNoR-gEzkpe{Ak^7{{ZBk2i_<+`Bq<^GB!RYG+z)h;Y3+<{zlMUYd zrd*W4w&jZ0%kBuDZ1EW&KLpyR7r2=}fF2%0VwHM4pUs}ZI2egi#DRMYZPek*^H9YK zay4Iy3WXFG(F14xYsoDA|KXgGc5%2DhmQ1gFCkrgHBm!lXG8I5h*uf{rn48Z!_@ z4Bk6TJAB2CKYqPjiX&mWoW>OPFGd$wqroa($ne7EUK;#3VYkXaew%Kh^3OrMhtjYN?XEoY`tRPQsAkH-DSL^QqyN0>^ zmC>{#F14jz4GeW{pJoRpLFa_*GI{?T93^rX7SPQgT@LbLqpNA}<@2wH;q493)G=1Y z#-sCiRNX~qf3KgiFzB3I>4Z%AfS(3$`-aMIBU+6?gbgDb!)L~A)je+;fR0jWLL-Fu z4)P{c7{B4Hp91&%??2$v9iRSFnuckHUm}or9seH6 z>%NbT+5*@L5(I9j@06@(!{ZI?U0=pKn8uwIg&L{JV14+8s2hnvbRrU|hZCd}IJu7*;;ECgO%8_*W Kmw_-CKmY()leWbG literal 0 HcmV?d00001 diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp new file mode 100644 index 0000000000000000000000000000000000000000..b2dfe3d1ba5cf3ee31b3ecc1ced89044a1f3b7a9 GIT binary patch literal 2898 zcmV-Y3$650Nk&FW3jhFDMM6+kP&il$0000G0000-002h-06|PpNWB9900E$G+qN-D z+81ABX7q?;bwx%xBg?kcwr$(C-Tex-ZCkHUw(Y9#+`E5-zuONG5fgw~E2WDng@Bc@ z24xy+R1n%~6xI#u9vJ8zREI)sb<&Il(016}Z~V1n^PU3-_H17A*Bf^o)&{_uBv}Py zulRfeE8g(g6HFhk_?o_;0@tz?1I+l+Y#Q*;RVC?(ud`_cU-~n|AX-b`JHrOIqn(-t&rOg-o`#C zh0LPxmbOAEb;zHTu!R3LDh1QO zZTf-|lJNUxi-PpcbRjw3n~n-pG;$+dIF6eqM5+L();B2O2tQ~|p{PlpNcvDbd1l%c zLtXn%lu(3!aNK!V#+HNn_D3lp z2%l+hK-nsj|Bi9;V*WIcQRTt5j90A<=am+cc`J zTYIN|PsYAhJ|=&h*4wI4ebv-C=Be#u>}%m;a{IGmJDU`0snWS&$9zdrT(z8#{OZ_Y zxwJx!ZClUi%YJjD6Xz@OP8{ieyJB=tn?>zaI-4JN;rr`JQbb%y5h2O-?_V@7pG_+y z(lqAsqYr!NyVb0C^|uclHaeecG)Sz;WV?rtoqOdAAN{j%?Uo%owya(F&qps@Id|Of zo@~Y-(YmfB+chv^%*3g4k3R0WqvuYUIA+8^SGJ{2Bl$X&X&v02>+0$4?di(34{pt* zG=f#yMs@Y|b&=HyH3k4yP&goF2LJ#tBLJNNDo6lG06r}ghC-pC4Q*=x3;|+W04zte zAl>l4kzUBQFYF(E`KJy?ZXd1tnfbH+Z~SMmA21KokJNs#eqcXWKUIC>{TuoKe^vhF z);H)o`t9j~`$h1D`#bxe@E`oE`cM9w(@)5Bp8BNukIwM>wZHfd0S;5bcXA*5KT3bj zc&_~`&{z7u{Et!Z_k78H75gXf4g8<_ul!H$eVspPeU3j&&Au=2R*Zp#M9$9s;fqwgzfiX=E_?BwVcfx3tG9Q-+<5fw z%Hs64z)@Q*%s3_Xd5>S4dg$s>@rN^ixeVj*tqu3ZV)biDcFf&l?lGwsa zWj3rvK}?43c{IruV2L`hUU0t^MemAn3U~x3$4mFDxj=Byowu^Q+#wKRPrWywLjIAp z9*n}eQ9-gZmnd9Y0WHtwi2sn6n~?i#n9VN1B*074_VbZZ=WrpkMYr{RsI ztM_8X1)J*DZejxkjOTRJ&a*lrvMKBQURNP#K)a5wIitfu(CFYV4FT?LUB$jVwJSZz zNBFTWg->Yk0j&h3e*a5>B=-xM7dE`IuOQna!u$OoxLlE;WdrNlN)1 z7**de7-hZ!(%_ZllHBLg`Ir#|t>2$*xVOZ-ADZKTN?{(NUeLU9GbuG-+Axf*AZ-P1 z0ZZ*fx+ck4{XtFsbcc%GRStht@q!m*ImssGwuK+P@%gEK!f5dHymg<9nSCXsB6 zQ*{<`%^bxB($Z@5286^-A(tR;r+p7B%^%$N5h%lb*Vlz-?DL9x;!j<5>~kmXP$E}m zQV|7uv4SwFs0jUervsxVUm>&9Y3DBIzc1XW|CUZrUdb<&{@D5yuLe%Xniw^x&{A2s z0q1+owDSfc3Gs?ht;3jw49c#mmrViUfX-yvc_B*wY|Lo7; 
zGh!t2R#BHx{1wFXReX*~`NS-LpSX z#TV*miO^~B9PF%O0huw!1Zv>^d0G3$^8dsC6VI!$oKDKiXdJt{mGkyA`+Gwd4D-^1qtNTUK)`N*=NTG-6}=5k6suNfdLt*dt8D| z%H#$k)z#ZRcf|zDWB|pn<3+7Nz>?WW9WdkO5(a^m+D4WRJ9{wc>Y}IN)2Kbgn;_O? zGqdr&9~|$Y0tP=N(k7^Eu;iO*w+f%W`20BNo)=Xa@M_)+o$4LXJyiw{F?a633SC{B zl~9FH%?^Rm*LVz`lkULs)%idDX^O)SxQol(3jDRyBVR!7d`;ar+D7do)jQ}m`g$TevUD5@?*P8)voa?kEe@_hl{_h8j&5eB-5FrYW&*FHVt$ z$kRF9Nstj%KRzpjdd_9wO=4zO8ritN*NPk_9avYrsF(!4))tm{Ga#OY z(r{0buexOzu7+rw8E08Gxd`LTOID{*AC1m*6Nw@osfB%0oBF5sf<~wH1kL;sd zo)k6^VyRFU`)dt*iX^9&QtWbo6yE8XXH?`ztvpiOLgI3R+=MOBQ9=rMVgi<*CU%+d1PQQ0a1U=&b0vkF207%xU0ssI2 literal 0 HcmV?d00001 diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp new file mode 100644 index 0000000000000000000000000000000000000000..4f0f1d64e58ba64d180ce43ee13bf9a17835fbca GIT binary patch literal 982 zcmV;{11bDcNk&G_0{{S5MM6+kP&il$0000G0000l001ul06|PpNU8t;00Dqo+t#w^ z^1csucXz7-Qrhzl9HuHB%l>&>1tG2^vb*E&k^T3$FG1eQZ51g$uv4V+kI`0<^1Z@N zk?Jjh$olyC%l>)Xq;7!>{iBj&BjJ`P&$fsCfpve_epJOBkTF?nu-B7D!hO=2ZR}

[base85-encoded binary data omitted]

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp
new file mode 100644
index 0000000000000000000000000000000000000000..948a3070fe34c611c42c0d3ad3013a0dce358be0
GIT binary patch
literal 1900
[base85-encoded binary data omitted]

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp
new file mode 100644
index 0000000000000000000000000000000000000000..1b9a6956b3acdc11f40ce2bb3f6efbd845cc243f
GIT binary patch
literal 3918
[base85-encoded binary data omitted]

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/values/colors.xml b/examples/llama.android/app/src/main/res/values/colors.xml
new file mode 100644
index 0000000000000..ca1931bca99e3
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/values/colors.xml
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+    <color name="purple_200">#FFBB86FC</color>
+    <color name="purple_500">#FF6200EE</color>
+    <color name="purple_700">#FF3700B3</color>
+    <color name="teal_200">#FF03DAC5</color>
+    <color name="teal_700">#FF018786</color>
+    <color name="black">#FF000000</color>
+    <color name="white">#FFFFFFFF</color>
+</resources>
diff --git a/examples/llama.android/app/src/main/res/values/strings.xml b/examples/llama.android/app/src/main/res/values/strings.xml
new file mode 100644
index 0000000000000..7a9d314e2969b
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/values/strings.xml
@@ -0,0 +1,3 @@
+<resources>
+    <string name="app_name">LlamaAndroid</string>
+</resources>
diff --git a/examples/llama.android/app/src/main/res/values/themes.xml b/examples/llama.android/app/src/main/res/values/themes.xml
new file mode 100644
index 0000000000000..8a24fda56602c
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/values/themes.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+