From 7db9c96d8a5f685102eaee72a4b6a3f4f6486fb3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 15:39:58 +0200 Subject: [PATCH 01/18] llama : add llm_build_norm helper function ggml-ci --- llama.cpp | 435 ++++++++++++++++++++++-------------------------------- 1 file changed, 176 insertions(+), 259 deletions(-) diff --git a/llama.cpp b/llama.cpp index bad25de4b84a1..c236277d437cc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -972,7 +972,7 @@ struct llama_mlock { typedef void (*offload_func_t)(struct ggml_tensor * tensor); -static void ggml_offload_nop(struct ggml_tensor * tensor) { // don't offload by default +static void ggml_offload_nop(struct ggml_tensor * tensor) { (void) tensor; } @@ -3093,6 +3093,42 @@ static bool llama_model_load( using llm_build_cb = std::function; +enum llm_norm_type { + LLM_NORM, + LLM_NORM_RMS, +}; + +static struct ggml_tensor * llm_build_norm( + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * mw, + struct ggml_tensor * mb, + llm_norm_type type, + float eps, + const llm_build_cb & cb, + int il) { + switch (type) { + case LLM_NORM: cur = ggml_norm (ctx, cur, eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, eps); break; + }; + if (mw || mb) { + cb(cur, "norm", il); + } + + if (mw) { + cur = ggml_mul(ctx, cur, mw); + if (mb) { + cb(cur, "norm_w", il); + } + } + + if (mb) { + cur = ggml_add(ctx, cur, mb); + } + + return cur; +} + static struct ggml_cgraph * llm_build_llama( llama_context & lctx, const llama_batch & batch, @@ -3192,14 +3228,11 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * inpSA = inpL; // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - cb(cur, "rms_norm_0", il); - - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0", il); - } + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "attn_norm", il); // self-attention { @@ -3307,15 +3340,11 @@ static struct ggml_cgraph * llm_build_llama( // feed-forward network { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - cb(cur, "rms_norm_1", il); - - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm", il); - } + cur = llm_build_norm(ctx0, inpFF, + model.layers[il].ffn_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "ffn_norm", il); struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, @@ -3349,15 +3378,11 @@ static struct ggml_cgraph * llm_build_llama( cur = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - cb(cur, "rms_norm_2", -1); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, -1); + cb(cur, "result_norm", -1); // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); @@ -3466,15 +3491,11 @@ static struct ggml_cgraph * llm_build_baichaun( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - cb(cur, "rms_norm_0", il); - - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0", il); - } + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "attn_norm", il); // self-attention { @@ -3600,15 
+3621,11 @@ static struct ggml_cgraph * llm_build_baichaun( // feed-forward network { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - cb(cur, "rms_norm_1", il); - - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm", il); - } + cur = llm_build_norm(ctx0, inpFF, + model.layers[il].ffn_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "ffn_norm", il); struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, @@ -3763,27 +3780,21 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * attn_norm; // self-attention - // TODO: refactor into common function (shared with LLaMA) { - attn_norm = ggml_norm(ctx0, inpL, norm_eps); - cb(attn_norm, "attn_norm_0", il); - - attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); - cb(attn_norm, "attn_norm_0_w", il); - - attn_norm = ggml_add(ctx0, attn_norm, model.layers[il].attn_norm_b); - cb(attn_norm, "attn_norm_0_wb", il); - - if (model.layers[il].attn_norm_2) { // Falcon-40B - cur = ggml_norm(ctx0, inpL, norm_eps); + attn_norm = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(attn_norm, "attn_norm", il); + + if (model.layers[il].attn_norm_2) { + // Falcon-40B + cur = llm_build_norm(ctx0, attn_norm, + model.layers[il].attn_norm_2, + model.layers[il].attn_norm_2_b, + LLM_NORM, norm_eps, cb, il); cb(cur, "attn_norm_2", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm_2); - cb(cur, "attn_norm_2_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_2_b); - cb(cur, "attn_norm_2_wb", il); - } else { // Falcon 7B + } else { cur = attn_norm; } @@ -3925,16 +3936,11 @@ static struct ggml_cgraph * llm_build_falcon( cur = inpL; // norm - { - cur = ggml_norm(ctx0, cur, norm_eps); - cb(cur, "out_norm_0", -1); - - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "out_norm_0_w", -1); - - cur = ggml_add(ctx0, cur, model.output_norm_b); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -4024,17 +4030,11 @@ static struct ggml_cgraph * llm_build_starcoder( cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { - { - // Norm - cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "attn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - cb(cur, "attn_norm_0_wb", il); - } + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "attn_norm", il); { // Self Attention @@ -4130,17 +4130,11 @@ static struct ggml_cgraph * llm_build_starcoder( // FF { - // Norm - { - cur = ggml_norm(ctx0, inpFF, norm_eps); - cb(cur, "ffn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); - cb(cur, "ffn_norm_0_wb", il); - } + cur = llm_build_norm(ctx0, inpFF, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "ffn_norm", il); cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); cb(cur, "result_w3", il); @@ -4161,17 +4155,11 @@ static struct ggml_cgraph * llm_build_starcoder( } - // 
Output Norm - { - cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "out_norm_0", -1); - - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "out_norm_0_w", -1); - - cur = ggml_add(ctx0, cur, model.output_norm_b); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -4206,7 +4194,7 @@ static struct ggml_cgraph * llm_build_persimmon( const float freq_base = cparams.rope_freq_base; const float freq_scale = cparams.rope_freq_scale; - const float norm_eps = hparams.f_norm_eps; + const float norm_eps = hparams.f_norm_eps; const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = worst_case ? n_ctx : kv_self.n; @@ -4271,16 +4259,11 @@ static struct ggml_cgraph * llm_build_persimmon( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * residual = inpL; - { - cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "attn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - cb(cur, "attn_norm_0_wb", il); - } + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "attn_norm", il); // self attention { @@ -4316,22 +4299,16 @@ static struct ggml_cgraph * llm_build_persimmon( cb(tmpk, "tmpk", il); // Q/K Layernorm - tmpq = ggml_norm(ctx0, tmpq, norm_eps); - cb(tmpq, "tmpq", il); - - tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); + tmpq = llm_build_norm(ctx0, tmpq, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, norm_eps, cb, il); cb(tmpq, "tmpq", il); - tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); - cb(tmpq, "tmpq", il); - - tmpk = ggml_norm(ctx0, tmpk, norm_eps); - cb(tmpk, "tmpk", il); - - tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); - cb(tmpk, "tmpk", il); - - tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); + tmpk = llm_build_norm(ctx0, tmpk, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, norm_eps, cb, il); cb(tmpk, "tmpk", il); // RoPE the first n_rot of q/k, pass the other half, and concat. 
@@ -4480,17 +4457,11 @@ static struct ggml_cgraph * llm_build_persimmon( { // MLP - { - // Norm - cur = ggml_norm(ctx0, inpFF, norm_eps); - cb(cur, "ffn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); - cb(cur, "ffn_norm_0_wb", il); - } + cur = llm_build_norm(ctx0, inpFF, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "ffn_norm", il); cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); cb(cur, "result_w3", il); @@ -4519,16 +4490,11 @@ static struct ggml_cgraph * llm_build_persimmon( cur = inpL; - { - cur = ggml_norm(ctx0, cur, norm_eps); - cb(cur, "out_norm_0", -1); - - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "out_norm_0_w", -1); - - cur = ggml_add(ctx0, cur, model.output_norm_b); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -4609,15 +4575,11 @@ static struct ggml_cgraph * llm_build_refact( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - cb(cur, "rms_norm_0", il); - - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0", il); - } + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "attn_norm", il); // self-attention { @@ -4719,15 +4681,11 @@ static struct ggml_cgraph * llm_build_refact( // feed-forward network { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - cb(cur, "rms_norm_1", il); - - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm", il); - } + cur = llm_build_norm(ctx0, inpFF, + model.layers[il].ffn_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "ffn_norm", il); struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, @@ -4761,15 +4719,11 @@ static struct ggml_cgraph * llm_build_refact( cur = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - cb(cur, "rms_norm_2", -1); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, -1); + cb(cur, "result_norm", -1); // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); @@ -4851,30 +4805,18 @@ static struct ggml_cgraph * llm_build_bloom( struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); - // norm - { - inpL = ggml_norm(ctx0, embd, norm_eps); - cb(inpL, "inp_norm", -1); - - inpL = ggml_mul(ctx0, inpL, model.tok_norm); - cb(inpL, "inp_norm_w", -1); - - inpL = ggml_add (ctx0, inpL, model.tok_norm_b); - cb(inpL, "inp_norm_wb", -1); - } + inpL = llm_build_norm(ctx0, embd, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(inpL, "inp_norm", -1); for (int il = 0; il < n_layer; ++il) { - { - // Norm - cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "attn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - cb(cur, "attn_norm_0_wb", il); - } + cur = 
llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "attn_norm", il); { // Self Attention @@ -4984,17 +4926,11 @@ static struct ggml_cgraph * llm_build_bloom( // FF { - // Norm - { - cur = ggml_norm(ctx0, inpFF, norm_eps); - cb(cur, "ffn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); - cb(cur, "ffn_norm_0_wb", il); - } + cur = llm_build_norm(ctx0, inpFF, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "ffn_norm", il); cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); cb(cur, "result_w3", il); @@ -5016,17 +4952,11 @@ static struct ggml_cgraph * llm_build_bloom( cb(inpL, "inpFF_+_result_w2", il); } - // Output Norm - { - cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "out_norm_0", -1); - - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "out_norm_0_w", -1); - - cur = ggml_add(ctx0, cur, model.output_norm_b); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -5109,18 +5039,15 @@ static struct ggml_cgraph * llm_build_mpt( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; + attn_norm = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + NULL, + LLM_NORM, norm_eps, cb, il); + cb(attn_norm, "attn_norm", il); + // self-attention - // TODO: refactor into common function (shared with LLaMA) { - attn_norm = ggml_norm(ctx0, inpL, norm_eps); - cb(attn_norm, "attn_norm_0", il); - - attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); - cb(attn_norm, "attn_norm_0_w", il); - - if (1) { - cur = attn_norm; - } + cur = attn_norm; // compute QKV @@ -5230,14 +5157,11 @@ static struct ggml_cgraph * llm_build_mpt( // feed forward { - // Norm - { - cur = ggml_norm(ctx0, attn_out, norm_eps); - cb(cur, "ffn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm_0_w", il); - } + cur = llm_build_norm(ctx0, attn_out, + model.layers[il].ffn_norm, + NULL, + LLM_NORM, norm_eps, cb, il); + cb(cur, "ffn_norm", il); cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); cb(cur, "result_w3", il); @@ -5258,14 +5182,11 @@ static struct ggml_cgraph * llm_build_mpt( cur = inpL; - // norm - { - cur = ggml_norm(ctx0, cur, norm_eps); - cb(cur, "out_norm_0", -1); - - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + NULL, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -5378,15 +5299,12 @@ static const std::unordered_map k_offload_map { "inp_norm_w", OFFLOAD_FUNC_NR }, { "inp_norm_wb", OFFLOAD_FUNC_NR }, - { "rms_norm_0", OFFLOAD_FUNC }, - - { "attn_norm_0", OFFLOAD_FUNC }, - { "attn_norm_0_w", OFFLOAD_FUNC }, - { "attn_norm_0_wb", OFFLOAD_FUNC }, + { "norm", OFFLOAD_FUNC }, + { "norm_w", OFFLOAD_FUNC }, + { "norm_wb", OFFLOAD_FUNC }, + { "attn_norm", OFFLOAD_FUNC }, { "attn_norm_2", OFFLOAD_FUNC }, - { "attn_norm_2_w", OFFLOAD_FUNC }, - { "attn_norm_2_wb", OFFLOAD_FUNC }, { "wqkv", OFFLOAD_FUNC_KQ }, { "bqkv", OFFLOAD_FUNC_KQ }, @@ -5614,20 +5532,19 @@ static struct ggml_cgraph * llama_build_graph( static const std::unordered_map> 
k_offload_func_name = { { OFFLOAD_FUNC_NOP, "CPU" }, + { OFFLOAD_FUNC_OUT, "CPU" }, #ifdef GGML_USE_CUBLAS { OFFLOAD_FUNC, "GPU (CUDA)" }, { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" }, { OFFLOAD_FUNC_V, "GPU (CUDA) V" }, { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, - { OFFLOAD_FUNC_OUT, "GPU (CUDA) OUT" }, #else { OFFLOAD_FUNC, "CPU" }, { OFFLOAD_FUNC_KQ, "CPU" }, { OFFLOAD_FUNC_V, "CPU" }, { OFFLOAD_FUNC_NR, "CPU" }, { OFFLOAD_FUNC_EMB, "CPU" }, - { OFFLOAD_FUNC_OUT, "CPU" }, #endif // GGML_USE_CUBLAS }; From dbf836bb641c499ad898d8b90d71b7b7aee72d61 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 18:47:46 +0200 Subject: [PATCH 02/18] llama : add llm_build_ffn helper function (#3849) ggml-ci --- llama.cpp | 376 +++++++++++++++++++++++++++++------------------------- 1 file changed, 200 insertions(+), 176 deletions(-) diff --git a/llama.cpp b/llama.cpp index c236277d437cc..cf3ee494af465 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1116,13 +1116,13 @@ struct llama_layer { struct ggml_tensor * ffn_norm_b; // ff - struct ggml_tensor * w1; // ffn_gate - struct ggml_tensor * w2; // ffn_down - struct ggml_tensor * w3; // ffn_up + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 // ff bias - struct ggml_tensor * b2; // ffn_down - struct ggml_tensor * b3; // ffn_up + struct ggml_tensor * ffn_down_b; // b2 + struct ggml_tensor * ffn_up_b; // b3 }; struct llama_kv_cell { @@ -2538,15 +2538,15 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; @@ -2604,15 +2604,15 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = 
ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; @@ -2683,14 +2683,14 @@ static void llm_load_tensors( layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; @@ -2756,11 +2756,11 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2768,8 +2768,8 @@ static void llm_load_tensors( ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) + - ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3); + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) + + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b); } } } break; @@ -2816,22 +2816,22 @@ static void llm_load_tensors( const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); - layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); + layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend); - layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); + layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); } } break; case LLM_ARCH_BLOOM: @@ -2899,11 +2899,11 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 
backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2911,8 +2911,8 @@ static void llm_load_tensors( ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + - ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2); + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) + + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b); } } } break; @@ -2969,8 +2969,8 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2978,8 +2978,8 @@ static void llm_load_tensors( ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w2) + - ggml_nbytes(layer.w3); + ggml_nbytes(layer.ffn_down) + + ggml_nbytes(layer.ffn_up); } } } break; @@ -3129,6 +3129,107 @@ static struct ggml_tensor * llm_build_norm( return cur; } +enum llm_ffn_op_type { + LLM_FFN_SILU, + LLM_FFN_GELU, + LLM_FFN_RELU, + LLM_FFN_RELU_SQR, +}; + +enum llm_ffn_gate_type { + LLM_FFN_SEQ, + LLM_FFN_PAR, // ffn_gate is parallel to ffn_up +}; + +static struct ggml_tensor * llm_build_ffn( + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * up, + struct ggml_tensor * up_b, + struct ggml_tensor * gate, + struct ggml_tensor * gate_b, + struct ggml_tensor * down, + struct ggml_tensor * down_b, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, + const llm_build_cb & cb, + int il) { + struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur); + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (gate) { + switch (type_gate) { + case LLM_FFN_SEQ: + { + cur = ggml_mul_mat(ctx, gate, tmp); + cb(cur, "ffn_gate", il); + + if (gate_b) { + cur = ggml_add(ctx, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + } break; + case LLM_FFN_PAR: + { + cur = ggml_mul_mat(ctx, gate, cur); + cb(cur, "ffn_gate", il); + + if (gate_b) { + cur = ggml_add(ctx, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + } break; + }; + } + + switch (type_op) { + case LLM_FFN_SILU: + { + cur = ggml_silu(ctx, cur); + cb(cur, "ffn_silu", il); + } break; + case LLM_FFN_GELU: + { + cur = ggml_gelu(ctx, cur); + cb(cur, "ffn_gelu", il); + } 
break; + case LLM_FFN_RELU: + { + cur = ggml_relu(ctx, cur); + cb(cur, "ffn_relu", il); + } break; + case LLM_FFN_RELU_SQR: + { + cur = ggml_relu(ctx, cur); + cb(cur, "ffn_relu", il); + + cur = ggml_sqr(ctx, cur); + cb(cur, "ffn_sqr(relu)", il); + } break; + }; + + if (type_gate == LLM_FFN_PAR) { + cur = ggml_mul(ctx, cur, tmp); + cb(cur, "ffn_gate_par", il); + } + + cur = ggml_mul_mat(ctx, down, cur); + if (down_b) { + cb(cur, "ffn_down", il); + } + + if (down_b) { + cur = ggml_add(ctx, cur, down_b); + } + + return cur; +} + static struct ggml_cgraph * llm_build_llama( llama_context & lctx, const llama_batch & batch, @@ -3346,27 +3447,12 @@ static struct ggml_cgraph * llm_build_llama( LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - cb(tmp, "result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - cb(cur, "result_w1", il); - - // SILU activation - cur = ggml_silu(ctx0, cur); - cb(cur, "silu", il); - - cur = ggml_mul(ctx0, cur, tmp); - cb(cur, "silu_x_result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - cb(cur, "result_w2", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_result", il); } cur = ggml_add(ctx0, cur, inpFF); @@ -3627,27 +3713,12 @@ static struct ggml_cgraph * llm_build_baichaun( LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - cb(tmp, "result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - cb(cur, "result_w1", il); - - // SILU activation - cur = ggml_silu(ctx0, cur); - cb(cur, "silu", il); - - cur = ggml_mul(ctx0, cur, tmp); - cb(cur, "silu_x_result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - cb(cur, "result_w2", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_result", il); } cur = ggml_add(ctx0, cur, inpFF); @@ -3911,16 +3982,12 @@ static struct ggml_cgraph * llm_build_falcon( // feed forward { - struct ggml_tensor * inpFF = attn_norm; - - cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); - cb(cur, "result_w3", il); - - cur = ggml_gelu(ctx0, cur); - cb(cur, "gelu", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2", il); + cur = llm_build_ffn(ctx0, attn_norm, // !! 
use the attn norm, not the result + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_result", il); } cur = ggml_add(ctx0, cur, attn_out); @@ -4136,19 +4203,12 @@ static struct ggml_cgraph * llm_build_starcoder( LLM_NORM, norm_eps, cb, il); cb(cur, "ffn_norm", il); - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); - cb(cur, "result_w3", il); - - // GELU activation - cur = ggml_gelu(ctx0, cur); - cb(cur, "gelu", il); - - // Projection - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2", il); - - cur = ggml_add(ctx0, cur, model.layers[il].b2); - cb(cur, "result_w2_b", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_result", il); } inpL = ggml_add(ctx0, cur, inpFF); @@ -4455,31 +4515,20 @@ static struct ggml_cgraph * llm_build_persimmon( struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur); cb(inpFF, "inpFF", il); + // feed-forward network { - // MLP cur = llm_build_norm(ctx0, inpFF, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, norm_eps, cb, il); cb(cur, "ffn_norm", il); - cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - cb(cur, "result_w3", il); - - cur = ggml_add(ctx0, cur, model.layers[il].b3); - cb(cur, "result_w3_b", il); - - cur = ggml_relu(ctx0, cur); - cb(cur, "relu", il); - - cur = ggml_sqr(ctx0, cur); - cb(cur, "sqr(relu)", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2", il); - - cur = ggml_add(ctx0, cur, model.layers[il].b2); - cb(cur, "result_w2_b", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_result", il); } cur = ggml_add(ctx0, cur, inpFF); @@ -4687,27 +4736,12 @@ static struct ggml_cgraph * llm_build_refact( LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - cb(tmp, "result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - cb(cur, "result_w1", il); - - // SILU activation - cur = ggml_silu(ctx0, cur); - cb(cur, "silu", il); - - cur = ggml_mul(ctx0, cur, tmp); - cb(cur, "silu_x_result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - cb(cur, "result_w2", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_result", il); } cur = ggml_add(ctx0, cur, inpFF); @@ -4932,20 +4966,12 @@ static struct ggml_cgraph * llm_build_bloom( LLM_NORM, norm_eps, cb, il); cb(cur, "ffn_norm", il); - cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - cb(cur, "result_w3", il); - - cur = ggml_add(ctx0, cur, model.layers[il].b3); - cb(cur, "result_w3_b", il); - - cur = ggml_gelu(ctx0, cur); - cb(cur, "gelu", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2", il); - - cur = ggml_add(ctx0, cur, model.layers[il].b2); - cb(cur, "result_w2_b", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, 
"ffn_result", il); } inpL = ggml_add(ctx0, cur, inpFF); @@ -5163,14 +5189,12 @@ static struct ggml_cgraph * llm_build_mpt( LLM_NORM, norm_eps, cb, il); cb(cur, "ffn_norm", il); - cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - cb(cur, "result_w3", il); - - cur = ggml_gelu(ctx0, cur); - cb(cur, "gelu", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_result", il); } cur = ggml_add(ctx0, cur, attn_out); From 38728a0be0efcb603726fc78a1356bdc3aec910b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 19:22:54 +0200 Subject: [PATCH 03/18] llama : add llm_build_k_shift helper ggml-ci --- llama.cpp | 130 +++++++++++++++++++++++++++--------------------------- 1 file changed, 66 insertions(+), 64 deletions(-) diff --git a/llama.cpp b/llama.cpp index cf3ee494af465..b746cc14f2d2f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3230,6 +3230,65 @@ static struct ggml_tensor * llm_build_ffn( return cur; } +enum llm_rope_type { + LLM_ROPE, + LLM_ROPE_NEOX, + LLM_ROPE_GLM, +}; + +// Persimmon: n_rot = n_embd_head/2 +// Other: n_rot = n_embd_head +static void llm_build_k_shift( + const llama_context & lctx, + struct ggml_context * ctx, + struct ggml_cgraph * graph, + int64_t n_rot, + llm_rope_type type, + const llm_build_cb & cb) { + const auto & model = lctx.model; + const auto & kv_self = lctx.kv_self; + const auto & cparams = lctx.cparams; + + const auto & hparams = model.hparams; + + const int64_t n_head = hparams.n_head; + const int64_t n_layer = hparams.n_layer; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int64_t n_embd_head = hparams.n_embd_head(); + + const int64_t n_ctx = lctx.cparams.n_ctx; + + const float freq_base = cparams.rope_freq_base; + const float freq_scale = cparams.rope_freq_scale; + + GGML_ASSERT(n_embd_head % n_rot == 0); + + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); + cb(K_shift, "K_shift", -1); + + int rope_type = 0; + + switch (type) { + case LLM_ROPE: rope_type = 0; break; + case LLM_ROPE_NEOX: rope_type = 2; break; + case LLM_ROPE_GLM: rope_type = 4; break; + }; + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * tmp = + // we rotate only the first n_rot dimensions + ggml_rope_custom_inplace(ctx, + ggml_view_3d(ctx, kv_self.k, + n_rot, n_head, n_ctx, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + K_shift, n_rot, rope_type, 0, freq_base, freq_scale); + cb(tmp, "K_shifted", il); + ggml_build_forward_expand(graph, tmp); + } +} + static struct ggml_cgraph * llm_build_llama( llama_context & lctx, const llama_batch & batch, @@ -3308,21 +3367,7 @@ static struct ggml_cgraph * llm_build_llama( // shift the entire K-cache if needed if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(gf, tmp); - } + 
llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE, cb); } for (int il = 0; il < n_layer; ++il) { @@ -3557,21 +3602,7 @@ static struct ggml_cgraph * llm_build_baichaun( // shift the entire K-cache if needed if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(gf, tmp); - } + llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE, cb); } for (int il = 0; il < n_layer; ++il) { @@ -3830,21 +3861,7 @@ static struct ggml_cgraph * llm_build_falcon( // shift the entire K-cache if needed if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 2, 0, freq_base, freq_scale); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(gf, tmp); - } + llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE_NEOX, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4243,6 +4260,7 @@ static struct ggml_cgraph * llm_build_persimmon( GGML_ASSERT(!!kv_self.ctx); const auto & cparams = lctx.cparams; + const int64_t n_embd = hparams.n_embd; const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; @@ -4250,7 +4268,7 @@ static struct ggml_cgraph * llm_build_persimmon( const int64_t n_head = hparams.n_head; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); - const size_t n_rot = n_embd_head / 2; + const int64_t n_rot = n_embd_head / 2; const float freq_base = cparams.rope_freq_base; const float freq_scale = cparams.rope_freq_scale; @@ -4297,23 +4315,7 @@ static struct ggml_cgraph * llm_build_persimmon( cb(KQ_mask, "KQ_mask", -1); if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - // we rotate only the first n_rot dimensions. 
- ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_rot, n_head, n_ctx, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il) - ), - K_shift, n_rot, 2, 0, freq_base, freq_scale); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(gf, tmp); - } + llm_build_k_shift(lctx, ctx0, gf, n_rot, LLM_ROPE_NEOX, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5534,7 +5536,7 @@ static struct ggml_cgraph * llama_build_graph( #ifdef GGML_USE_CUBLAS const bool do_offload = true; #else - const bool do_offload = false; + const bool do_offload = true; // TODO: set to false after finishing refactoring #endif if (!do_offload) { From 909d64471bb4cc26ba1a3e7db8361f4796d874d6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 19:45:27 +0200 Subject: [PATCH 04/18] llama : fix offloading after recent changes --- llama.cpp | 52 ++++++++++++++++++++++------------------------------ 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/llama.cpp b/llama.cpp index b746cc14f2d2f..92ef6e4ea5208 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3185,6 +3185,8 @@ static struct ggml_tensor * llm_build_ffn( } } break; }; + } else { + cur = tmp; } switch (type_op) { @@ -3761,15 +3763,11 @@ static struct ggml_cgraph * llm_build_baichaun( cur = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - cb(cur, "rms_norm_2", -1); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, -1); + cb(cur, "result_norm", -1); // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); @@ -5374,31 +5372,25 @@ static const std::unordered_map k_offload_map { "inpFF", OFFLOAD_FUNC }, - { "rms_norm_1", OFFLOAD_FUNC }, { "ffn_norm", OFFLOAD_FUNC }, - { "ffn_norm_0", OFFLOAD_FUNC }, - { "ffn_norm_0_w", OFFLOAD_FUNC }, - { "ffn_norm_0_wb", OFFLOAD_FUNC }, - - { "result_w3", OFFLOAD_FUNC }, - { "result_w3_b", OFFLOAD_FUNC }, - { "result_w2", OFFLOAD_FUNC }, - { "result_w2_b", OFFLOAD_FUNC }, - { "result_w1", OFFLOAD_FUNC }, - - { "silu", OFFLOAD_FUNC }, - { "gelu", OFFLOAD_FUNC }, - { "relu", OFFLOAD_FUNC }, - { "sqr(relu)", OFFLOAD_FUNC }, - - { "silu_x_result_w3", OFFLOAD_FUNC }, + + { "ffn_up", OFFLOAD_FUNC }, + { "ffn_up_b", OFFLOAD_FUNC }, + { "ffn_gate", OFFLOAD_FUNC }, + { "ffn_gate_b", OFFLOAD_FUNC }, + { "ffn_gate_par", OFFLOAD_FUNC }, + { "ffn_down", OFFLOAD_FUNC }, + { "ffn_down_b", OFFLOAD_FUNC }, + { "ffn_result", OFFLOAD_FUNC }, + + { "ffn_silu", OFFLOAD_FUNC }, + { "ffn_gelu", OFFLOAD_FUNC }, + { "ffn_relu", OFFLOAD_FUNC }, + { "ffn_sqr(relu)", OFFLOAD_FUNC }, + { "inpFF_+_result_w2", OFFLOAD_FUNC }, { "inpL_+_inpFF_+_result_w2", OFFLOAD_FUNC }, - { "rms_norm_2", OFFLOAD_FUNC_NR }, - { "out_norm_0", OFFLOAD_FUNC_NR }, - { "out_norm_0_w", OFFLOAD_FUNC_NR }, - { "result_norm", OFFLOAD_FUNC_EMB }, { "result_output", OFFLOAD_FUNC_OUT }, }; From 3e0462594b2cb687eefd8099116e5c126691bf60 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 20:35:20 +0200 Subject: [PATCH 05/18] llama : add llm_build_kv_store helper ggml-ci --- llama.cpp | 362 ++++++++++++++++++------------------------------------ 1 file changed, 119 insertions(+), 243 deletions(-) diff --git a/llama.cpp b/llama.cpp index 92ef6e4ea5208..0d6c87f04e3e5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3291,6 +3291,44 @@ static void llm_build_k_shift( } } +static void 
llm_build_kv_store( + const llama_context & lctx, + struct ggml_context * ctx, + struct ggml_cgraph * graph, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + int32_t n_tokens, + int32_t kv_head, + const llm_build_cb & cb, + int64_t il) { + const auto & model = lctx.model; + const auto & kv_self = lctx.kv_self; + const auto & cparams = lctx.cparams; + + const auto & hparams = model.hparams; + + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + // compute the transposed [n_tokens, n_embd] V matrix + struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens)); + //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed + cb(v_cur_t, "v_cur_t", il); + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv_self.k, n_tokens*n_embd_gqa, + (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + cb(k_cache_view, "k_cache_view", il); + + struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv_self.v, n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + cb(v_cache_view, "v_cache_view", il); + + // important: storing RoPE-ed version of K in the KV cache! + ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); + ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view)); +} + static struct ggml_cgraph * llm_build_llama( llama_context & lctx, const llama_batch & batch, @@ -3385,40 +3423,22 @@ static struct ggml_cgraph * llm_build_llama( // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(tmpk, "tmpk", il); - - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(tmpq, "tmpq", il); - - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix - - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(tmpv, "tmpv", il); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); + Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + cb(Qcur, "Qcur", il); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); + Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 
0, 0, freq_base, freq_scale); + cb(Kcur, "Kcur", il); - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); cb(Q, "Q", il); @@ -3619,53 +3639,31 @@ static struct ggml_cgraph * llm_build_baichaun( // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(tmpk, "tmpk", il); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(tmpq, "tmpq", il); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - struct ggml_tensor * Kcur; - struct ggml_tensor * Qcur; switch (model.type) { case MODEL_7B: - Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); - Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); break; case MODEL_13B: - Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens); - Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); break; default: GGML_ASSERT(false); } - - cb(Kcur, "Kcur", il); - cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix - - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(tmpv, "tmpv", il); - - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); - - // important: storing RoPE-ed version of K in the KV cache! 
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); cb(Q, "Q", il); @@ -3865,14 +3863,14 @@ static struct ggml_cgraph * llm_build_falcon( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; + attn_norm = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(attn_norm, "attn_norm", il); + // self-attention { - attn_norm = llm_build_norm(ctx0, inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, norm_eps, cb, il); - cb(attn_norm, "attn_norm", il); - if (model.layers[il].attn_norm_2) { // Falcon-40B cur = llm_build_norm(ctx0, attn_norm, @@ -3885,7 +3883,6 @@ static struct ggml_cgraph * llm_build_falcon( } // compute QKV - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); @@ -3902,52 +3899,35 @@ static struct ggml_cgraph * llm_build_falcon( // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for // non-contiguous views is added for the rope operator - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d( + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d( ctx0, cur, n_embd_head, n_head, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), 0)); - cb(tmpq, "tmpq", il); + cb(Qcur, "Qcur", il); - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d( + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * n_head)); - cb(tmpk, "tmpk", il); + cb(Kcur, "Kcur", il); - struct ggml_tensor * tmpv = ggml_view_3d( + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv)); - cb(tmpv, "tmpv", il); + wsize * n_embd_head * (n_head + n_head_kv))); + cb(Vcur, "Vcur", il); // using mode = 2 for neox mode - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + Qcur = ggml_rope_custom(ctx0, Qcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + Kcur = ggml_rope_custom(ctx0, Kcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); cb(Kcur, "Kcur", il); - { - struct ggml_tensor * Vcur = ggml_cont(ctx0, tmpv); - cb(Vcur, "Vcur_0", il); - - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur_1", il); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); cb(Q, "Q", il); @@ 
-4118,40 +4098,25 @@ static struct ggml_cgraph * llm_build_starcoder( LLM_NORM, norm_eps, cb, il); cb(cur, "attn_norm", il); + // self-attention { - // Self Attention cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(tmpq, "tmpq", il); - cb(tmpk, "tmpk", il); - cb(tmpv, "tmpv", il); - - struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); - struct ggml_tensor * Kcur = tmpk; - - { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, tmpv); - cb(Vcur, "Vcur", il); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); cb(Q, "Q", il); @@ -4441,34 +4406,16 @@ static struct ggml_cgraph * llm_build_persimmon( Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); cb(Kcur, "Kcur", il); - { - struct ggml_tensor * tmpv = ggml_view_3d( - ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - ggml_element_size(tmpqkv_perm) * n_embd_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 + struct ggml_tensor * Vcur = ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, + ggml_element_size(tmpqkv_perm) * n_embd_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 ); - cb(tmpv, "tmpv", il); + cb(Vcur, "Vcur", il); - // store K, V in cache - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * k = ggml_view_1d( - ctx0, kv_self.k, n_tokens*n_embd_gqa, - (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head) - ); - cb(k, "k", il); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + 
kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); - - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, n_embd_head, n_kv, n_head_kv, ggml_element_size(kv_self.k)*n_embd_gqa, @@ -4632,40 +4579,22 @@ static struct ggml_cgraph * llm_build_refact( // self-attention { - // compute Q and K - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(tmpk, "tmpk", il); - - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(tmpq, "tmpq", il); - - struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix - - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(tmpv, "tmpv", il); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); cb(Q, "Q", il); @@ -4852,48 +4781,27 @@ static struct ggml_cgraph * llm_build_bloom( LLM_NORM, norm_eps, cb, il); cb(cur, "attn_norm", il); + // self-attention { - // Self Attention cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(tmpq, "tmpq", il); - cb(tmpk, "tmpk", il); - cb(tmpv, "tmpv", il); - - struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); - struct ggml_tensor * Kcur = tmpk; - - // store key and value to memory - { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, 
n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Qcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)), - 0, 2, 1, 3); + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); cb(Q, "Q", il); struct ggml_tensor * K = @@ -5075,8 +4983,6 @@ static struct ggml_cgraph * llm_build_mpt( { cur = attn_norm; - // compute QKV - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); @@ -5085,47 +4991,17 @@ static struct ggml_cgraph * llm_build_mpt( cb(cur, "wqkv_clamped", il); } - const size_t wsize = ggml_type_size(cur->type); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - struct ggml_tensor * Qcur = ggml_view_3d( - ctx0, cur, n_embd_head, n_head, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0); cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head); cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - struct ggml_tensor * tmpv = ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv)); - cb(tmpv, "tmpv", il); - - { - struct ggml_tensor * Vcur = ggml_cont(ctx0, tmpv); - cb(Vcur, "Vcur", il); - - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, 
n_tokens, kv_head, cb, il); struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); cb(Q, "Q", il); From 59908619386e8ce5f66fec5d76122a3b316e3861 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 21:11:20 +0200 Subject: [PATCH 06/18] llama : remove obsolete offload names --- llama.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 0d6c87f04e3e5..abc1cff167cd4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5213,12 +5213,9 @@ static const std::unordered_map k_offload_map { "tmpk", OFFLOAD_FUNC_KQ }, { "tmpq", OFFLOAD_FUNC_KQ }, { "tmpv", OFFLOAD_FUNC_V }, - { "tmpkqv", OFFLOAD_FUNC_KQ }, // ?? { "Kcur", OFFLOAD_FUNC_KQ }, { "Qcur", OFFLOAD_FUNC_KQ }, { "Vcur", OFFLOAD_FUNC_V }, - { "Vcur_0", OFFLOAD_FUNC_V }, - { "Vcur_1", OFFLOAD_FUNC_V }, { "krot", OFFLOAD_FUNC_KQ }, { "qrot", OFFLOAD_FUNC_KQ }, @@ -5227,9 +5224,6 @@ static const std::unordered_map k_offload_map { "krotated", OFFLOAD_FUNC_KQ }, { "qrotated", OFFLOAD_FUNC_KQ }, - { "k", OFFLOAD_FUNC_KQ }, - { "v", OFFLOAD_FUNC_V }, - { "Q", OFFLOAD_FUNC_KQ }, { "K", OFFLOAD_FUNC_KQ }, { "KQ", OFFLOAD_FUNC_KQ }, From 31a12f3d03c3ac493b8dc35e70b9a85d7fbd6135 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 21:17:46 +0200 Subject: [PATCH 07/18] llama : fix llm_build_k_shift to use n_head_kv instead of n_head --- llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index abc1cff167cd4..db5fe067c2911 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3253,8 +3253,8 @@ static void llm_build_k_shift( const auto & hparams = model.hparams; - const int64_t n_head = hparams.n_head; const int64_t n_layer = hparams.n_layer; + const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_gqa = hparams.n_embd_gqa(); const int64_t n_embd_head = hparams.n_embd_head(); @@ -3281,7 +3281,7 @@ static void llm_build_k_shift( // we rotate only the first n_rot dimensions ggml_rope_custom_inplace(ctx, ggml_view_3d(ctx, kv_self.k, - n_rot, n_head, n_ctx, + n_rot, n_head_kv, n_ctx, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), From a104abea4884ae6d84eb4f861c715ea94deca394 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 21:24:25 +0200 Subject: [PATCH 08/18] llama : simplify falcon Q, K, V computation --- llama.cpp | 37 +++++++------------------------------ 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/llama.cpp b/llama.cpp index db5fe067c2911..9ab85c90355e2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3886,40 +3886,17 @@ static struct ggml_cgraph * llm_build_falcon( cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - // Note that the strides for Kcur, Vcur are set up so that the - // resulting views are misaligned with the tensor's storage - // (by applying the K/V offset we shift the tensor's original - // view to stick out behind the viewed QKV tensor's allocated - // memory, so to say). This is ok because no actual accesses - // happen to that out-of-range memory, but it can require some - // trickery when trying to accurately dump these views for - // debugging. 
- - const size_t wsize = ggml_type_size(cur->type); - - // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for - // non-contiguous views is added for the rope operator - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0)); - cb(Qcur, "Qcur", il); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head)); + cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv))); cb(Vcur, "Vcur", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + // using mode = 2 for neox mode Qcur = ggml_rope_custom(ctx0, Qcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); cb(Qcur, "Qcur", il); From c9121fdd0fda14067349ba3bcea10e696537723f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 21:44:19 +0200 Subject: [PATCH 09/18] llama : remove obsolete comments in build graphs --- llama.cpp | 39 +-------------------------------------- 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/llama.cpp b/llama.cpp index 9ab85c90355e2..e7862e8f528c1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3638,7 +3638,6 @@ static struct ggml_cgraph * llm_build_baichaun( // self-attention { - // compute Q and K and RoPE them struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); @@ -3676,12 +3675,9 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); cb(K, "K", il); - // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); cb(KQ, "KQ", il); - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); cb(KQ_scaled, "KQ_scaled", il); @@ -3694,7 +3690,7 @@ static struct ggml_cgraph * llm_build_baichaun( break; case MODEL_13B: // TODO: replace with ggml_add() - KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); + KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); // TODO: n_head or n_head_kv cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); break; @@ -3702,11 +3698,9 @@ static struct ggml_cgraph * llm_build_baichaun( GGML_ASSERT(false); } - // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); cb(KQ_soft_max, "KQ_soft_max", il); - // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, n_kv, n_embd_head, n_head_kv, @@ -3718,15 +3712,12 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, 
KQ_soft_max); cb(KQV, "KQV", il); - // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); cb(KQV_merged, "KQV_merged", il); - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); cb(cur, "KQV_merged_contiguous", il); - // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); @@ -3882,7 +3873,6 @@ static struct ggml_cgraph * llm_build_falcon( cur = attn_norm; } - // compute QKV cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); @@ -4106,24 +4096,18 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); cb(K, "K", il); - // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); cb(KQ, "KQ", il); - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); cb(KQ_scaled, "KQ_scaled", il); - // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); cb(KQ_masked, "KQ_masked", il); - // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); cb(KQ_soft_max, "KQ_soft_max", il); - // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, n_kv, n_embd_head, n_head_kv, @@ -4142,7 +4126,6 @@ static struct ggml_cgraph * llm_build_starcoder( cb(cur, "KQV_merged_contiguous", il); } - // Projection cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); cb(cur, "result_wo", il); @@ -4506,8 +4489,6 @@ static struct ggml_cgraph * llm_build_refact( const int32_t n_kv = worst_case ? n_ctx : kv_self.n; const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; - // printf("n_kv = %d\n", n_kv); - auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { @@ -4584,27 +4565,21 @@ static struct ggml_cgraph * llm_build_refact( ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); cb(K, "K", il); - // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); cb(KQ, "KQ", il); - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_kv, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); cb(KQ_scaled, "KQ_scaled", il); - // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); cb(KQ_masked, "KQ_masked", il); - // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); cb(KQ_soft_max, "KQ_soft_max", il); - // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, n_kv, n_embd_head, n_head_kv, @@ -4616,15 +4591,12 @@ static struct ggml_cgraph * llm_build_refact( struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); cb(KQV, "KQV", il); - // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); cb(KQV_merged, "KQV_merged", il); - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); cb(cur, "KQV_merged_contiguous", il); - // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); @@ -4789,27 +4761,21 @@ static struct ggml_cgraph * llm_build_bloom( ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); cb(K, "K", il); - // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); cb(KQ, "KQ", il); - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); cb(KQ_scaled, "KQ_scaled", il); struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8); cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); - // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); cb(KQ_masked, "KQ_masked", il); - // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); cb(KQ_soft_max, "KQ_soft_max", il); - // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, n_kv, n_embd_head, n_head_kv, @@ -4821,16 +4787,13 @@ static struct ggml_cgraph * llm_build_bloom( struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); cb(KQV, "KQV", il); - // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); cb(KQV_merged, "KQV_merged", il); - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); cb(cur, "KQV_merged_contiguous", il); } - // Projection cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); cb(cur, "result_wo", il); From f39e6075cf40f6af5befeb43049962af6e6a7c0e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 22:26:36 +0200 Subject: [PATCH 10/18] llama : add llm_build_kqv helper ggml-ci --- llama.cpp | 686 +++++++++++++++++------------------------------------- 1 file changed, 209 insertions(+), 477 deletions(-) diff --git a/llama.cpp b/llama.cpp index e7862e8f528c1..ae4777cf0d8e8 100644 --- a/llama.cpp 
+++ b/llama.cpp @@ -3093,6 +3093,103 @@ static bool llama_model_load( using llm_build_cb = std::function; +enum llm_rope_type { + LLM_ROPE, + LLM_ROPE_NEOX, + LLM_ROPE_GLM, +}; + +// Persimmon: n_rot = n_embd_head/2 +// Other: n_rot = n_embd_head +static void llm_build_k_shift( + const llama_context & lctx, + struct ggml_context * ctx, + struct ggml_cgraph * graph, + int64_t n_rot, + llm_rope_type type, + const llm_build_cb & cb) { + const auto & model = lctx.model; + const auto & kv_self = lctx.kv_self; + const auto & cparams = lctx.cparams; + + const auto & hparams = model.hparams; + + const int64_t n_layer = hparams.n_layer; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int64_t n_embd_head = hparams.n_embd_head(); + + const int64_t n_ctx = lctx.cparams.n_ctx; + + const float freq_base = cparams.rope_freq_base; + const float freq_scale = cparams.rope_freq_scale; + + GGML_ASSERT(n_embd_head % n_rot == 0); + + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); + cb(K_shift, "K_shift", -1); + + int rope_type = 0; + + switch (type) { + case LLM_ROPE: rope_type = 0; break; + case LLM_ROPE_NEOX: rope_type = 2; break; + case LLM_ROPE_GLM: rope_type = 4; break; + }; + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * tmp = + // we rotate only the first n_rot dimensions + ggml_rope_custom_inplace(ctx, + ggml_view_3d(ctx, kv_self.k, + n_rot, n_head_kv, n_ctx, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + K_shift, n_rot, rope_type, 0, freq_base, freq_scale); + cb(tmp, "K_shifted", il); + ggml_build_forward_expand(graph, tmp); + } +} + +static void llm_build_kv_store( + const llama_context & lctx, + struct ggml_context * ctx, + struct ggml_cgraph * graph, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + int32_t n_tokens, + int32_t kv_head, + const llm_build_cb & cb, + int64_t il) { + const auto & model = lctx.model; + const auto & kv_self = lctx.kv_self; + const auto & cparams = lctx.cparams; + + const auto & hparams = model.hparams; + + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + // compute the transposed [n_tokens, n_embd] V matrix + struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens)); + //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed + cb(v_cur_t, "v_cur_t", il); + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv_self.k, n_tokens*n_embd_gqa, + (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + cb(k_cache_view, "k_cache_view", il); + + struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv_self.v, n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + cb(v_cache_view, "v_cache_view", il); + + // important: storing RoPE-ed version of K in the KV cache! 
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); + ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view)); +} + enum llm_norm_type { LLM_NORM, LLM_NORM_RMS, @@ -3232,101 +3329,93 @@ static struct ggml_tensor * llm_build_ffn( return cur; } -enum llm_rope_type { - LLM_ROPE, - LLM_ROPE_NEOX, - LLM_ROPE_GLM, -}; - -// Persimmon: n_rot = n_embd_head/2 -// Other: n_rot = n_embd_head -static void llm_build_k_shift( +// if max_alibi_bias > 0 then apply ALiBi +static struct ggml_tensor * llm_build_kqv( const llama_context & lctx, struct ggml_context * ctx, - struct ggml_cgraph * graph, - int64_t n_rot, - llm_rope_type type, - const llm_build_cb & cb) { + struct ggml_tensor * cur, + struct ggml_tensor * wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * q_cur, + struct ggml_tensor * kq_scale, + struct ggml_tensor * kq_mask, + int32_t n_tokens, + int32_t n_kv, + float alibi_bias_max, + const llm_build_cb & cb, + int il) { const auto & model = lctx.model; const auto & kv_self = lctx.kv_self; const auto & cparams = lctx.cparams; const auto & hparams = model.hparams; - const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_embd = hparams.n_embd; + const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_gqa = hparams.n_embd_gqa(); const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); - const int64_t n_ctx = lctx.cparams.n_ctx; + struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); + cb(q, "q", il); - const float freq_base = cparams.rope_freq_base; - const float freq_scale = cparams.rope_freq_scale; + struct ggml_tensor * k = + ggml_view_3d(ctx, kv_self.k, + n_embd_head, n_kv, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + cb(k, "k", il); - GGML_ASSERT(n_embd_head % n_rot == 0); + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + cb(kq, "kq", il); - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); + kq = ggml_scale(ctx, kq, kq_scale); + cb(kq, "kq_scaled", il); - int rope_type = 0; + if (alibi_bias_max > 0.0f) { + // TODO: n_head or n_head_kv + // TODO: K-shift is likely not working + // TODO: change to ggml_add + kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, alibi_bias_max); + cb(kq, "kq_scaled_alibi", il); + } - switch (type) { - case LLM_ROPE: rope_type = 0; break; - case LLM_ROPE_NEOX: rope_type = 2; break; - case LLM_ROPE_GLM: rope_type = 4; break; - }; + kq = ggml_add(ctx, kq, kq_mask); + cb(kq, "kq_masked", il); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - // we rotate only the first n_rot dimensions - ggml_rope_custom_inplace(ctx, - ggml_view_3d(ctx, kv_self.k, - n_rot, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_rot, rope_type, 0, freq_base, freq_scale); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(graph, tmp); - } -} + kq = ggml_soft_max(ctx, kq); + cb(kq, "kq_soft_max", il); -static void llm_build_kv_store( - const llama_context & lctx, - struct ggml_context * ctx, - struct ggml_cgraph * graph, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - int32_t n_tokens, - int32_t kv_head, - const llm_build_cb & cb, - int64_t il) { - const auto & model = lctx.model; - 
const auto & kv_self = lctx.kv_self; - const auto & cparams = lctx.cparams; + // split cached v into n_head heads + struct ggml_tensor * v = + ggml_view_3d(ctx, kv_self.v, + n_kv, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + cb(v, "v", il); - const auto & hparams = model.hparams; + struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); + cb(kqv, "kqv", il); - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_embd_gqa = hparams.n_embd_gqa(); + struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); - // compute the transposed [n_tokens, n_embd] V matrix - struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens)); - //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed - cb(v_cur_t, "v_cur_t", il); + cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens); + cb(cur, "kqv_merged_cont", il); - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv_self.k, n_tokens*n_embd_gqa, - (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k_cache_view, "k_cache_view", il); + cur = ggml_mul_mat(ctx, wo, cur); + if (wo_b) { + cb(cur, "kqv_wo", il); + } - struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v_cache_view, "v_cache_view", il); + if (wo_b) { + cur = ggml_add(ctx, cur, wo_b); + } - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); - ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view)); + return cur; } static struct ggml_cgraph * llm_build_llama( @@ -3348,7 +3437,6 @@ static struct ggml_cgraph * llm_build_llama( const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3440,67 +3528,10 @@ static struct ggml_cgraph * llm_build_llama( llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_kv, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - 
-#if 1 - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? - struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - cb(cur, "result_wo", il); + cur = llm_build_kqv(lctx, ctx0, cur, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); @@ -3567,7 +3598,6 @@ static struct ggml_cgraph * llm_build_baichaun( const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3664,64 +3694,13 @@ static struct ggml_cgraph * llm_build_baichaun( llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); + // apply ALiBi for 13B model + const float alibi_bias_max = model.type == MODEL_13B ? 
8.0f : -1.0f; - struct ggml_tensor * KQ_masked; - struct ggml_tensor * KQ_scaled_alibi; - - switch (model.type) { - case MODEL_7B: - KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - break; - case MODEL_13B: - // TODO: replace with ggml_add() - KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); // TODO: n_head or n_head_kv - cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); - KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - break; - default: - GGML_ASSERT(false); - } - - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - cb(cur, "result_wo", il); + cur = llm_build_kqv(lctx, ctx0, cur, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, alibi_bias_max, cb, il); + cb(cur, "kqv_out", il); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); @@ -3896,48 +3875,10 @@ static struct ggml_cgraph * llm_build_falcon( llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo", il); + cur = llm_build_kqv(lctx, ctx0, attn_norm, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } struct ggml_tensor * attn_out = cur; @@ -3998,7 +3939,6 @@ static struct ggml_cgraph * llm_build_starcoder( const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); @@ -4085,50 +4025,12 @@ static struct 
ggml_cgraph * llm_build_starcoder( llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); + cur = llm_build_kqv(lctx, ctx0, cur, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); - cb(cur, "result_wo", il); - // Add the input cur = ggml_add(ctx0, cur, inpL); cb(cur, "inpL_+_result_wo", il); @@ -4190,7 +4092,6 @@ static struct ggml_cgraph * llm_build_persimmon( const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_head = hparams.n_head; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); const int64_t n_rot = n_embd_head / 2; const float freq_base = cparams.rope_freq_base; @@ -4376,47 +4277,11 @@ static struct ggml_cgraph * llm_build_persimmon( llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo", 
il); - - cur = ggml_add(ctx0, cur, model.layers[il].bo); - cb(cur, "result_wo_b", il); + // TODO: not tested, could be broken + cur = llm_build_kqv(lctx, ctx0, Q, + model.layers[il].wo, model.layers[il].bo, + Q, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur); @@ -4481,7 +4346,6 @@ static struct ggml_cgraph * llm_build_refact( const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); const float norm_rms_eps = hparams.f_norm_rms_eps; @@ -4554,53 +4418,10 @@ static struct ggml_cgraph * llm_build_refact( llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - cb(cur, "result_wo", il); + cur = llm_build_kqv(lctx, ctx0, Qcur, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); @@ -4665,7 +4486,6 @@ static struct ggml_cgraph * llm_build_bloom( const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); @@ -4750,56 +4570,12 @@ static struct ggml_cgraph * llm_build_bloom( llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - struct ggml_tensor * 
KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8); - cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); + cur = llm_build_kqv(lctx, ctx0, Qcur, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); } - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bo); - cb(cur, "result_wo_b", il); - // Add the input cur = ggml_add(ctx0, cur, inpL); cb(cur, "inpL_+_result_wo", il); @@ -4859,7 +4635,6 @@ static struct ggml_cgraph * llm_build_mpt( const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); @@ -4943,52 +4718,10 @@ static struct ggml_cgraph * llm_build_mpt( llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - // TODO: replace with ggml_add() - struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias); - cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo", il); + cur = llm_build_kqv(lctx, ctx0, Qcur, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, max_alibi_bias, cb, il); + cb(cur, "kqv_out", il); } // Add the input @@ -5164,22 +4897,21 @@ 
static const std::unordered_map k_offload_map { "krotated", OFFLOAD_FUNC_KQ }, { "qrotated", OFFLOAD_FUNC_KQ }, - { "Q", OFFLOAD_FUNC_KQ }, - { "K", OFFLOAD_FUNC_KQ }, - { "KQ", OFFLOAD_FUNC_KQ }, - { "KQ_scaled", OFFLOAD_FUNC_KQ }, - { "KQ_scaled_alibi", OFFLOAD_FUNC_KQ }, - { "KQ_masked", OFFLOAD_FUNC_KQ }, - { "KQ_soft_max", OFFLOAD_FUNC_V }, - { "V", OFFLOAD_FUNC_V }, - { "KQV", OFFLOAD_FUNC_V }, - { "KQV_merged", OFFLOAD_FUNC_V }, - { "KQV_merged_contiguous", OFFLOAD_FUNC_V }, - - { "result_wo", OFFLOAD_FUNC }, - { "result_wo_b", OFFLOAD_FUNC }, - { "inpL_+_result_wo", OFFLOAD_FUNC }, + { "q", OFFLOAD_FUNC_KQ }, + { "k", OFFLOAD_FUNC_KQ }, + { "kq", OFFLOAD_FUNC_KQ }, + { "kq_scaled", OFFLOAD_FUNC_KQ }, + { "kq_scaled_alibi", OFFLOAD_FUNC_KQ }, + { "kq_masked", OFFLOAD_FUNC_KQ }, + { "kq_soft_max", OFFLOAD_FUNC_V }, + { "v", OFFLOAD_FUNC_V }, + { "kqv", OFFLOAD_FUNC_V }, + { "kqv_merged", OFFLOAD_FUNC_V }, + { "kqv_merged_cont", OFFLOAD_FUNC_V }, + { "kqv_wo", OFFLOAD_FUNC_V }, + { "kqv_out", OFFLOAD_FUNC_V }, + { "inpL_+_result_wo", OFFLOAD_FUNC }, { "inpFF", OFFLOAD_FUNC }, { "ffn_norm", OFFLOAD_FUNC }, From 792d1a1b167a3ff41684652a62ba2e3a1444df9c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 30 Oct 2023 11:34:47 +0200 Subject: [PATCH 11/18] llama : minor --- llama.cpp | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/llama.cpp b/llama.cpp index ae4777cf0d8e8..d6b2d7289990a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3503,8 +3503,7 @@ static struct ggml_cgraph * llm_build_llama( // norm cur = llm_build_norm(ctx0, inpL, - model.layers[il].attn_norm, - NULL, + model.layers[il].attn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "attn_norm", il); @@ -3540,8 +3539,7 @@ static struct ggml_cgraph * llm_build_llama( // feed-forward network { cur = llm_build_norm(ctx0, inpFF, - model.layers[il].ffn_norm, - NULL, + model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); @@ -3563,8 +3561,7 @@ static struct ggml_cgraph * llm_build_llama( cur = inpL; cur = llm_build_norm(ctx0, cur, - model.output_norm, - NULL, + model.output_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, -1); cb(cur, "result_norm", -1); @@ -3661,8 +3658,7 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_tensor * inpSA = inpL; cur = llm_build_norm(ctx0, inpL, - model.layers[il].attn_norm, - NULL, + model.layers[il].attn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "attn_norm", il); @@ -3709,8 +3705,7 @@ static struct ggml_cgraph * llm_build_baichaun( // feed-forward network { cur = llm_build_norm(ctx0, inpFF, - model.layers[il].ffn_norm, - NULL, + model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); @@ -3732,8 +3727,7 @@ static struct ggml_cgraph * llm_build_baichaun( cur = inpL; cur = llm_build_norm(ctx0, cur, - model.output_norm, - NULL, + model.output_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, -1); cb(cur, "result_norm", -1); @@ -4394,8 +4388,7 @@ static struct ggml_cgraph * llm_build_refact( struct ggml_tensor * inpSA = inpL; cur = llm_build_norm(ctx0, inpL, - model.layers[il].attn_norm, - NULL, + model.layers[il].attn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "attn_norm", il); @@ -4430,8 +4423,7 @@ static struct ggml_cgraph * llm_build_refact( // feed-forward network { cur = llm_build_norm(ctx0, inpFF, - model.layers[il].ffn_norm, - NULL, + model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); @@ 
-4453,8 +4445,7 @@ static struct ggml_cgraph * llm_build_refact( cur = inpL; cur = llm_build_norm(ctx0, cur, - model.output_norm, - NULL, + model.output_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, -1); cb(cur, "result_norm", -1); From a3f80013adf4837326583c1ff3285cbbe66e2d8d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 30 Oct 2023 12:14:23 +0200 Subject: [PATCH 12/18] llama : add LLAMA_OFFLOAD_DEBUG + fix starcoder offloading --- llama.cpp | 109 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 74 insertions(+), 35 deletions(-) diff --git a/llama.cpp b/llama.cpp index d6b2d7289990a..a3a4ba6f6b0c5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3548,11 +3548,11 @@ static struct ggml_cgraph * llm_build_llama( model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_result", il); + cb(cur, "ffn_out", il); } cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_+_result_w2", il); + cb(cur, "inpFF_ffn_out", il); // input for next layer inpL = cur; @@ -3714,11 +3714,11 @@ static struct ggml_cgraph * llm_build_baichaun( model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_result", il); + cb(cur, "ffn_out", il); } cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_+_result_w2", il); + cb(cur, "inpFF_ffn_out", il); // input for next layer inpL = cur; @@ -3884,14 +3884,14 @@ static struct ggml_cgraph * llm_build_falcon( NULL, NULL, model.layers[il].ffn_down, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_result", il); + cb(cur, "ffn_out", il); } cur = ggml_add(ctx0, cur, attn_out); - cb(cur, "inpFF_+_result_w2", il); + cb(cur, "inpFF_ffn_out", il); cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_+_inpFF_+_result_w2", il); + cb(cur, "inpL_inpFF_ffn_out", il); // input for next layer inpL = cur; @@ -3988,6 +3988,7 @@ static struct ggml_cgraph * llm_build_starcoder( cb(KQ_mask, "KQ_mask", -1); pos = ggml_get_rows(ctx0, model.pos_embeddings, inp_pos); + cb(pos, "pos_embd", -1); inpL = ggml_add(ctx0, embd, pos); cb(inpL, "inpL", -1); @@ -4027,7 +4028,7 @@ static struct ggml_cgraph * llm_build_starcoder( // Add the input cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_+_result_wo", il); + cb(cur, "inpL_kqv_out", il); struct ggml_tensor * inpFF = cur; @@ -4044,11 +4045,11 @@ static struct ggml_cgraph * llm_build_starcoder( NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_result", il); + cb(cur, "ffn_out", il); } inpL = ggml_add(ctx0, cur, inpFF); - + cb(inpL, "inpL_inpFF_ffn_out", il); } cur = llm_build_norm(ctx0, inpL, @@ -4294,11 +4295,11 @@ static struct ggml_cgraph * llm_build_persimmon( NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_result", il); + cb(cur, "ffn_out", il); } cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_+_result_w2", il); + cb(cur, "inpFF_ffn_out", il); inpL = cur; } @@ -4432,11 +4433,11 @@ static struct ggml_cgraph * llm_build_refact( model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_result", il); + cb(cur, "ffn_out", il); } cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_+_result_w2", il); + cb(cur, "inpFF_ffn_out", il); // input for next layer inpL = cur; @@ -4569,7 +4570,7 @@ static struct ggml_cgraph * llm_build_bloom( // Add the input cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_+_result_wo", il); + 
cb(cur, "inpL_kqv_out", il); struct ggml_tensor * inpFF = cur; @@ -4586,11 +4587,11 @@ static struct ggml_cgraph * llm_build_bloom( NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_result", il); + cb(cur, "ffn_out", il); } inpL = ggml_add(ctx0, cur, inpFF); - cb(inpL, "inpFF_+_result_w2", il); + cb(inpL, "inpFF_ffn_out", il); } cur = llm_build_norm(ctx0, inpL, @@ -4717,7 +4718,7 @@ static struct ggml_cgraph * llm_build_mpt( // Add the input cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_+_result_wo", il); + cb(cur, "inpL_kqv_out", il); struct ggml_tensor * attn_out = cur; @@ -4734,11 +4735,11 @@ static struct ggml_cgraph * llm_build_mpt( NULL, NULL, model.layers[il].ffn_down, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_result", il); + cb(cur, "ffn_out", il); } cur = ggml_add(ctx0, cur, attn_out); - cb(cur, "inpL_+_inpFF_+_result_w2", il); + cb(cur, "inpL_inpFF_ffn_out", il); // input for next layer inpL = cur; @@ -4777,6 +4778,7 @@ enum llm_offload_func_e { OFFLOAD_FUNC_OUT, }; +// TODO: will be removed with backend v2 struct llm_offload_trie { struct node { ~node() { @@ -4850,10 +4852,12 @@ struct llm_offload_trie { node * root = nullptr; }; +// TODO: will be removed with backend v2 static const std::unordered_map k_offload_map = { //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel { "inp_pos", OFFLOAD_FUNC_NR }, + { "pos_embd", OFFLOAD_FUNC_NR }, { "KQ_mask", OFFLOAD_FUNC_NR }, { "K_shift", OFFLOAD_FUNC_NR }, @@ -4902,7 +4906,7 @@ static const std::unordered_map k_offload_map { "kqv_wo", OFFLOAD_FUNC_V }, { "kqv_out", OFFLOAD_FUNC_V }, - { "inpL_+_result_wo", OFFLOAD_FUNC }, + { "inpL_kqv_out", OFFLOAD_FUNC }, { "inpFF", OFFLOAD_FUNC }, { "ffn_norm", OFFLOAD_FUNC }, @@ -4914,15 +4918,15 @@ static const std::unordered_map k_offload_map { "ffn_gate_par", OFFLOAD_FUNC }, { "ffn_down", OFFLOAD_FUNC }, { "ffn_down_b", OFFLOAD_FUNC }, - { "ffn_result", OFFLOAD_FUNC }, + { "ffn_out", OFFLOAD_FUNC }, { "ffn_silu", OFFLOAD_FUNC }, { "ffn_gelu", OFFLOAD_FUNC }, { "ffn_relu", OFFLOAD_FUNC }, { "ffn_sqr(relu)", OFFLOAD_FUNC }, - { "inpFF_+_result_w2", OFFLOAD_FUNC }, - { "inpL_+_inpFF_+_result_w2", OFFLOAD_FUNC }, + { "inpFF_ffn_out", OFFLOAD_FUNC }, + { "inpL_inpFF_ffn_out", OFFLOAD_FUNC }, { "result_norm", OFFLOAD_FUNC_EMB }, { "result_output", OFFLOAD_FUNC_OUT }, @@ -4946,6 +4950,14 @@ static struct ggml_cgraph * llama_build_graph( bool alloc_inp_KQ_mask = false; bool alloc_inp_K_shift = false; +#ifdef GGML_USE_CUBLAS + const bool do_offload = true; +#else + const bool do_offload = true; // TODO: set to false after finishing refactoring +#endif + + int n_non_view = 0; // number of non-view tensors that have been processed by the callback + // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { if (il >= 0) { @@ -5053,26 +5065,26 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_K_shift = true; } + // view tensors are not processed further + if (cur->view_src != nullptr) { + return; + } + + if (cur->op != GGML_OP_NONE) { + n_non_view++; + } + // // offload layers // - // TODO: this code will be obsoleted with backend v2 + // TODO: will be removed with backend v2 -#ifdef GGML_USE_CUBLAS - const bool do_offload = true; -#else - const bool do_offload = true; // TODO: set to false after finishing refactoring -#endif +//#define LLAMA_OFFLOAD_DEBUG if (!do_offload) { return; } - // view tensors are not offloaded - if (cur->view_src != nullptr) { - return; - } - const int n_layer = model.hparams.n_layer; const int n_gpu_layers = model.n_gpu_layers; @@ -5103,11 +5115,13 @@ static struct ggml_cgraph * llama_build_graph( llm_offload_func_e func_e = k_offload_func_trie.find(name); if (func_e == OFFLOAD_FUNC_NOP) { +#ifdef LLAMA_OFFLOAD_DEBUG // if a tensor hasn't been offloaded, we warn the user if (worst_case) { LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); } +#endif return; } @@ -5170,9 +5184,11 @@ static struct ggml_cgraph * llama_build_graph( // apply offload function to the tensor func(cur); +#ifdef LLAMA_OFFLOAD_DEBUG if (worst_case) { LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str()); } +#endif }; struct ggml_cgraph * result = NULL; @@ -5214,6 +5230,29 @@ static struct ggml_cgraph * llama_build_graph( GGML_ASSERT(false); } + if (worst_case) { + int n_non_view_total = 0; + + for (int i = 0; i < result->n_nodes; ++i) { + if (result->nodes[i]->view_src == nullptr) { + n_non_view_total++; + } + } + + LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total); + +#ifdef LLAMA_OFFLOAD_DEBUG + if (n_non_view != n_non_view_total) { + LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); + LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__); + LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__); + LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__); + LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__); + LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); + } +#endif + } + return result; } From 2926ef63b179537627d6722c58cc2d666ac7d30e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 31 Oct 2023 08:23:43 +0200 Subject: [PATCH 13/18] llama : fix input allocation logic --- llama.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index a3a4ba6f6b0c5..75a74c5a473dc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4970,10 +4970,10 @@ static struct ggml_cgraph * llama_build_graph( // allocate input tensors and set input data // - if (batch.token && !alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { + if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { ggml_allocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) { const int64_t n_tokens = cur->ne[0]; memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur)); @@ -4982,10 +4982,10 @@ static struct ggml_cgraph * 
llama_build_graph( alloc_inp_tokens = true; } - if (batch.embd && !alloc_inp_embd && strcmp(name, "inp_embd") == 0) { + if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) { ggml_allocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_allocr_is_measure(lctx.alloc) && batch.embd) { const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; @@ -4995,10 +4995,10 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_embd = true; } - if (batch.pos && !alloc_inp_pos && strcmp(name, "inp_pos") == 0) { + if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) { ggml_allocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) { const int64_t n_tokens = cur->ne[0]; int32_t * data = (int32_t *) cur->data; From 6669cd8329e443ca4fae635a47da6318767b0f7b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 31 Oct 2023 08:24:07 +0200 Subject: [PATCH 14/18] llama : update offload functions for KQ tensors --- llama.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 75a74c5a473dc..e744fa217b84b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4856,12 +4856,13 @@ struct llm_offload_trie { static const std::unordered_map k_offload_map = { //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel - { "inp_pos", OFFLOAD_FUNC_NR }, { "pos_embd", OFFLOAD_FUNC_NR }, - { "KQ_mask", OFFLOAD_FUNC_NR }, - { "K_shift", OFFLOAD_FUNC_NR }, - { "K_shifted", OFFLOAD_FUNC_NR }, + { "inp_pos", OFFLOAD_FUNC_KQ }, // this is often used for KQ ops (e.g. rope) + { "KQ_scale", OFFLOAD_FUNC_KQ }, + { "KQ_mask", OFFLOAD_FUNC_KQ }, + { "K_shift", OFFLOAD_FUNC_KQ }, + { "K_shifted", OFFLOAD_FUNC_KQ }, { "inp_norm", OFFLOAD_FUNC_NR }, { "inp_norm_w", OFFLOAD_FUNC_NR }, From 0bfdcdd0f83ed637179e5ac49a0ddd551763634d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 31 Oct 2023 08:46:34 +0200 Subject: [PATCH 15/18] llama : normalize tensor names ggml-ci --- llama.cpp | 95 ++++++++++++++++++++++++++----------------------------- 1 file changed, 44 insertions(+), 51 deletions(-) diff --git a/llama.cpp b/llama.cpp index e744fa217b84b..f69af36ec1e4a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3533,12 +3533,12 @@ static struct ggml_cgraph * llm_build_llama( cb(cur, "kqv_out", il); } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - cb(inpFF, "inpFF", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); // feed-forward network { - cur = llm_build_norm(ctx0, inpFF, + cur = llm_build_norm(ctx0, ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); @@ -3551,8 +3551,8 @@ static struct ggml_cgraph * llm_build_llama( cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_ffn_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -3699,12 +3699,12 @@ static struct ggml_cgraph * llm_build_baichaun( cb(cur, "kqv_out", il); } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - cb(inpFF, "inpFF", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); // feed-forward network { - cur = llm_build_norm(ctx0, inpFF, + cur = llm_build_norm(ctx0, ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); @@ 
-3717,8 +3717,8 @@ static struct ggml_cgraph * llm_build_baichaun( cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_ffn_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -3875,7 +3875,7 @@ static struct ggml_cgraph * llm_build_falcon( cb(cur, "kqv_out", il); } - struct ggml_tensor * attn_out = cur; + struct ggml_tensor * ffn_inp = cur; // feed forward { @@ -3887,11 +3887,11 @@ static struct ggml_cgraph * llm_build_falcon( cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, attn_out); - cb(cur, "inpFF_ffn_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_inpFF_ffn_out", il); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -4026,15 +4026,13 @@ static struct ggml_cgraph * llm_build_starcoder( cb(cur, "kqv_out", il); } - // Add the input - cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_kqv_out", il); - - struct ggml_tensor * inpFF = cur; + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); // FF { - cur = llm_build_norm(ctx0, inpFF, + cur = llm_build_norm(ctx0, ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, norm_eps, cb, il); @@ -4048,8 +4046,8 @@ static struct ggml_cgraph * llm_build_starcoder( cb(cur, "ffn_out", il); } - inpL = ggml_add(ctx0, cur, inpFF); - cb(inpL, "inpL_inpFF_ffn_out", il); + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); } cur = llm_build_norm(ctx0, inpL, @@ -4279,12 +4277,12 @@ static struct ggml_cgraph * llm_build_persimmon( cb(cur, "kqv_out", il); } - struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur); - cb(inpFF, "inpFF", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); + cb(ffn_inp, "ffn_inp", il); // feed-forward network { - cur = llm_build_norm(ctx0, inpFF, + cur = llm_build_norm(ctx0, ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, norm_eps, cb, il); @@ -4298,8 +4296,8 @@ static struct ggml_cgraph * llm_build_persimmon( cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_ffn_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); inpL = cur; } @@ -4418,12 +4416,12 @@ static struct ggml_cgraph * llm_build_refact( cb(cur, "kqv_out", il); } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - cb(inpFF, "inpFF", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); // feed-forward network { - cur = llm_build_norm(ctx0, inpFF, + cur = llm_build_norm(ctx0, ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); @@ -4436,8 +4434,8 @@ static struct ggml_cgraph * llm_build_refact( cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_ffn_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -4569,14 +4567,12 @@ static struct ggml_cgraph * llm_build_bloom( } // Add the input - cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_kqv_out", il); - - struct ggml_tensor * inpFF = cur; + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); // FF { - cur = llm_build_norm(ctx0, inpFF, + cur = llm_build_norm(ctx0, ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, norm_eps, cb, il); @@ -4590,8 +4586,8 @@ static struct ggml_cgraph * llm_build_bloom( cb(cur, "ffn_out", il); } - 
inpL = ggml_add(ctx0, cur, inpFF); - cb(inpL, "inpFF_ffn_out", il); + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); } cur = llm_build_norm(ctx0, inpL, @@ -4717,14 +4713,12 @@ static struct ggml_cgraph * llm_build_mpt( } // Add the input - cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_kqv_out", il); - - struct ggml_tensor * attn_out = cur; + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); // feed forward { - cur = llm_build_norm(ctx0, attn_out, + cur = llm_build_norm(ctx0, ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM, norm_eps, cb, il); @@ -4738,8 +4732,8 @@ static struct ggml_cgraph * llm_build_mpt( cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, attn_out); - cb(cur, "inpL_inpFF_ffn_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -4907,9 +4901,7 @@ static const std::unordered_map k_offload_map { "kqv_wo", OFFLOAD_FUNC_V }, { "kqv_out", OFFLOAD_FUNC_V }, - { "inpL_kqv_out", OFFLOAD_FUNC }, - { "inpFF", OFFLOAD_FUNC }, - + { "ffn_inp", OFFLOAD_FUNC }, { "ffn_norm", OFFLOAD_FUNC }, { "ffn_up", OFFLOAD_FUNC }, @@ -4926,8 +4918,7 @@ static const std::unordered_map k_offload_map { "ffn_relu", OFFLOAD_FUNC }, { "ffn_sqr(relu)", OFFLOAD_FUNC }, - { "inpFF_ffn_out", OFFLOAD_FUNC }, - { "inpL_inpFF_ffn_out", OFFLOAD_FUNC }, + { "l_out", OFFLOAD_FUNC }, { "result_norm", OFFLOAD_FUNC_EMB }, { "result_output", OFFLOAD_FUNC_OUT }, @@ -4960,6 +4951,7 @@ static struct ggml_cgraph * llama_build_graph( int n_non_view = 0; // number of non-view tensors that have been processed by the callback // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) + // TODO: will be removed with backend v2 llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { if (il >= 0) { ggml_format_name(cur, "%s-%d", name, il); @@ -4970,6 +4962,7 @@ static struct ggml_cgraph * llama_build_graph( // // allocate input tensors and set input data // + // TODO: will be removed with backend v2 if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { ggml_allocr_alloc(lctx.alloc, cur); From fc5a26aadea54e2bcf6dd384e1ca0c846575bc0c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 31 Oct 2023 08:57:10 +0200 Subject: [PATCH 16/18] llama : enable warning about not offloaded tensors --- llama.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index f69af36ec1e4a..68cb835e1ee21 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5235,7 +5235,6 @@ static struct ggml_cgraph * llama_build_graph( LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total); -#ifdef LLAMA_OFFLOAD_DEBUG if (n_non_view != n_non_view_total) { LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__); @@ -5244,7 +5243,6 @@ static struct ggml_cgraph * llama_build_graph( LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__); LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); } -#endif } return result; From 2073347e3bbbf437c2dff5df22baa1f754468633 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 31 Oct 2023 16:28:09 +0200 Subject: [PATCH 17/18] llama : remove extra ; + deduplicate gate_b logic --- llama.cpp | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff 
--git a/llama.cpp b/llama.cpp index 68cb835e1ee21..5fed1c80578e4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3135,7 +3135,7 @@ static void llm_build_k_shift( case LLM_ROPE: rope_type = 0; break; case LLM_ROPE_NEOX: rope_type = 2; break; case LLM_ROPE_GLM: rope_type = 4; break; - }; + } for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = @@ -3207,7 +3207,8 @@ static struct ggml_tensor * llm_build_norm( switch (type) { case LLM_NORM: cur = ggml_norm (ctx, cur, eps); break; case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, eps); break; - }; + } + if (mw || mb) { cb(cur, "norm", il); } @@ -3265,23 +3266,18 @@ static struct ggml_tensor * llm_build_ffn( { cur = ggml_mul_mat(ctx, gate, tmp); cb(cur, "ffn_gate", il); - - if (gate_b) { - cur = ggml_add(ctx, cur, gate_b); - cb(cur, "ffn_gate_b", il); - } } break; case LLM_FFN_PAR: { cur = ggml_mul_mat(ctx, gate, cur); cb(cur, "ffn_gate", il); - - if (gate_b) { - cur = ggml_add(ctx, cur, gate_b); - cb(cur, "ffn_gate_b", il); - } } break; - }; + } + + if (gate_b) { + cur = ggml_add(ctx, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } } else { cur = tmp; } @@ -3310,7 +3306,7 @@ static struct ggml_tensor * llm_build_ffn( cur = ggml_sqr(ctx, cur); cb(cur, "ffn_sqr(relu)", il); } break; - }; + } if (type_gate == LLM_FFN_PAR) { cur = ggml_mul(ctx, cur, tmp); @@ -4098,6 +4094,7 @@ static struct ggml_cgraph * llm_build_persimmon( const bool do_rope_shift = worst_case || kv_self.has_shift; auto & buf_compute = lctx.buf_compute; + struct ggml_init_params params = { /*.mem_size =*/ buf_compute.size, /*.mem_buffer =*/ buf_compute.data, From 7923b70cb8033ccf4097b7a95013fa6b0ad22ecd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 31 Oct 2023 16:43:08 +0200 Subject: [PATCH 18/18] llama : add llm_build_inp_embd helper --- llama.cpp | 161 +++++++++++++++++------------------------------------- 1 file changed, 50 insertions(+), 111 deletions(-) diff --git a/llama.cpp b/llama.cpp index 5fed1c80578e4..f3db4dc21c11b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1228,8 +1228,8 @@ struct llama_model { llama_hparams hparams = {}; llama_vocab vocab; - struct ggml_tensor * tok_embeddings; - struct ggml_tensor * pos_embeddings; + struct ggml_tensor * tok_embd; + struct ggml_tensor * pos_embd; struct ggml_tensor * tok_norm; struct ggml_tensor * tok_norm_b; @@ -2484,7 +2484,7 @@ static void llm_load_tensors( case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -2552,7 +2552,7 @@ static void llm_load_tensors( } break; case LLM_ARCH_BAICHUAN: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); { ggml_backend_type backend_norm; ggml_backend_type backend_output; @@ -2620,7 +2620,7 @@ static void llm_load_tensors( { // TODO: CPU-only for now - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -2696,8 +2696,8 @@ static void llm_load_tensors( } break; case LLM_ARCH_STARCODER: { - model.tok_embeddings = ml.create_tensor(ctx, 
tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); // output { @@ -2775,7 +2775,7 @@ static void llm_load_tensors( } break; case LLM_ARCH_PERSIMMON: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); { ggml_backend_type backend_norm; @@ -2838,9 +2838,9 @@ static void llm_load_tensors( { // TODO: CPU-only for now - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); - model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); + model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); // output { @@ -2918,7 +2918,7 @@ static void llm_load_tensors( } break; case LLM_ARCH_MPT: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -3099,6 +3099,31 @@ enum llm_rope_type { LLM_ROPE_GLM, }; +static struct ggml_tensor * llm_build_inp_embd( + struct ggml_context * ctx, + const llama_batch & batch, + struct ggml_tensor * tok_embd, + int64_t n_embd, + int32_t n_tokens, + const llm_build_cb & cb) { + struct ggml_tensor * inpL; + + if (batch.token) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens); + cb(inp_tokens, "inp_tokens", -1); + + inpL = ggml_get_rows(ctx, tok_embd, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + + inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); + } + + return inpL; +} + // Persimmon: n_rot = n_embd_head/2 // Other: n_rot = n_embd_head static void llm_build_k_shift( @@ -3463,18 +3488,7 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions @@ -3619,18 +3633,7 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, 
n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions @@ -3789,18 +3792,7 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions @@ -3953,23 +3945,11 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * cur; - struct ggml_tensor * embd; struct ggml_tensor * pos; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } - cb(embd, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); + cb(inpL, "inp_embd", -1); // inp_pos - contains the positions struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); @@ -3983,10 +3963,10 @@ static struct ggml_cgraph * llm_build_starcoder( struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); - pos = ggml_get_rows(ctx0, model.pos_embeddings, inp_pos); + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); - inpL = ggml_add(ctx0, embd, pos); + inpL = ggml_add(ctx0, inpL, pos); cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { @@ -4108,14 +4088,7 @@ static struct ggml_cgraph * llm_build_persimmon( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "imp_embd", -1); struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); @@ -4358,18 +4331,7 @@ static struct ggml_cgraph * llm_build_refact( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "inp_embd", -1); // 
KQ_scale @@ -4499,22 +4461,10 @@ static struct ggml_cgraph * llm_build_bloom( ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * cur; - struct ggml_tensor * embd; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } - cb(embd, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); + cb(inpL, "inp_embd", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); @@ -4524,7 +4474,7 @@ static struct ggml_cgraph * llm_build_bloom( struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); - inpL = llm_build_norm(ctx0, embd, + inpL = llm_build_norm(ctx0, inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, norm_eps, cb, -1); @@ -4648,18 +4598,7 @@ static struct ggml_cgraph * llm_build_mpt( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "inp_embd", -1); // KQ_scale
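// [editor note] llm_build_inp_embd (added in PATCH 18/18 above) picks the input source once:
// if the batch carries token ids, the embedding matrix is indexed via ggml_get_rows; otherwise
// a raw float tensor is created and later filled with precomputed embeddings. The standalone
// sketch below mirrors only the row-gather half with plain std::vector so the shape logic is
// easy to follow; gather_rows is an illustrative helper invented here, not a ggml or llama.cpp
// function.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// tok_embd is row-major [n_vocab x n_embd]; the result is [n_tokens x n_embd]
static std::vector<float> gather_rows(const std::vector<float> & tok_embd, int64_t n_embd,
                                      const std::vector<int32_t> & tokens) {
    std::vector<float> out(tokens.size() * n_embd);
    for (size_t i = 0; i < tokens.size(); ++i) {
        std::copy_n(tok_embd.begin() + tokens[i] * n_embd, n_embd, out.begin() + i * n_embd);
    }
    return out;
}

int main() {
    const int64_t n_embd = 4, n_vocab = 8;

    std::vector<float> tok_embd(n_vocab * n_embd);
    for (size_t i = 0; i < tok_embd.size(); ++i) {
        tok_embd[i] = (float) i;
    }

    // "batch.token" path: one embedding row per token id
    const std::vector<int32_t> tokens = { 2, 5, 5 };
    const auto inpL = gather_rows(tok_embd, n_embd, tokens);
    std::printf("inpL is %zu x %lld\n", tokens.size(), (long long) n_embd);

    // the "batch.embd" path would instead copy n_tokens*n_embd floats straight into inpL,
    // which is what the helper's else-branch allocates room for
}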