From 7db9c96d8a5f685102eaee72a4b6a3f4f6486fb3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 15:39:58 +0200 Subject: [PATCH 01/18] llama : add llm_build_norm helper function ggml-ci --- llama.cpp | 435 ++++++++++++++++++++++-------------------------------- 1 file changed, 176 insertions(+), 259 deletions(-) diff --git a/llama.cpp b/llama.cpp index bad25de4b84a1..c236277d437cc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -972,7 +972,7 @@ struct llama_mlock { typedef void (*offload_func_t)(struct ggml_tensor * tensor); -static void ggml_offload_nop(struct ggml_tensor * tensor) { // don't offload by default +static void ggml_offload_nop(struct ggml_tensor * tensor) { (void) tensor; } @@ -3093,6 +3093,42 @@ static bool llama_model_load( using llm_build_cb = std::function; +enum llm_norm_type { + LLM_NORM, + LLM_NORM_RMS, +}; + +static struct ggml_tensor * llm_build_norm( + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * mw, + struct ggml_tensor * mb, + llm_norm_type type, + float eps, + const llm_build_cb & cb, + int il) { + switch (type) { + case LLM_NORM: cur = ggml_norm (ctx, cur, eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, eps); break; + }; + if (mw || mb) { + cb(cur, "norm", il); + } + + if (mw) { + cur = ggml_mul(ctx, cur, mw); + if (mb) { + cb(cur, "norm_w", il); + } + } + + if (mb) { + cur = ggml_add(ctx, cur, mb); + } + + return cur; +} + static struct ggml_cgraph * llm_build_llama( llama_context & lctx, const llama_batch & batch, @@ -3192,14 +3228,11 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * inpSA = inpL; // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - cb(cur, "rms_norm_0", il); - - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0", il); - } + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "attn_norm", il); // self-attention { @@ -3307,15 +3340,11 @@ static struct ggml_cgraph * llm_build_llama( // feed-forward network { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - cb(cur, "rms_norm_1", il); - - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm", il); - } + cur = llm_build_norm(ctx0, inpFF, + model.layers[il].ffn_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "ffn_norm", il); struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, @@ -3349,15 +3378,11 @@ static struct ggml_cgraph * llm_build_llama( cur = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - cb(cur, "rms_norm_2", -1); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, -1); + cb(cur, "result_norm", -1); // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); @@ -3466,15 +3491,11 @@ static struct ggml_cgraph * llm_build_baichaun( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - cb(cur, "rms_norm_0", il); - - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0", il); - } + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "attn_norm", il); // self-attention { @@ -3600,15 
+3621,11 @@ static struct ggml_cgraph * llm_build_baichaun( // feed-forward network { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - cb(cur, "rms_norm_1", il); - - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm", il); - } + cur = llm_build_norm(ctx0, inpFF, + model.layers[il].ffn_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "ffn_norm", il); struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, @@ -3763,27 +3780,21 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * attn_norm; // self-attention - // TODO: refactor into common function (shared with LLaMA) { - attn_norm = ggml_norm(ctx0, inpL, norm_eps); - cb(attn_norm, "attn_norm_0", il); - - attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); - cb(attn_norm, "attn_norm_0_w", il); - - attn_norm = ggml_add(ctx0, attn_norm, model.layers[il].attn_norm_b); - cb(attn_norm, "attn_norm_0_wb", il); - - if (model.layers[il].attn_norm_2) { // Falcon-40B - cur = ggml_norm(ctx0, inpL, norm_eps); + attn_norm = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(attn_norm, "attn_norm", il); + + if (model.layers[il].attn_norm_2) { + // Falcon-40B + cur = llm_build_norm(ctx0, attn_norm, + model.layers[il].attn_norm_2, + model.layers[il].attn_norm_2_b, + LLM_NORM, norm_eps, cb, il); cb(cur, "attn_norm_2", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm_2); - cb(cur, "attn_norm_2_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_2_b); - cb(cur, "attn_norm_2_wb", il); - } else { // Falcon 7B + } else { cur = attn_norm; } @@ -3925,16 +3936,11 @@ static struct ggml_cgraph * llm_build_falcon( cur = inpL; // norm - { - cur = ggml_norm(ctx0, cur, norm_eps); - cb(cur, "out_norm_0", -1); - - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "out_norm_0_w", -1); - - cur = ggml_add(ctx0, cur, model.output_norm_b); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -4024,17 +4030,11 @@ static struct ggml_cgraph * llm_build_starcoder( cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { - { - // Norm - cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "attn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - cb(cur, "attn_norm_0_wb", il); - } + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "attn_norm", il); { // Self Attention @@ -4130,17 +4130,11 @@ static struct ggml_cgraph * llm_build_starcoder( // FF { - // Norm - { - cur = ggml_norm(ctx0, inpFF, norm_eps); - cb(cur, "ffn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); - cb(cur, "ffn_norm_0_wb", il); - } + cur = llm_build_norm(ctx0, inpFF, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "ffn_norm", il); cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); cb(cur, "result_w3", il); @@ -4161,17 +4155,11 @@ static struct ggml_cgraph * llm_build_starcoder( } - // 
Output Norm - { - cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "out_norm_0", -1); - - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "out_norm_0_w", -1); - - cur = ggml_add(ctx0, cur, model.output_norm_b); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -4206,7 +4194,7 @@ static struct ggml_cgraph * llm_build_persimmon( const float freq_base = cparams.rope_freq_base; const float freq_scale = cparams.rope_freq_scale; - const float norm_eps = hparams.f_norm_eps; + const float norm_eps = hparams.f_norm_eps; const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = worst_case ? n_ctx : kv_self.n; @@ -4271,16 +4259,11 @@ static struct ggml_cgraph * llm_build_persimmon( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * residual = inpL; - { - cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "attn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - cb(cur, "attn_norm_0_wb", il); - } + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "attn_norm", il); // self attention { @@ -4316,22 +4299,16 @@ static struct ggml_cgraph * llm_build_persimmon( cb(tmpk, "tmpk", il); // Q/K Layernorm - tmpq = ggml_norm(ctx0, tmpq, norm_eps); - cb(tmpq, "tmpq", il); - - tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); + tmpq = llm_build_norm(ctx0, tmpq, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, norm_eps, cb, il); cb(tmpq, "tmpq", il); - tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); - cb(tmpq, "tmpq", il); - - tmpk = ggml_norm(ctx0, tmpk, norm_eps); - cb(tmpk, "tmpk", il); - - tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); - cb(tmpk, "tmpk", il); - - tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); + tmpk = llm_build_norm(ctx0, tmpk, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, norm_eps, cb, il); cb(tmpk, "tmpk", il); // RoPE the first n_rot of q/k, pass the other half, and concat. 
@@ -4480,17 +4457,11 @@ static struct ggml_cgraph * llm_build_persimmon( { // MLP - { - // Norm - cur = ggml_norm(ctx0, inpFF, norm_eps); - cb(cur, "ffn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); - cb(cur, "ffn_norm_0_wb", il); - } + cur = llm_build_norm(ctx0, inpFF, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "ffn_norm", il); cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); cb(cur, "result_w3", il); @@ -4519,16 +4490,11 @@ static struct ggml_cgraph * llm_build_persimmon( cur = inpL; - { - cur = ggml_norm(ctx0, cur, norm_eps); - cb(cur, "out_norm_0", -1); - - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "out_norm_0_w", -1); - - cur = ggml_add(ctx0, cur, model.output_norm_b); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -4609,15 +4575,11 @@ static struct ggml_cgraph * llm_build_refact( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - cb(cur, "rms_norm_0", il); - - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0", il); - } + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "attn_norm", il); // self-attention { @@ -4719,15 +4681,11 @@ static struct ggml_cgraph * llm_build_refact( // feed-forward network { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - cb(cur, "rms_norm_1", il); - - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm", il); - } + cur = llm_build_norm(ctx0, inpFF, + model.layers[il].ffn_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "ffn_norm", il); struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, @@ -4761,15 +4719,11 @@ static struct ggml_cgraph * llm_build_refact( cur = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - cb(cur, "rms_norm_2", -1); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, -1); + cb(cur, "result_norm", -1); // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); @@ -4851,30 +4805,18 @@ static struct ggml_cgraph * llm_build_bloom( struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); - // norm - { - inpL = ggml_norm(ctx0, embd, norm_eps); - cb(inpL, "inp_norm", -1); - - inpL = ggml_mul(ctx0, inpL, model.tok_norm); - cb(inpL, "inp_norm_w", -1); - - inpL = ggml_add (ctx0, inpL, model.tok_norm_b); - cb(inpL, "inp_norm_wb", -1); - } + inpL = llm_build_norm(ctx0, embd, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(inpL, "inp_norm", -1); for (int il = 0; il < n_layer; ++il) { - { - // Norm - cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "attn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - cb(cur, "attn_norm_0_wb", il); - } + cur = 
llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "attn_norm", il); { // Self Attention @@ -4984,17 +4926,11 @@ static struct ggml_cgraph * llm_build_bloom( // FF { - // Norm - { - cur = ggml_norm(ctx0, inpFF, norm_eps); - cb(cur, "ffn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); - cb(cur, "ffn_norm_0_wb", il); - } + cur = llm_build_norm(ctx0, inpFF, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "ffn_norm", il); cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); cb(cur, "result_w3", il); @@ -5016,17 +4952,11 @@ static struct ggml_cgraph * llm_build_bloom( cb(inpL, "inpFF_+_result_w2", il); } - // Output Norm - { - cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "out_norm_0", -1); - - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "out_norm_0_w", -1); - - cur = ggml_add(ctx0, cur, model.output_norm_b); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -5109,18 +5039,15 @@ static struct ggml_cgraph * llm_build_mpt( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; + attn_norm = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + NULL, + LLM_NORM, norm_eps, cb, il); + cb(attn_norm, "attn_norm", il); + // self-attention - // TODO: refactor into common function (shared with LLaMA) { - attn_norm = ggml_norm(ctx0, inpL, norm_eps); - cb(attn_norm, "attn_norm_0", il); - - attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); - cb(attn_norm, "attn_norm_0_w", il); - - if (1) { - cur = attn_norm; - } + cur = attn_norm; // compute QKV @@ -5230,14 +5157,11 @@ static struct ggml_cgraph * llm_build_mpt( // feed forward { - // Norm - { - cur = ggml_norm(ctx0, attn_out, norm_eps); - cb(cur, "ffn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm_0_w", il); - } + cur = llm_build_norm(ctx0, attn_out, + model.layers[il].ffn_norm, + NULL, + LLM_NORM, norm_eps, cb, il); + cb(cur, "ffn_norm", il); cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); cb(cur, "result_w3", il); @@ -5258,14 +5182,11 @@ static struct ggml_cgraph * llm_build_mpt( cur = inpL; - // norm - { - cur = ggml_norm(ctx0, cur, norm_eps); - cb(cur, "out_norm_0", -1); - - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + NULL, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -5378,15 +5299,12 @@ static const std::unordered_map k_offload_map { "inp_norm_w", OFFLOAD_FUNC_NR }, { "inp_norm_wb", OFFLOAD_FUNC_NR }, - { "rms_norm_0", OFFLOAD_FUNC }, - - { "attn_norm_0", OFFLOAD_FUNC }, - { "attn_norm_0_w", OFFLOAD_FUNC }, - { "attn_norm_0_wb", OFFLOAD_FUNC }, + { "norm", OFFLOAD_FUNC }, + { "norm_w", OFFLOAD_FUNC }, + { "norm_wb", OFFLOAD_FUNC }, + { "attn_norm", OFFLOAD_FUNC }, { "attn_norm_2", OFFLOAD_FUNC }, - { "attn_norm_2_w", OFFLOAD_FUNC }, - { "attn_norm_2_wb", OFFLOAD_FUNC }, { "wqkv", OFFLOAD_FUNC_KQ }, { "bqkv", OFFLOAD_FUNC_KQ }, @@ -5614,20 +5532,19 @@ static struct ggml_cgraph * llama_build_graph( static const std::unordered_map> 
k_offload_func_name = { { OFFLOAD_FUNC_NOP, "CPU" }, + { OFFLOAD_FUNC_OUT, "CPU" }, #ifdef GGML_USE_CUBLAS { OFFLOAD_FUNC, "GPU (CUDA)" }, { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" }, { OFFLOAD_FUNC_V, "GPU (CUDA) V" }, { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, - { OFFLOAD_FUNC_OUT, "GPU (CUDA) OUT" }, #else { OFFLOAD_FUNC, "CPU" }, { OFFLOAD_FUNC_KQ, "CPU" }, { OFFLOAD_FUNC_V, "CPU" }, { OFFLOAD_FUNC_NR, "CPU" }, { OFFLOAD_FUNC_EMB, "CPU" }, - { OFFLOAD_FUNC_OUT, "CPU" }, #endif // GGML_USE_CUBLAS }; From dbf836bb641c499ad898d8b90d71b7b7aee72d61 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 18:47:46 +0200 Subject: [PATCH 02/18] llama : add llm_build_ffn helper function (#3849) ggml-ci --- llama.cpp | 376 +++++++++++++++++++++++++++++------------------------- 1 file changed, 200 insertions(+), 176 deletions(-) diff --git a/llama.cpp b/llama.cpp index c236277d437cc..cf3ee494af465 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1116,13 +1116,13 @@ struct llama_layer { struct ggml_tensor * ffn_norm_b; // ff - struct ggml_tensor * w1; // ffn_gate - struct ggml_tensor * w2; // ffn_down - struct ggml_tensor * w3; // ffn_up + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 // ff bias - struct ggml_tensor * b2; // ffn_down - struct ggml_tensor * b3; // ffn_up + struct ggml_tensor * ffn_down_b; // b2 + struct ggml_tensor * ffn_up_b; // b3 }; struct llama_kv_cell { @@ -2538,15 +2538,15 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; @@ -2604,15 +2604,15 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = 
ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; @@ -2683,14 +2683,14 @@ static void llm_load_tensors( layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; @@ -2756,11 +2756,11 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2768,8 +2768,8 @@ static void llm_load_tensors( ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) + - ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3); + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) + + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b); } } } break; @@ -2816,22 +2816,22 @@ static void llm_load_tensors( const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); - layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); + layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend); - layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); + layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); } } break; case LLM_ARCH_BLOOM: @@ -2899,11 +2899,11 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 
backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2911,8 +2911,8 @@ static void llm_load_tensors( ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + - ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2); + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) + + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b); } } } break; @@ -2969,8 +2969,8 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2978,8 +2978,8 @@ static void llm_load_tensors( ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w2) + - ggml_nbytes(layer.w3); + ggml_nbytes(layer.ffn_down) + + ggml_nbytes(layer.ffn_up); } } } break; @@ -3129,6 +3129,107 @@ static struct ggml_tensor * llm_build_norm( return cur; } +enum llm_ffn_op_type { + LLM_FFN_SILU, + LLM_FFN_GELU, + LLM_FFN_RELU, + LLM_FFN_RELU_SQR, +}; + +enum llm_ffn_gate_type { + LLM_FFN_SEQ, + LLM_FFN_PAR, // ffn_gate is parallel to ffn_up +}; + +static struct ggml_tensor * llm_build_ffn( + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * up, + struct ggml_tensor * up_b, + struct ggml_tensor * gate, + struct ggml_tensor * gate_b, + struct ggml_tensor * down, + struct ggml_tensor * down_b, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, + const llm_build_cb & cb, + int il) { + struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur); + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (gate) { + switch (type_gate) { + case LLM_FFN_SEQ: + { + cur = ggml_mul_mat(ctx, gate, tmp); + cb(cur, "ffn_gate", il); + + if (gate_b) { + cur = ggml_add(ctx, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + } break; + case LLM_FFN_PAR: + { + cur = ggml_mul_mat(ctx, gate, cur); + cb(cur, "ffn_gate", il); + + if (gate_b) { + cur = ggml_add(ctx, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + } break; + }; + } + + switch (type_op) { + case LLM_FFN_SILU: + { + cur = ggml_silu(ctx, cur); + cb(cur, "ffn_silu", il); + } break; + case LLM_FFN_GELU: + { + cur = ggml_gelu(ctx, cur); + cb(cur, "ffn_gelu", il); + } 
break; + case LLM_FFN_RELU: + { + cur = ggml_relu(ctx, cur); + cb(cur, "ffn_relu", il); + } break; + case LLM_FFN_RELU_SQR: + { + cur = ggml_relu(ctx, cur); + cb(cur, "ffn_relu", il); + + cur = ggml_sqr(ctx, cur); + cb(cur, "ffn_sqr(relu)", il); + } break; + }; + + if (type_gate == LLM_FFN_PAR) { + cur = ggml_mul(ctx, cur, tmp); + cb(cur, "ffn_gate_par", il); + } + + cur = ggml_mul_mat(ctx, down, cur); + if (down_b) { + cb(cur, "ffn_down", il); + } + + if (down_b) { + cur = ggml_add(ctx, cur, down_b); + } + + return cur; +} + static struct ggml_cgraph * llm_build_llama( llama_context & lctx, const llama_batch & batch, @@ -3346,27 +3447,12 @@ static struct ggml_cgraph * llm_build_llama( LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - cb(tmp, "result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - cb(cur, "result_w1", il); - - // SILU activation - cur = ggml_silu(ctx0, cur); - cb(cur, "silu", il); - - cur = ggml_mul(ctx0, cur, tmp); - cb(cur, "silu_x_result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - cb(cur, "result_w2", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_result", il); } cur = ggml_add(ctx0, cur, inpFF); @@ -3627,27 +3713,12 @@ static struct ggml_cgraph * llm_build_baichaun( LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - cb(tmp, "result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - cb(cur, "result_w1", il); - - // SILU activation - cur = ggml_silu(ctx0, cur); - cb(cur, "silu", il); - - cur = ggml_mul(ctx0, cur, tmp); - cb(cur, "silu_x_result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - cb(cur, "result_w2", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_result", il); } cur = ggml_add(ctx0, cur, inpFF); @@ -3911,16 +3982,12 @@ static struct ggml_cgraph * llm_build_falcon( // feed forward { - struct ggml_tensor * inpFF = attn_norm; - - cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); - cb(cur, "result_w3", il); - - cur = ggml_gelu(ctx0, cur); - cb(cur, "gelu", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2", il); + cur = llm_build_ffn(ctx0, attn_norm, // !! 
use the attn norm, not the result + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_result", il); } cur = ggml_add(ctx0, cur, attn_out); @@ -4136,19 +4203,12 @@ static struct ggml_cgraph * llm_build_starcoder( LLM_NORM, norm_eps, cb, il); cb(cur, "ffn_norm", il); - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); - cb(cur, "result_w3", il); - - // GELU activation - cur = ggml_gelu(ctx0, cur); - cb(cur, "gelu", il); - - // Projection - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2", il); - - cur = ggml_add(ctx0, cur, model.layers[il].b2); - cb(cur, "result_w2_b", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_result", il); } inpL = ggml_add(ctx0, cur, inpFF); @@ -4455,31 +4515,20 @@ static struct ggml_cgraph * llm_build_persimmon( struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur); cb(inpFF, "inpFF", il); + // feed-forward network { - // MLP cur = llm_build_norm(ctx0, inpFF, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, norm_eps, cb, il); cb(cur, "ffn_norm", il); - cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - cb(cur, "result_w3", il); - - cur = ggml_add(ctx0, cur, model.layers[il].b3); - cb(cur, "result_w3_b", il); - - cur = ggml_relu(ctx0, cur); - cb(cur, "relu", il); - - cur = ggml_sqr(ctx0, cur); - cb(cur, "sqr(relu)", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2", il); - - cur = ggml_add(ctx0, cur, model.layers[il].b2); - cb(cur, "result_w2_b", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_result", il); } cur = ggml_add(ctx0, cur, inpFF); @@ -4687,27 +4736,12 @@ static struct ggml_cgraph * llm_build_refact( LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - cb(tmp, "result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - cb(cur, "result_w1", il); - - // SILU activation - cur = ggml_silu(ctx0, cur); - cb(cur, "silu", il); - - cur = ggml_mul(ctx0, cur, tmp); - cb(cur, "silu_x_result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - cb(cur, "result_w2", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_result", il); } cur = ggml_add(ctx0, cur, inpFF); @@ -4932,20 +4966,12 @@ static struct ggml_cgraph * llm_build_bloom( LLM_NORM, norm_eps, cb, il); cb(cur, "ffn_norm", il); - cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - cb(cur, "result_w3", il); - - cur = ggml_add(ctx0, cur, model.layers[il].b3); - cb(cur, "result_w3_b", il); - - cur = ggml_gelu(ctx0, cur); - cb(cur, "gelu", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2", il); - - cur = ggml_add(ctx0, cur, model.layers[il].b2); - cb(cur, "result_w2_b", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, 
"ffn_result", il); } inpL = ggml_add(ctx0, cur, inpFF); @@ -5163,14 +5189,12 @@ static struct ggml_cgraph * llm_build_mpt( LLM_NORM, norm_eps, cb, il); cb(cur, "ffn_norm", il); - cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - cb(cur, "result_w3", il); - - cur = ggml_gelu(ctx0, cur); - cb(cur, "gelu", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_result", il); } cur = ggml_add(ctx0, cur, attn_out); From 38728a0be0efcb603726fc78a1356bdc3aec910b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 19:22:54 +0200 Subject: [PATCH 03/18] llama : add llm_build_k_shift helper ggml-ci --- llama.cpp | 130 +++++++++++++++++++++++++++--------------------------- 1 file changed, 66 insertions(+), 64 deletions(-) diff --git a/llama.cpp b/llama.cpp index cf3ee494af465..b746cc14f2d2f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3230,6 +3230,65 @@ static struct ggml_tensor * llm_build_ffn( return cur; } +enum llm_rope_type { + LLM_ROPE, + LLM_ROPE_NEOX, + LLM_ROPE_GLM, +}; + +// Persimmon: n_rot = n_embd_head/2 +// Other: n_rot = n_embd_head +static void llm_build_k_shift( + const llama_context & lctx, + struct ggml_context * ctx, + struct ggml_cgraph * graph, + int64_t n_rot, + llm_rope_type type, + const llm_build_cb & cb) { + const auto & model = lctx.model; + const auto & kv_self = lctx.kv_self; + const auto & cparams = lctx.cparams; + + const auto & hparams = model.hparams; + + const int64_t n_head = hparams.n_head; + const int64_t n_layer = hparams.n_layer; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int64_t n_embd_head = hparams.n_embd_head(); + + const int64_t n_ctx = lctx.cparams.n_ctx; + + const float freq_base = cparams.rope_freq_base; + const float freq_scale = cparams.rope_freq_scale; + + GGML_ASSERT(n_embd_head % n_rot == 0); + + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); + cb(K_shift, "K_shift", -1); + + int rope_type = 0; + + switch (type) { + case LLM_ROPE: rope_type = 0; break; + case LLM_ROPE_NEOX: rope_type = 2; break; + case LLM_ROPE_GLM: rope_type = 4; break; + }; + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * tmp = + // we rotate only the first n_rot dimensions + ggml_rope_custom_inplace(ctx, + ggml_view_3d(ctx, kv_self.k, + n_rot, n_head, n_ctx, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + K_shift, n_rot, rope_type, 0, freq_base, freq_scale); + cb(tmp, "K_shifted", il); + ggml_build_forward_expand(graph, tmp); + } +} + static struct ggml_cgraph * llm_build_llama( llama_context & lctx, const llama_batch & batch, @@ -3308,21 +3367,7 @@ static struct ggml_cgraph * llm_build_llama( // shift the entire K-cache if needed if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(gf, tmp); - } + 
llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE, cb); } for (int il = 0; il < n_layer; ++il) { @@ -3557,21 +3602,7 @@ static struct ggml_cgraph * llm_build_baichaun( // shift the entire K-cache if needed if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(gf, tmp); - } + llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE, cb); } for (int il = 0; il < n_layer; ++il) { @@ -3830,21 +3861,7 @@ static struct ggml_cgraph * llm_build_falcon( // shift the entire K-cache if needed if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 2, 0, freq_base, freq_scale); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(gf, tmp); - } + llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE_NEOX, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4243,6 +4260,7 @@ static struct ggml_cgraph * llm_build_persimmon( GGML_ASSERT(!!kv_self.ctx); const auto & cparams = lctx.cparams; + const int64_t n_embd = hparams.n_embd; const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; @@ -4250,7 +4268,7 @@ static struct ggml_cgraph * llm_build_persimmon( const int64_t n_head = hparams.n_head; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); - const size_t n_rot = n_embd_head / 2; + const int64_t n_rot = n_embd_head / 2; const float freq_base = cparams.rope_freq_base; const float freq_scale = cparams.rope_freq_scale; @@ -4297,23 +4315,7 @@ static struct ggml_cgraph * llm_build_persimmon( cb(KQ_mask, "KQ_mask", -1); if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - // we rotate only the first n_rot dimensions. 
- ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_rot, n_head, n_ctx, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il) - ), - K_shift, n_rot, 2, 0, freq_base, freq_scale); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(gf, tmp); - } + llm_build_k_shift(lctx, ctx0, gf, n_rot, LLM_ROPE_NEOX, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5534,7 +5536,7 @@ static struct ggml_cgraph * llama_build_graph( #ifdef GGML_USE_CUBLAS const bool do_offload = true; #else - const bool do_offload = false; + const bool do_offload = true; // TODO: set to false after finishing refactoring #endif if (!do_offload) { From 909d64471bb4cc26ba1a3e7db8361f4796d874d6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 19:45:27 +0200 Subject: [PATCH 04/18] llama : fix offloading after recent changes --- llama.cpp | 52 ++++++++++++++++++++++------------------------------ 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/llama.cpp b/llama.cpp index b746cc14f2d2f..92ef6e4ea5208 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3185,6 +3185,8 @@ static struct ggml_tensor * llm_build_ffn( } } break; }; + } else { + cur = tmp; } switch (type_op) { @@ -3761,15 +3763,11 @@ static struct ggml_cgraph * llm_build_baichaun( cur = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - cb(cur, "rms_norm_2", -1); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + NULL, + LLM_NORM_RMS, norm_rms_eps, cb, -1); + cb(cur, "result_norm", -1); // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); @@ -5374,31 +5372,25 @@ static const std::unordered_map k_offload_map { "inpFF", OFFLOAD_FUNC }, - { "rms_norm_1", OFFLOAD_FUNC }, { "ffn_norm", OFFLOAD_FUNC }, - { "ffn_norm_0", OFFLOAD_FUNC }, - { "ffn_norm_0_w", OFFLOAD_FUNC }, - { "ffn_norm_0_wb", OFFLOAD_FUNC }, - - { "result_w3", OFFLOAD_FUNC }, - { "result_w3_b", OFFLOAD_FUNC }, - { "result_w2", OFFLOAD_FUNC }, - { "result_w2_b", OFFLOAD_FUNC }, - { "result_w1", OFFLOAD_FUNC }, - - { "silu", OFFLOAD_FUNC }, - { "gelu", OFFLOAD_FUNC }, - { "relu", OFFLOAD_FUNC }, - { "sqr(relu)", OFFLOAD_FUNC }, - - { "silu_x_result_w3", OFFLOAD_FUNC }, + + { "ffn_up", OFFLOAD_FUNC }, + { "ffn_up_b", OFFLOAD_FUNC }, + { "ffn_gate", OFFLOAD_FUNC }, + { "ffn_gate_b", OFFLOAD_FUNC }, + { "ffn_gate_par", OFFLOAD_FUNC }, + { "ffn_down", OFFLOAD_FUNC }, + { "ffn_down_b", OFFLOAD_FUNC }, + { "ffn_result", OFFLOAD_FUNC }, + + { "ffn_silu", OFFLOAD_FUNC }, + { "ffn_gelu", OFFLOAD_FUNC }, + { "ffn_relu", OFFLOAD_FUNC }, + { "ffn_sqr(relu)", OFFLOAD_FUNC }, + { "inpFF_+_result_w2", OFFLOAD_FUNC }, { "inpL_+_inpFF_+_result_w2", OFFLOAD_FUNC }, - { "rms_norm_2", OFFLOAD_FUNC_NR }, - { "out_norm_0", OFFLOAD_FUNC_NR }, - { "out_norm_0_w", OFFLOAD_FUNC_NR }, - { "result_norm", OFFLOAD_FUNC_EMB }, { "result_output", OFFLOAD_FUNC_OUT }, }; From 3e0462594b2cb687eefd8099116e5c126691bf60 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 20:35:20 +0200 Subject: [PATCH 05/18] llama : add llm_build_kv_store helper ggml-ci --- llama.cpp | 362 ++++++++++++++++++------------------------------------ 1 file changed, 119 insertions(+), 243 deletions(-) diff --git a/llama.cpp b/llama.cpp index 92ef6e4ea5208..0d6c87f04e3e5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3291,6 +3291,44 @@ static void llm_build_k_shift( } } +static void 
llm_build_kv_store( + const llama_context & lctx, + struct ggml_context * ctx, + struct ggml_cgraph * graph, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + int32_t n_tokens, + int32_t kv_head, + const llm_build_cb & cb, + int64_t il) { + const auto & model = lctx.model; + const auto & kv_self = lctx.kv_self; + const auto & cparams = lctx.cparams; + + const auto & hparams = model.hparams; + + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + // compute the transposed [n_tokens, n_embd] V matrix + struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens)); + //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed + cb(v_cur_t, "v_cur_t", il); + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv_self.k, n_tokens*n_embd_gqa, + (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + cb(k_cache_view, "k_cache_view", il); + + struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv_self.v, n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + cb(v_cache_view, "v_cache_view", il); + + // important: storing RoPE-ed version of K in the KV cache! + ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); + ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view)); +} + static struct ggml_cgraph * llm_build_llama( llama_context & lctx, const llama_batch & batch, @@ -3385,40 +3423,22 @@ static struct ggml_cgraph * llm_build_llama( // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(tmpk, "tmpk", il); - - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(tmpq, "tmpq", il); - - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix - - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(tmpv, "tmpv", il); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); + Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + cb(Qcur, "Qcur", il); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); + Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 
0, 0, freq_base, freq_scale); + cb(Kcur, "Kcur", il); - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); cb(Q, "Q", il); @@ -3619,53 +3639,31 @@ static struct ggml_cgraph * llm_build_baichaun( // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(tmpk, "tmpk", il); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(tmpq, "tmpq", il); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - struct ggml_tensor * Kcur; - struct ggml_tensor * Qcur; switch (model.type) { case MODEL_7B: - Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); - Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); break; case MODEL_13B: - Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens); - Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); break; default: GGML_ASSERT(false); } - - cb(Kcur, "Kcur", il); - cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix - - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(tmpv, "tmpv", il); - - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); - - // important: storing RoPE-ed version of K in the KV cache! 
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); cb(Q, "Q", il); @@ -3865,14 +3863,14 @@ static struct ggml_cgraph * llm_build_falcon( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; + attn_norm = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(attn_norm, "attn_norm", il); + // self-attention { - attn_norm = llm_build_norm(ctx0, inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, norm_eps, cb, il); - cb(attn_norm, "attn_norm", il); - if (model.layers[il].attn_norm_2) { // Falcon-40B cur = llm_build_norm(ctx0, attn_norm, @@ -3885,7 +3883,6 @@ static struct ggml_cgraph * llm_build_falcon( } // compute QKV - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); @@ -3902,52 +3899,35 @@ static struct ggml_cgraph * llm_build_falcon( // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for // non-contiguous views is added for the rope operator - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d( + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d( ctx0, cur, n_embd_head, n_head, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), 0)); - cb(tmpq, "tmpq", il); + cb(Qcur, "Qcur", il); - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d( + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * n_head)); - cb(tmpk, "tmpk", il); + cb(Kcur, "Kcur", il); - struct ggml_tensor * tmpv = ggml_view_3d( + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv)); - cb(tmpv, "tmpv", il); + wsize * n_embd_head * (n_head + n_head_kv))); + cb(Vcur, "Vcur", il); // using mode = 2 for neox mode - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + Qcur = ggml_rope_custom(ctx0, Qcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + Kcur = ggml_rope_custom(ctx0, Kcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); cb(Kcur, "Kcur", il); - { - struct ggml_tensor * Vcur = ggml_cont(ctx0, tmpv); - cb(Vcur, "Vcur_0", il); - - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur_1", il); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); cb(Q, "Q", il); @@ 
-4118,40 +4098,25 @@ static struct ggml_cgraph * llm_build_starcoder( LLM_NORM, norm_eps, cb, il); cb(cur, "attn_norm", il); + // self-attention { - // Self Attention cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(tmpq, "tmpq", il); - cb(tmpk, "tmpk", il); - cb(tmpv, "tmpv", il); - - struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); - struct ggml_tensor * Kcur = tmpk; - - { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, tmpv); - cb(Vcur, "Vcur", il); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); cb(Q, "Q", il); @@ -4441,34 +4406,16 @@ static struct ggml_cgraph * llm_build_persimmon( Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); cb(Kcur, "Kcur", il); - { - struct ggml_tensor * tmpv = ggml_view_3d( - ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - ggml_element_size(tmpqkv_perm) * n_embd_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 + struct ggml_tensor * Vcur = ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, + ggml_element_size(tmpqkv_perm) * n_embd_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 ); - cb(tmpv, "tmpv", il); + cb(Vcur, "Vcur", il); - // store K, V in cache - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * k = ggml_view_1d( - ctx0, kv_self.k, n_tokens*n_embd_gqa, - (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head) - ); - cb(k, "k", il); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + 
kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); - - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, n_embd_head, n_kv, n_head_kv, ggml_element_size(kv_self.k)*n_embd_gqa, @@ -4632,40 +4579,22 @@ static struct ggml_cgraph * llm_build_refact( // self-attention { - // compute Q and K - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(tmpk, "tmpk", il); - - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(tmpq, "tmpq", il); - - struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix - - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(tmpv, "tmpv", il); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); cb(Q, "Q", il); @@ -4852,48 +4781,27 @@ static struct ggml_cgraph * llm_build_bloom( LLM_NORM, norm_eps, cb, il); cb(cur, "attn_norm", il); + // self-attention { - // Self Attention cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(tmpq, "tmpq", il); - cb(tmpk, "tmpk", il); - cb(tmpv, "tmpv", il); - - struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); - struct ggml_tensor * Kcur = tmpk; - - // store key and value to memory - { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, 
n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Qcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)), - 0, 2, 1, 3); + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); cb(Q, "Q", il); struct ggml_tensor * K = @@ -5075,8 +4983,6 @@ static struct ggml_cgraph * llm_build_mpt( { cur = attn_norm; - // compute QKV - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); @@ -5085,47 +4991,17 @@ static struct ggml_cgraph * llm_build_mpt( cb(cur, "wqkv_clamped", il); } - const size_t wsize = ggml_type_size(cur->type); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - struct ggml_tensor * Qcur = ggml_view_3d( - ctx0, cur, n_embd_head, n_head, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0); cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head); cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - struct ggml_tensor * tmpv = ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv)); - cb(tmpv, "tmpv", il); - - { - struct ggml_tensor * Vcur = ggml_cont(ctx0, tmpv); - cb(Vcur, "Vcur", il); - - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, 
n_tokens, kv_head, cb, il); struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); cb(Q, "Q", il); From 59908619386e8ce5f66fec5d76122a3b316e3861 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 21:11:20 +0200 Subject: [PATCH 06/18] llama : remove obsolete offload names --- llama.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 0d6c87f04e3e5..abc1cff167cd4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5213,12 +5213,9 @@ static const std::unordered_map k_offload_map { "tmpk", OFFLOAD_FUNC_KQ }, { "tmpq", OFFLOAD_FUNC_KQ }, { "tmpv", OFFLOAD_FUNC_V }, - { "tmpkqv", OFFLOAD_FUNC_KQ }, // ?? { "Kcur", OFFLOAD_FUNC_KQ }, { "Qcur", OFFLOAD_FUNC_KQ }, { "Vcur", OFFLOAD_FUNC_V }, - { "Vcur_0", OFFLOAD_FUNC_V }, - { "Vcur_1", OFFLOAD_FUNC_V }, { "krot", OFFLOAD_FUNC_KQ }, { "qrot", OFFLOAD_FUNC_KQ }, @@ -5227,9 +5224,6 @@ static const std::unordered_map k_offload_map { "krotated", OFFLOAD_FUNC_KQ }, { "qrotated", OFFLOAD_FUNC_KQ }, - { "k", OFFLOAD_FUNC_KQ }, - { "v", OFFLOAD_FUNC_V }, - { "Q", OFFLOAD_FUNC_KQ }, { "K", OFFLOAD_FUNC_KQ }, { "KQ", OFFLOAD_FUNC_KQ }, From 31a12f3d03c3ac493b8dc35e70b9a85d7fbd6135 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 21:17:46 +0200 Subject: [PATCH 07/18] llama : fix llm_build_k_shift to use n_head_kv instead of n_head --- llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index abc1cff167cd4..db5fe067c2911 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3253,8 +3253,8 @@ static void llm_build_k_shift( const auto & hparams = model.hparams; - const int64_t n_head = hparams.n_head; const int64_t n_layer = hparams.n_layer; + const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_gqa = hparams.n_embd_gqa(); const int64_t n_embd_head = hparams.n_embd_head(); @@ -3281,7 +3281,7 @@ static void llm_build_k_shift( // we rotate only the first n_rot dimensions ggml_rope_custom_inplace(ctx, ggml_view_3d(ctx, kv_self.k, - n_rot, n_head, n_ctx, + n_rot, n_head_kv, n_ctx, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), From a104abea4884ae6d84eb4f861c715ea94deca394 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 21:24:25 +0200 Subject: [PATCH 08/18] llama : simplify falcon Q, K, V computation --- llama.cpp | 37 +++++++------------------------------ 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/llama.cpp b/llama.cpp index db5fe067c2911..9ab85c90355e2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3886,40 +3886,17 @@ static struct ggml_cgraph * llm_build_falcon( cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - // Note that the strides for Kcur, Vcur are set up so that the - // resulting views are misaligned with the tensor's storage - // (by applying the K/V offset we shift the tensor's original - // view to stick out behind the viewed QKV tensor's allocated - // memory, so to say). This is ok because no actual accesses - // happen to that out-of-range memory, but it can require some - // trickery when trying to accurately dump these views for - // debugging. 
- - const size_t wsize = ggml_type_size(cur->type); - - // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for - // non-contiguous views is added for the rope operator - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0)); - cb(Qcur, "Qcur", il); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head)); + cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv))); cb(Vcur, "Vcur", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + // using mode = 2 for neox mode Qcur = ggml_rope_custom(ctx0, Qcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); cb(Qcur, "Qcur", il); From c9121fdd0fda14067349ba3bcea10e696537723f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 21:44:19 +0200 Subject: [PATCH 09/18] llama : remove obsolete comments in build graphs --- llama.cpp | 39 +-------------------------------------- 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/llama.cpp b/llama.cpp index 9ab85c90355e2..e7862e8f528c1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3638,7 +3638,6 @@ static struct ggml_cgraph * llm_build_baichaun( // self-attention { - // compute Q and K and RoPE them struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); @@ -3676,12 +3675,9 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); cb(K, "K", il); - // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); cb(KQ, "KQ", il); - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); cb(KQ_scaled, "KQ_scaled", il); @@ -3694,7 +3690,7 @@ static struct ggml_cgraph * llm_build_baichaun( break; case MODEL_13B: // TODO: replace with ggml_add() - KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); + KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); // TODO: n_head or n_head_kv cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); break; @@ -3702,11 +3698,9 @@ static struct ggml_cgraph * llm_build_baichaun( GGML_ASSERT(false); } - // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); cb(KQ_soft_max, "KQ_soft_max", il); - // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, n_kv, n_embd_head, n_head_kv, @@ -3718,15 +3712,12 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, 
KQ_soft_max); cb(KQV, "KQV", il); - // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); cb(KQV_merged, "KQV_merged", il); - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); cb(cur, "KQV_merged_contiguous", il); - // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); @@ -3882,7 +3873,6 @@ static struct ggml_cgraph * llm_build_falcon( cur = attn_norm; } - // compute QKV cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); @@ -4106,24 +4096,18 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); cb(K, "K", il); - // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); cb(KQ, "KQ", il); - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); cb(KQ_scaled, "KQ_scaled", il); - // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); cb(KQ_masked, "KQ_masked", il); - // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); cb(KQ_soft_max, "KQ_soft_max", il); - // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, n_kv, n_embd_head, n_head_kv, @@ -4142,7 +4126,6 @@ static struct ggml_cgraph * llm_build_starcoder( cb(cur, "KQV_merged_contiguous", il); } - // Projection cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); cb(cur, "result_wo", il); @@ -4506,8 +4489,6 @@ static struct ggml_cgraph * llm_build_refact( const int32_t n_kv = worst_case ? n_ctx : kv_self.n; const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; - // printf("n_kv = %d\n", n_kv); - auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { @@ -4584,27 +4565,21 @@ static struct ggml_cgraph * llm_build_refact( ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); cb(K, "K", il); - // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); cb(KQ, "KQ", il); - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_kv, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); cb(KQ_scaled, "KQ_scaled", il); - // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); cb(KQ_masked, "KQ_masked", il); - // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); cb(KQ_soft_max, "KQ_soft_max", il); - // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, n_kv, n_embd_head, n_head_kv, @@ -4616,15 +4591,12 @@ static struct ggml_cgraph * llm_build_refact( struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); cb(KQV, "KQV", il); - // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); cb(KQV_merged, "KQV_merged", il); - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); cb(cur, "KQV_merged_contiguous", il); - // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); @@ -4789,27 +4761,21 @@ static struct ggml_cgraph * llm_build_bloom( ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); cb(K, "K", il); - // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); cb(KQ, "KQ", il); - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); cb(KQ_scaled, "KQ_scaled", il); struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8); cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); - // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); cb(KQ_masked, "KQ_masked", il); - // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); cb(KQ_soft_max, "KQ_soft_max", il); - // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, n_kv, n_embd_head, n_head_kv, @@ -4821,16 +4787,13 @@ static struct ggml_cgraph * llm_build_bloom( struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); cb(KQV, "KQV", il); - // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); cb(KQV_merged, "KQV_merged", il); - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); cb(cur, "KQV_merged_contiguous", il); } - // Projection cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); cb(cur, "result_wo", il); From f39e6075cf40f6af5befeb43049962af6e6a7c0e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 22:26:36 +0200 Subject: [PATCH 10/18] llama : add llm_build_kqv helper ggml-ci --- llama.cpp | 686 +++++++++++++++++------------------------------------- 1 file changed, 209 insertions(+), 477 deletions(-) diff --git a/llama.cpp b/llama.cpp index e7862e8f528c1..ae4777cf0d8e8 100644 --- a/llama.cpp 
+++ b/llama.cpp @@ -3093,6 +3093,103 @@ static bool llama_model_load( using llm_build_cb = std::function; +enum llm_rope_type { + LLM_ROPE, + LLM_ROPE_NEOX, + LLM_ROPE_GLM, +}; + +// Persimmon: n_rot = n_embd_head/2 +// Other: n_rot = n_embd_head +static void llm_build_k_shift( + const llama_context & lctx, + struct ggml_context * ctx, + struct ggml_cgraph * graph, + int64_t n_rot, + llm_rope_type type, + const llm_build_cb & cb) { + const auto & model = lctx.model; + const auto & kv_self = lctx.kv_self; + const auto & cparams = lctx.cparams; + + const auto & hparams = model.hparams; + + const int64_t n_layer = hparams.n_layer; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int64_t n_embd_head = hparams.n_embd_head(); + + const int64_t n_ctx = lctx.cparams.n_ctx; + + const float freq_base = cparams.rope_freq_base; + const float freq_scale = cparams.rope_freq_scale; + + GGML_ASSERT(n_embd_head % n_rot == 0); + + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); + cb(K_shift, "K_shift", -1); + + int rope_type = 0; + + switch (type) { + case LLM_ROPE: rope_type = 0; break; + case LLM_ROPE_NEOX: rope_type = 2; break; + case LLM_ROPE_GLM: rope_type = 4; break; + }; + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * tmp = + // we rotate only the first n_rot dimensions + ggml_rope_custom_inplace(ctx, + ggml_view_3d(ctx, kv_self.k, + n_rot, n_head_kv, n_ctx, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + K_shift, n_rot, rope_type, 0, freq_base, freq_scale); + cb(tmp, "K_shifted", il); + ggml_build_forward_expand(graph, tmp); + } +} + +static void llm_build_kv_store( + const llama_context & lctx, + struct ggml_context * ctx, + struct ggml_cgraph * graph, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + int32_t n_tokens, + int32_t kv_head, + const llm_build_cb & cb, + int64_t il) { + const auto & model = lctx.model; + const auto & kv_self = lctx.kv_self; + const auto & cparams = lctx.cparams; + + const auto & hparams = model.hparams; + + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + // compute the transposed [n_tokens, n_embd] V matrix + struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens)); + //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed + cb(v_cur_t, "v_cur_t", il); + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv_self.k, n_tokens*n_embd_gqa, + (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + cb(k_cache_view, "k_cache_view", il); + + struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv_self.v, n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + cb(v_cache_view, "v_cache_view", il); + + // important: storing RoPE-ed version of K in the KV cache! 
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); + ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view)); +} + enum llm_norm_type { LLM_NORM, LLM_NORM_RMS, @@ -3232,101 +3329,93 @@ static struct ggml_tensor * llm_build_ffn( return cur; } -enum llm_rope_type { - LLM_ROPE, - LLM_ROPE_NEOX, - LLM_ROPE_GLM, -}; - -// Persimmon: n_rot = n_embd_head/2 -// Other: n_rot = n_embd_head -static void llm_build_k_shift( +// if max_alibi_bias > 0 then apply ALiBi +static struct ggml_tensor * llm_build_kqv( const llama_context & lctx, struct ggml_context * ctx, - struct ggml_cgraph * graph, - int64_t n_rot, - llm_rope_type type, - const llm_build_cb & cb) { + struct ggml_tensor * cur, + struct ggml_tensor * wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * q_cur, + struct ggml_tensor * kq_scale, + struct ggml_tensor * kq_mask, + int32_t n_tokens, + int32_t n_kv, + float alibi_bias_max, + const llm_build_cb & cb, + int il) { const auto & model = lctx.model; const auto & kv_self = lctx.kv_self; const auto & cparams = lctx.cparams; const auto & hparams = model.hparams; - const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_embd = hparams.n_embd; + const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_gqa = hparams.n_embd_gqa(); const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); - const int64_t n_ctx = lctx.cparams.n_ctx; + struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); + cb(q, "q", il); - const float freq_base = cparams.rope_freq_base; - const float freq_scale = cparams.rope_freq_scale; + struct ggml_tensor * k = + ggml_view_3d(ctx, kv_self.k, + n_embd_head, n_kv, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + cb(k, "k", il); - GGML_ASSERT(n_embd_head % n_rot == 0); + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + cb(kq, "kq", il); - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); + kq = ggml_scale(ctx, kq, kq_scale); + cb(kq, "kq_scaled", il); - int rope_type = 0; + if (alibi_bias_max > 0.0f) { + // TODO: n_head or n_head_kv + // TODO: K-shift is likely not working + // TODO: change to ggml_add + kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, alibi_bias_max); + cb(kq, "kq_scaled_alibi", il); + } - switch (type) { - case LLM_ROPE: rope_type = 0; break; - case LLM_ROPE_NEOX: rope_type = 2; break; - case LLM_ROPE_GLM: rope_type = 4; break; - }; + kq = ggml_add(ctx, kq, kq_mask); + cb(kq, "kq_masked", il); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - // we rotate only the first n_rot dimensions - ggml_rope_custom_inplace(ctx, - ggml_view_3d(ctx, kv_self.k, - n_rot, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_rot, rope_type, 0, freq_base, freq_scale); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(graph, tmp); - } -} + kq = ggml_soft_max(ctx, kq); + cb(kq, "kq_soft_max", il); -static void llm_build_kv_store( - const llama_context & lctx, - struct ggml_context * ctx, - struct ggml_cgraph * graph, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - int32_t n_tokens, - int32_t kv_head, - const llm_build_cb & cb, - int64_t il) { - const auto & model = lctx.model; - 
const auto & kv_self = lctx.kv_self; - const auto & cparams = lctx.cparams; + // split cached v into n_head heads + struct ggml_tensor * v = + ggml_view_3d(ctx, kv_self.v, + n_kv, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + cb(v, "v", il); - const auto & hparams = model.hparams; + struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); + cb(kqv, "kqv", il); - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_embd_gqa = hparams.n_embd_gqa(); + struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); - // compute the transposed [n_tokens, n_embd] V matrix - struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens)); - //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed - cb(v_cur_t, "v_cur_t", il); + cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens); + cb(cur, "kqv_merged_cont", il); - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv_self.k, n_tokens*n_embd_gqa, - (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k_cache_view, "k_cache_view", il); + cur = ggml_mul_mat(ctx, wo, cur); + if (wo_b) { + cb(cur, "kqv_wo", il); + } - struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v_cache_view, "v_cache_view", il); + if (wo_b) { + cur = ggml_add(ctx, cur, wo_b); + } - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); - ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view)); + return cur; } static struct ggml_cgraph * llm_build_llama( @@ -3348,7 +3437,6 @@ static struct ggml_cgraph * llm_build_llama( const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3440,67 +3528,10 @@ static struct ggml_cgraph * llm_build_llama( llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_kv, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - 
-#if 1 - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? - struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - cb(cur, "result_wo", il); + cur = llm_build_kqv(lctx, ctx0, cur, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); @@ -3567,7 +3598,6 @@ static struct ggml_cgraph * llm_build_baichaun( const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3664,64 +3694,13 @@ static struct ggml_cgraph * llm_build_baichaun( llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); + // apply ALiBi for 13B model + const float alibi_bias_max = model.type == MODEL_13B ? 
8.0f : -1.0f; - struct ggml_tensor * KQ_masked; - struct ggml_tensor * KQ_scaled_alibi; - - switch (model.type) { - case MODEL_7B: - KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - break; - case MODEL_13B: - // TODO: replace with ggml_add() - KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); // TODO: n_head or n_head_kv - cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); - KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - break; - default: - GGML_ASSERT(false); - } - - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - cb(cur, "result_wo", il); + cur = llm_build_kqv(lctx, ctx0, cur, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, alibi_bias_max, cb, il); + cb(cur, "kqv_out", il); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); @@ -3896,48 +3875,10 @@ static struct ggml_cgraph * llm_build_falcon( llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo", il); + cur = llm_build_kqv(lctx, ctx0, attn_norm, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } struct ggml_tensor * attn_out = cur; @@ -3998,7 +3939,6 @@ static struct ggml_cgraph * llm_build_starcoder( const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); @@ -4085,50 +4025,12 @@ static struct 
ggml_cgraph * llm_build_starcoder( llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); + cur = llm_build_kqv(lctx, ctx0, cur, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); - cb(cur, "result_wo", il); - // Add the input cur = ggml_add(ctx0, cur, inpL); cb(cur, "inpL_+_result_wo", il); @@ -4190,7 +4092,6 @@ static struct ggml_cgraph * llm_build_persimmon( const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_head = hparams.n_head; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); const int64_t n_rot = n_embd_head / 2; const float freq_base = cparams.rope_freq_base; @@ -4376,47 +4277,11 @@ static struct ggml_cgraph * llm_build_persimmon( llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo", 
il); - - cur = ggml_add(ctx0, cur, model.layers[il].bo); - cb(cur, "result_wo_b", il); + // TODO: not tested, could be broken + cur = llm_build_kqv(lctx, ctx0, Q, + model.layers[il].wo, model.layers[il].bo, + Q, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur); @@ -4481,7 +4346,6 @@ static struct ggml_cgraph * llm_build_refact( const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); const float norm_rms_eps = hparams.f_norm_rms_eps; @@ -4554,53 +4418,10 @@ static struct ggml_cgraph * llm_build_refact( llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - cb(cur, "result_wo", il); + cur = llm_build_kqv(lctx, ctx0, Qcur, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); @@ -4665,7 +4486,6 @@ static struct ggml_cgraph * llm_build_bloom( const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); @@ -4750,56 +4570,12 @@ static struct ggml_cgraph * llm_build_bloom( llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - struct ggml_tensor * 
KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8); - cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); + cur = llm_build_kqv(lctx, ctx0, Qcur, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); } - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bo); - cb(cur, "result_wo_b", il); - // Add the input cur = ggml_add(ctx0, cur, inpL); cb(cur, "inpL_+_result_wo", il); @@ -4859,7 +4635,6 @@ static struct ggml_cgraph * llm_build_mpt( const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); @@ -4943,52 +4718,10 @@ static struct ggml_cgraph * llm_build_mpt( llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - // TODO: replace with ggml_add() - struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias); - cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo", il); + cur = llm_build_kqv(lctx, ctx0, Qcur, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, max_alibi_bias, cb, il); + cb(cur, "kqv_out", il); } // Add the input @@ -5164,22 +4897,21 @@ 
static const std::unordered_map k_offload_map { "krotated", OFFLOAD_FUNC_KQ }, { "qrotated", OFFLOAD_FUNC_KQ }, - { "Q", OFFLOAD_FUNC_KQ }, - { "K", OFFLOAD_FUNC_KQ }, - { "KQ", OFFLOAD_FUNC_KQ }, - { "KQ_scaled", OFFLOAD_FUNC_KQ }, - { "KQ_scaled_alibi", OFFLOAD_FUNC_KQ }, - { "KQ_masked", OFFLOAD_FUNC_KQ }, - { "KQ_soft_max", OFFLOAD_FUNC_V }, - { "V", OFFLOAD_FUNC_V }, - { "KQV", OFFLOAD_FUNC_V }, - { "KQV_merged", OFFLOAD_FUNC_V }, - { "KQV_merged_contiguous", OFFLOAD_FUNC_V }, - - { "result_wo", OFFLOAD_FUNC }, - { "result_wo_b", OFFLOAD_FUNC }, - { "inpL_+_result_wo", OFFLOAD_FUNC }, + { "q", OFFLOAD_FUNC_KQ }, + { "k", OFFLOAD_FUNC_KQ }, + { "kq", OFFLOAD_FUNC_KQ }, + { "kq_scaled", OFFLOAD_FUNC_KQ }, + { "kq_scaled_alibi", OFFLOAD_FUNC_KQ }, + { "kq_masked", OFFLOAD_FUNC_KQ }, + { "kq_soft_max", OFFLOAD_FUNC_V }, + { "v", OFFLOAD_FUNC_V }, + { "kqv", OFFLOAD_FUNC_V }, + { "kqv_merged", OFFLOAD_FUNC_V }, + { "kqv_merged_cont", OFFLOAD_FUNC_V }, + { "kqv_wo", OFFLOAD_FUNC_V }, + { "kqv_out", OFFLOAD_FUNC_V }, + { "inpL_+_result_wo", OFFLOAD_FUNC }, { "inpFF", OFFLOAD_FUNC }, { "ffn_norm", OFFLOAD_FUNC }, From 792d1a1b167a3ff41684652a62ba2e3a1444df9c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 30 Oct 2023 11:34:47 +0200 Subject: [PATCH 11/18] llama : minor --- llama.cpp | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/llama.cpp b/llama.cpp index ae4777cf0d8e8..d6b2d7289990a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3503,8 +3503,7 @@ static struct ggml_cgraph * llm_build_llama( // norm cur = llm_build_norm(ctx0, inpL, - model.layers[il].attn_norm, - NULL, + model.layers[il].attn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "attn_norm", il); @@ -3540,8 +3539,7 @@ static struct ggml_cgraph * llm_build_llama( // feed-forward network { cur = llm_build_norm(ctx0, inpFF, - model.layers[il].ffn_norm, - NULL, + model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); @@ -3563,8 +3561,7 @@ static struct ggml_cgraph * llm_build_llama( cur = inpL; cur = llm_build_norm(ctx0, cur, - model.output_norm, - NULL, + model.output_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, -1); cb(cur, "result_norm", -1); @@ -3661,8 +3658,7 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_tensor * inpSA = inpL; cur = llm_build_norm(ctx0, inpL, - model.layers[il].attn_norm, - NULL, + model.layers[il].attn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "attn_norm", il); @@ -3709,8 +3705,7 @@ static struct ggml_cgraph * llm_build_baichaun( // feed-forward network { cur = llm_build_norm(ctx0, inpFF, - model.layers[il].ffn_norm, - NULL, + model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); @@ -3732,8 +3727,7 @@ static struct ggml_cgraph * llm_build_baichaun( cur = inpL; cur = llm_build_norm(ctx0, cur, - model.output_norm, - NULL, + model.output_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, -1); cb(cur, "result_norm", -1); @@ -4394,8 +4388,7 @@ static struct ggml_cgraph * llm_build_refact( struct ggml_tensor * inpSA = inpL; cur = llm_build_norm(ctx0, inpL, - model.layers[il].attn_norm, - NULL, + model.layers[il].attn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "attn_norm", il); @@ -4430,8 +4423,7 @@ static struct ggml_cgraph * llm_build_refact( // feed-forward network { cur = llm_build_norm(ctx0, inpFF, - model.layers[il].ffn_norm, - NULL, + model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); @@ 
-4453,8 +4445,7 @@ static struct ggml_cgraph * llm_build_refact( cur = inpL; cur = llm_build_norm(ctx0, cur, - model.output_norm, - NULL, + model.output_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, -1); cb(cur, "result_norm", -1); From a3f80013adf4837326583c1ff3285cbbe66e2d8d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 30 Oct 2023 12:14:23 +0200 Subject: [PATCH 12/18] llama : add LLAMA_OFFLOAD_DEBUG + fix starcoder offloading --- llama.cpp | 109 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 74 insertions(+), 35 deletions(-) diff --git a/llama.cpp b/llama.cpp index d6b2d7289990a..a3a4ba6f6b0c5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3548,11 +3548,11 @@ static struct ggml_cgraph * llm_build_llama( model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_result", il); + cb(cur, "ffn_out", il); } cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_+_result_w2", il); + cb(cur, "inpFF_ffn_out", il); // input for next layer inpL = cur; @@ -3714,11 +3714,11 @@ static struct ggml_cgraph * llm_build_baichaun( model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_result", il); + cb(cur, "ffn_out", il); } cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_+_result_w2", il); + cb(cur, "inpFF_ffn_out", il); // input for next layer inpL = cur; @@ -3884,14 +3884,14 @@ static struct ggml_cgraph * llm_build_falcon( NULL, NULL, model.layers[il].ffn_down, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_result", il); + cb(cur, "ffn_out", il); } cur = ggml_add(ctx0, cur, attn_out); - cb(cur, "inpFF_+_result_w2", il); + cb(cur, "inpFF_ffn_out", il); cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_+_inpFF_+_result_w2", il); + cb(cur, "inpL_inpFF_ffn_out", il); // input for next layer inpL = cur; @@ -3988,6 +3988,7 @@ static struct ggml_cgraph * llm_build_starcoder( cb(KQ_mask, "KQ_mask", -1); pos = ggml_get_rows(ctx0, model.pos_embeddings, inp_pos); + cb(pos, "pos_embd", -1); inpL = ggml_add(ctx0, embd, pos); cb(inpL, "inpL", -1); @@ -4027,7 +4028,7 @@ static struct ggml_cgraph * llm_build_starcoder( // Add the input cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_+_result_wo", il); + cb(cur, "inpL_kqv_out", il); struct ggml_tensor * inpFF = cur; @@ -4044,11 +4045,11 @@ static struct ggml_cgraph * llm_build_starcoder( NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_result", il); + cb(cur, "ffn_out", il); } inpL = ggml_add(ctx0, cur, inpFF); - + cb(inpL, "inpL_inpFF_ffn_out", il); } cur = llm_build_norm(ctx0, inpL, @@ -4294,11 +4295,11 @@ static struct ggml_cgraph * llm_build_persimmon( NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_result", il); + cb(cur, "ffn_out", il); } cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_+_result_w2", il); + cb(cur, "inpFF_ffn_out", il); inpL = cur; } @@ -4432,11 +4433,11 @@ static struct ggml_cgraph * llm_build_refact( model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_result", il); + cb(cur, "ffn_out", il); } cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_+_result_w2", il); + cb(cur, "inpFF_ffn_out", il); // input for next layer inpL = cur; @@ -4569,7 +4570,7 @@ static struct ggml_cgraph * llm_build_bloom( // Add the input cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_+_result_wo", il); + 
cb(cur, "inpL_kqv_out", il); struct ggml_tensor * inpFF = cur; @@ -4586,11 +4587,11 @@ static struct ggml_cgraph * llm_build_bloom( NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_result", il); + cb(cur, "ffn_out", il); } inpL = ggml_add(ctx0, cur, inpFF); - cb(inpL, "inpFF_+_result_w2", il); + cb(inpL, "inpFF_ffn_out", il); } cur = llm_build_norm(ctx0, inpL, @@ -4717,7 +4718,7 @@ static struct ggml_cgraph * llm_build_mpt( // Add the input cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_+_result_wo", il); + cb(cur, "inpL_kqv_out", il); struct ggml_tensor * attn_out = cur; @@ -4734,11 +4735,11 @@ static struct ggml_cgraph * llm_build_mpt( NULL, NULL, model.layers[il].ffn_down, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_result", il); + cb(cur, "ffn_out", il); } cur = ggml_add(ctx0, cur, attn_out); - cb(cur, "inpL_+_inpFF_+_result_w2", il); + cb(cur, "inpL_inpFF_ffn_out", il); // input for next layer inpL = cur; @@ -4777,6 +4778,7 @@ enum llm_offload_func_e { OFFLOAD_FUNC_OUT, }; +// TODO: will be removed with backend v2 struct llm_offload_trie { struct node { ~node() { @@ -4850,10 +4852,12 @@ struct llm_offload_trie { node * root = nullptr; }; +// TODO: will be removed with backend v2 static const std::unordered_map k_offload_map = { //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel { "inp_pos", OFFLOAD_FUNC_NR }, + { "pos_embd", OFFLOAD_FUNC_NR }, { "KQ_mask", OFFLOAD_FUNC_NR }, { "K_shift", OFFLOAD_FUNC_NR }, @@ -4902,7 +4906,7 @@ static const std::unordered_map k_offload_map { "kqv_wo", OFFLOAD_FUNC_V }, { "kqv_out", OFFLOAD_FUNC_V }, - { "inpL_+_result_wo", OFFLOAD_FUNC }, + { "inpL_kqv_out", OFFLOAD_FUNC }, { "inpFF", OFFLOAD_FUNC }, { "ffn_norm", OFFLOAD_FUNC }, @@ -4914,15 +4918,15 @@ static const std::unordered_map k_offload_map { "ffn_gate_par", OFFLOAD_FUNC }, { "ffn_down", OFFLOAD_FUNC }, { "ffn_down_b", OFFLOAD_FUNC }, - { "ffn_result", OFFLOAD_FUNC }, + { "ffn_out", OFFLOAD_FUNC }, { "ffn_silu", OFFLOAD_FUNC }, { "ffn_gelu", OFFLOAD_FUNC }, { "ffn_relu", OFFLOAD_FUNC }, { "ffn_sqr(relu)", OFFLOAD_FUNC }, - { "inpFF_+_result_w2", OFFLOAD_FUNC }, - { "inpL_+_inpFF_+_result_w2", OFFLOAD_FUNC }, + { "inpFF_ffn_out", OFFLOAD_FUNC }, + { "inpL_inpFF_ffn_out", OFFLOAD_FUNC }, { "result_norm", OFFLOAD_FUNC_EMB }, { "result_output", OFFLOAD_FUNC_OUT }, @@ -4946,6 +4950,14 @@ static struct ggml_cgraph * llama_build_graph( bool alloc_inp_KQ_mask = false; bool alloc_inp_K_shift = false; +#ifdef GGML_USE_CUBLAS + const bool do_offload = true; +#else + const bool do_offload = true; // TODO: set to false after finishing refactoring +#endif + + int n_non_view = 0; // number of non-view tensors that have been processed by the callback + // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { if (il >= 0) { @@ -5053,26 +5065,26 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_K_shift = true; } + // view tensors are not processed further + if (cur->view_src != nullptr) { + return; + } + + if (cur->op != GGML_OP_NONE) { + n_non_view++; + } + // // offload layers // - // TODO: this code will be obsoleted with backend v2 + // TODO: will be removed with backend v2 -#ifdef GGML_USE_CUBLAS - const bool do_offload = true; -#else - const bool do_offload = true; // TODO: set to false after finishing refactoring -#endif +//#define LLAMA_OFFLOAD_DEBUG if (!do_offload) { return; } - // view tensors are not offloaded - if (cur->view_src != nullptr) { - return; - } - const int n_layer = model.hparams.n_layer; const int n_gpu_layers = model.n_gpu_layers; @@ -5103,11 +5115,13 @@ static struct ggml_cgraph * llama_build_graph( llm_offload_func_e func_e = k_offload_func_trie.find(name); if (func_e == OFFLOAD_FUNC_NOP) { +#ifdef LLAMA_OFFLOAD_DEBUG // if a tensor hasn't been offloaded, we warn the user if (worst_case) { LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); } +#endif return; } @@ -5170,9 +5184,11 @@ static struct ggml_cgraph * llama_build_graph( // apply offload function to the tensor func(cur); +#ifdef LLAMA_OFFLOAD_DEBUG if (worst_case) { LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str()); } +#endif }; struct ggml_cgraph * result = NULL; @@ -5214,6 +5230,29 @@ static struct ggml_cgraph * llama_build_graph( GGML_ASSERT(false); } + if (worst_case) { + int n_non_view_total = 0; + + for (int i = 0; i < result->n_nodes; ++i) { + if (result->nodes[i]->view_src == nullptr) { + n_non_view_total++; + } + } + + LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total); + +#ifdef LLAMA_OFFLOAD_DEBUG + if (n_non_view != n_non_view_total) { + LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); + LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__); + LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__); + LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__); + LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__); + LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); + } +#endif + } + return result; } From 2926ef63b179537627d6722c58cc2d666ac7d30e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 31 Oct 2023 08:23:43 +0200 Subject: [PATCH 13/18] llama : fix input allocation logic --- llama.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index a3a4ba6f6b0c5..75a74c5a473dc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4970,10 +4970,10 @@ static struct ggml_cgraph * llama_build_graph( // allocate input tensors and set input data // - if (batch.token && !alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { + if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { ggml_allocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) { const int64_t n_tokens = cur->ne[0]; memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur)); @@ -4982,10 +4982,10 @@ static struct ggml_cgraph * 
llama_build_graph( alloc_inp_tokens = true; } - if (batch.embd && !alloc_inp_embd && strcmp(name, "inp_embd") == 0) { + if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) { ggml_allocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_allocr_is_measure(lctx.alloc) && batch.embd) { const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; @@ -4995,10 +4995,10 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_embd = true; } - if (batch.pos && !alloc_inp_pos && strcmp(name, "inp_pos") == 0) { + if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) { ggml_allocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) { const int64_t n_tokens = cur->ne[0]; int32_t * data = (int32_t *) cur->data; From 6669cd8329e443ca4fae635a47da6318767b0f7b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 31 Oct 2023 08:24:07 +0200 Subject: [PATCH 14/18] llama : update offload functions for KQ tensors --- llama.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 75a74c5a473dc..e744fa217b84b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4856,12 +4856,13 @@ struct llm_offload_trie { static const std::unordered_map k_offload_map = { //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel - { "inp_pos", OFFLOAD_FUNC_NR }, { "pos_embd", OFFLOAD_FUNC_NR }, - { "KQ_mask", OFFLOAD_FUNC_NR }, - { "K_shift", OFFLOAD_FUNC_NR }, - { "K_shifted", OFFLOAD_FUNC_NR }, + { "inp_pos", OFFLOAD_FUNC_KQ }, // this is often used for KQ ops (e.g. rope) + { "KQ_scale", OFFLOAD_FUNC_KQ }, + { "KQ_mask", OFFLOAD_FUNC_KQ }, + { "K_shift", OFFLOAD_FUNC_KQ }, + { "K_shifted", OFFLOAD_FUNC_KQ }, { "inp_norm", OFFLOAD_FUNC_NR }, { "inp_norm_w", OFFLOAD_FUNC_NR }, From 0bfdcdd0f83ed637179e5ac49a0ddd551763634d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 31 Oct 2023 08:46:34 +0200 Subject: [PATCH 15/18] llama : normalize tensor names ggml-ci --- llama.cpp | 95 ++++++++++++++++++++++++++----------------------------- 1 file changed, 44 insertions(+), 51 deletions(-) diff --git a/llama.cpp b/llama.cpp index e744fa217b84b..f69af36ec1e4a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3533,12 +3533,12 @@ static struct ggml_cgraph * llm_build_llama( cb(cur, "kqv_out", il); } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - cb(inpFF, "inpFF", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); // feed-forward network { - cur = llm_build_norm(ctx0, inpFF, + cur = llm_build_norm(ctx0, ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); @@ -3551,8 +3551,8 @@ static struct ggml_cgraph * llm_build_llama( cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_ffn_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -3699,12 +3699,12 @@ static struct ggml_cgraph * llm_build_baichaun( cb(cur, "kqv_out", il); } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - cb(inpFF, "inpFF", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); // feed-forward network { - cur = llm_build_norm(ctx0, inpFF, + cur = llm_build_norm(ctx0, ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); @@ 
-3717,8 +3717,8 @@ static struct ggml_cgraph * llm_build_baichaun( cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_ffn_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -3875,7 +3875,7 @@ static struct ggml_cgraph * llm_build_falcon( cb(cur, "kqv_out", il); } - struct ggml_tensor * attn_out = cur; + struct ggml_tensor * ffn_inp = cur; // feed forward { @@ -3887,11 +3887,11 @@ static struct ggml_cgraph * llm_build_falcon( cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, attn_out); - cb(cur, "inpFF_ffn_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_inpFF_ffn_out", il); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -4026,15 +4026,13 @@ static struct ggml_cgraph * llm_build_starcoder( cb(cur, "kqv_out", il); } - // Add the input - cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_kqv_out", il); - - struct ggml_tensor * inpFF = cur; + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); // FF { - cur = llm_build_norm(ctx0, inpFF, + cur = llm_build_norm(ctx0, ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, norm_eps, cb, il); @@ -4048,8 +4046,8 @@ static struct ggml_cgraph * llm_build_starcoder( cb(cur, "ffn_out", il); } - inpL = ggml_add(ctx0, cur, inpFF); - cb(inpL, "inpL_inpFF_ffn_out", il); + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); } cur = llm_build_norm(ctx0, inpL, @@ -4279,12 +4277,12 @@ static struct ggml_cgraph * llm_build_persimmon( cb(cur, "kqv_out", il); } - struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur); - cb(inpFF, "inpFF", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); + cb(ffn_inp, "ffn_inp", il); // feed-forward network { - cur = llm_build_norm(ctx0, inpFF, + cur = llm_build_norm(ctx0, ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, norm_eps, cb, il); @@ -4298,8 +4296,8 @@ static struct ggml_cgraph * llm_build_persimmon( cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_ffn_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); inpL = cur; } @@ -4418,12 +4416,12 @@ static struct ggml_cgraph * llm_build_refact( cb(cur, "kqv_out", il); } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - cb(inpFF, "inpFF", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); // feed-forward network { - cur = llm_build_norm(ctx0, inpFF, + cur = llm_build_norm(ctx0, ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, norm_rms_eps, cb, il); cb(cur, "ffn_norm", il); @@ -4436,8 +4434,8 @@ static struct ggml_cgraph * llm_build_refact( cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_ffn_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -4569,14 +4567,12 @@ static struct ggml_cgraph * llm_build_bloom( } // Add the input - cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_kqv_out", il); - - struct ggml_tensor * inpFF = cur; + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); // FF { - cur = llm_build_norm(ctx0, inpFF, + cur = llm_build_norm(ctx0, ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, norm_eps, cb, il); @@ -4590,8 +4586,8 @@ static struct ggml_cgraph * llm_build_bloom( cb(cur, "ffn_out", il); } - 
inpL = ggml_add(ctx0, cur, inpFF); - cb(inpL, "inpFF_ffn_out", il); + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); } cur = llm_build_norm(ctx0, inpL, @@ -4717,14 +4713,12 @@ static struct ggml_cgraph * llm_build_mpt( } // Add the input - cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_kqv_out", il); - - struct ggml_tensor * attn_out = cur; + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); // feed forward { - cur = llm_build_norm(ctx0, attn_out, + cur = llm_build_norm(ctx0, ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM, norm_eps, cb, il); @@ -4738,8 +4732,8 @@ static struct ggml_cgraph * llm_build_mpt( cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, attn_out); - cb(cur, "inpL_inpFF_ffn_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -4907,9 +4901,7 @@ static const std::unordered_map k_offload_map { "kqv_wo", OFFLOAD_FUNC_V }, { "kqv_out", OFFLOAD_FUNC_V }, - { "inpL_kqv_out", OFFLOAD_FUNC }, - { "inpFF", OFFLOAD_FUNC }, - + { "ffn_inp", OFFLOAD_FUNC }, { "ffn_norm", OFFLOAD_FUNC }, { "ffn_up", OFFLOAD_FUNC }, @@ -4926,8 +4918,7 @@ static const std::unordered_map k_offload_map { "ffn_relu", OFFLOAD_FUNC }, { "ffn_sqr(relu)", OFFLOAD_FUNC }, - { "inpFF_ffn_out", OFFLOAD_FUNC }, - { "inpL_inpFF_ffn_out", OFFLOAD_FUNC }, + { "l_out", OFFLOAD_FUNC }, { "result_norm", OFFLOAD_FUNC_EMB }, { "result_output", OFFLOAD_FUNC_OUT }, @@ -4960,6 +4951,7 @@ static struct ggml_cgraph * llama_build_graph( int n_non_view = 0; // number of non-view tensors that have been processed by the callback // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) + // TODO: will be removed with backend v2 llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { if (il >= 0) { ggml_format_name(cur, "%s-%d", name, il); @@ -4970,6 +4962,7 @@ static struct ggml_cgraph * llama_build_graph( // // allocate input tensors and set input data // + // TODO: will be removed with backend v2 if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { ggml_allocr_alloc(lctx.alloc, cur); From fc5a26aadea54e2bcf6dd384e1ca0c846575bc0c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 31 Oct 2023 08:57:10 +0200 Subject: [PATCH 16/18] llama : enable warning about not offloaded tensors --- llama.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index f69af36ec1e4a..68cb835e1ee21 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5235,7 +5235,6 @@ static struct ggml_cgraph * llama_build_graph( LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total); -#ifdef LLAMA_OFFLOAD_DEBUG if (n_non_view != n_non_view_total) { LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__); @@ -5244,7 +5243,6 @@ static struct ggml_cgraph * llama_build_graph( LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__); LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); } -#endif } return result; From 2073347e3bbbf437c2dff5df22baa1f754468633 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 31 Oct 2023 16:28:09 +0200 Subject: [PATCH 17/18] llama : remove extra ; + deduplicate gate_b logic --- llama.cpp | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff 
--git a/llama.cpp b/llama.cpp index 68cb835e1ee21..5fed1c80578e4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3135,7 +3135,7 @@ static void llm_build_k_shift( case LLM_ROPE: rope_type = 0; break; case LLM_ROPE_NEOX: rope_type = 2; break; case LLM_ROPE_GLM: rope_type = 4; break; - }; + } for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = @@ -3207,7 +3207,8 @@ static struct ggml_tensor * llm_build_norm( switch (type) { case LLM_NORM: cur = ggml_norm (ctx, cur, eps); break; case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, eps); break; - }; + } + if (mw || mb) { cb(cur, "norm", il); } @@ -3265,23 +3266,18 @@ static struct ggml_tensor * llm_build_ffn( { cur = ggml_mul_mat(ctx, gate, tmp); cb(cur, "ffn_gate", il); - - if (gate_b) { - cur = ggml_add(ctx, cur, gate_b); - cb(cur, "ffn_gate_b", il); - } } break; case LLM_FFN_PAR: { cur = ggml_mul_mat(ctx, gate, cur); cb(cur, "ffn_gate", il); - - if (gate_b) { - cur = ggml_add(ctx, cur, gate_b); - cb(cur, "ffn_gate_b", il); - } } break; - }; + } + + if (gate_b) { + cur = ggml_add(ctx, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } } else { cur = tmp; } @@ -3310,7 +3306,7 @@ static struct ggml_tensor * llm_build_ffn( cur = ggml_sqr(ctx, cur); cb(cur, "ffn_sqr(relu)", il); } break; - }; + } if (type_gate == LLM_FFN_PAR) { cur = ggml_mul(ctx, cur, tmp); @@ -4098,6 +4094,7 @@ static struct ggml_cgraph * llm_build_persimmon( const bool do_rope_shift = worst_case || kv_self.has_shift; auto & buf_compute = lctx.buf_compute; + struct ggml_init_params params = { /*.mem_size =*/ buf_compute.size, /*.mem_buffer =*/ buf_compute.data, From 7923b70cb8033ccf4097b7a95013fa6b0ad22ecd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 31 Oct 2023 16:43:08 +0200 Subject: [PATCH 18/18] llama : add llm_build_inp_embd helper --- llama.cpp | 161 +++++++++++++++++------------------------------------- 1 file changed, 50 insertions(+), 111 deletions(-) diff --git a/llama.cpp b/llama.cpp index 5fed1c80578e4..f3db4dc21c11b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1228,8 +1228,8 @@ struct llama_model { llama_hparams hparams = {}; llama_vocab vocab; - struct ggml_tensor * tok_embeddings; - struct ggml_tensor * pos_embeddings; + struct ggml_tensor * tok_embd; + struct ggml_tensor * pos_embd; struct ggml_tensor * tok_norm; struct ggml_tensor * tok_norm_b; @@ -2484,7 +2484,7 @@ static void llm_load_tensors( case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -2552,7 +2552,7 @@ static void llm_load_tensors( } break; case LLM_ARCH_BAICHUAN: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); { ggml_backend_type backend_norm; ggml_backend_type backend_output; @@ -2620,7 +2620,7 @@ static void llm_load_tensors( { // TODO: CPU-only for now - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -2696,8 +2696,8 @@ static void llm_load_tensors( } break; case LLM_ARCH_STARCODER: { - model.tok_embeddings = ml.create_tensor(ctx, 
tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); // output { @@ -2775,7 +2775,7 @@ static void llm_load_tensors( } break; case LLM_ARCH_PERSIMMON: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); { ggml_backend_type backend_norm; @@ -2838,9 +2838,9 @@ static void llm_load_tensors( { // TODO: CPU-only for now - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); - model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); + model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); // output { @@ -2918,7 +2918,7 @@ static void llm_load_tensors( } break; case LLM_ARCH_MPT: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -3099,6 +3099,31 @@ enum llm_rope_type { LLM_ROPE_GLM, }; +static struct ggml_tensor * llm_build_inp_embd( + struct ggml_context * ctx, + const llama_batch & batch, + struct ggml_tensor * tok_embd, + int64_t n_embd, + int32_t n_tokens, + const llm_build_cb & cb) { + struct ggml_tensor * inpL; + + if (batch.token) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens); + cb(inp_tokens, "inp_tokens", -1); + + inpL = ggml_get_rows(ctx, tok_embd, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + + inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); + } + + return inpL; +} + // Persimmon: n_rot = n_embd_head/2 // Other: n_rot = n_embd_head static void llm_build_k_shift( @@ -3463,18 +3488,7 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions @@ -3619,18 +3633,7 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, 
n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions @@ -3789,18 +3792,7 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions @@ -3953,23 +3945,11 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * cur; - struct ggml_tensor * embd; struct ggml_tensor * pos; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } - cb(embd, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); + cb(inpL, "inp_embd", -1); // inp_pos - contains the positions struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); @@ -3983,10 +3963,10 @@ static struct ggml_cgraph * llm_build_starcoder( struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); - pos = ggml_get_rows(ctx0, model.pos_embeddings, inp_pos); + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); - inpL = ggml_add(ctx0, embd, pos); + inpL = ggml_add(ctx0, inpL, pos); cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { @@ -4108,14 +4088,7 @@ static struct ggml_cgraph * llm_build_persimmon( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "imp_embd", -1); struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); @@ -4358,18 +4331,7 @@ static struct ggml_cgraph * llm_build_refact( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "inp_embd", -1); // 
KQ_scale @@ -4499,22 +4461,10 @@ static struct ggml_cgraph * llm_build_bloom( ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * cur; - struct ggml_tensor * embd; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } - cb(embd, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); + cb(inpL, "inp_embd", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); @@ -4524,7 +4474,7 @@ static struct ggml_cgraph * llm_build_bloom( struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); - inpL = llm_build_norm(ctx0, embd, + inpL = llm_build_norm(ctx0, inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, norm_eps, cb, -1); @@ -4648,18 +4598,7 @@ static struct ggml_cgraph * llm_build_mpt( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "inp_embd", -1); // KQ_scale
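// [editor note] llm_build_inp_embd (added in PATCH 18/18 above) picks the input source once:
// if the batch carries token ids, the embedding matrix is indexed via ggml_get_rows; otherwise
// a raw float tensor is created and later filled with precomputed embeddings. The standalone
// sketch below mirrors only the row-gather half with plain std::vector so the shape logic is
// easy to follow; gather_rows is an illustrative helper invented here, not a ggml or llama.cpp
// function.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// tok_embd is row-major [n_vocab x n_embd]; the result is [n_tokens x n_embd]
static std::vector<float> gather_rows(const std::vector<float> & tok_embd, int64_t n_embd,
                                      const std::vector<int32_t> & tokens) {
    std::vector<float> out(tokens.size() * n_embd);
    for (size_t i = 0; i < tokens.size(); ++i) {
        std::copy_n(tok_embd.begin() + tokens[i] * n_embd, n_embd, out.begin() + i * n_embd);
    }
    return out;
}

int main() {
    const int64_t n_embd = 4, n_vocab = 8;

    std::vector<float> tok_embd(n_vocab * n_embd);
    for (size_t i = 0; i < tok_embd.size(); ++i) {
        tok_embd[i] = (float) i;
    }

    // "batch.token" path: one embedding row per token id
    const std::vector<int32_t> tokens = { 2, 5, 5 };
    const auto inpL = gather_rows(tok_embd, n_embd, tokens);
    std::printf("inpL is %zu x %lld\n", tokens.size(), (long long) n_embd);

    // the "batch.embd" path would instead copy n_tokens*n_embd floats straight into inpL,
    // which is what the helper's else-branch allocates room for
}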