From f85020b19ac853a6bbad6092e0cc344e27553aea Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 29 May 2023 20:49:24 +0300
Subject: [PATCH 01/49] mtl : export the LLaMA computation graph

---
 examples/CMakeLists.txt     |  1 +
 examples/mtl/CMakeLists.txt |  7 ++++++
 examples/mtl/mtl-export.cpp | 25 +++++++++++++++++++++
 llama.cpp                   | 44 ++++++++++++++++++++++++++++----------
 llama.h                     |  4 ++++
 5 files changed, 71 insertions(+), 10 deletions(-)
 create mode 100644 examples/mtl/CMakeLists.txt
 create mode 100644 examples/mtl/mtl-export.cpp

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index e4ce5aca7b98b..97a3ffd1b6db7 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -37,6 +37,7 @@ else()
     add_subdirectory(save-load-state)
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
+    add_subdirectory(mtl)
     if(LLAMA_BUILD_SERVER)
         add_subdirectory(server)
     endif()
diff --git a/examples/mtl/CMakeLists.txt b/examples/mtl/CMakeLists.txt
new file mode 100644
index 0000000000000..4dc0bc596bfe5
--- /dev/null
+++ b/examples/mtl/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(TARGET mtl-export)
+add_executable(${TARGET} mtl-export.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
diff --git a/examples/mtl/mtl-export.cpp b/examples/mtl/mtl-export.cpp
new file mode 100644
index 0000000000000..7872182a111bb
--- /dev/null
+++ b/examples/mtl/mtl-export.cpp
@@ -0,0 +1,25 @@
+#include "common.h"
+#include "llama.h"
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        return 1;
+    }
+
+    llama_init_backend();
+
+    llama_context * ctx = llama_init_from_gpt_params(params);
+    if (ctx == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        return 1;
+    }
+
+    llama_eval_export(ctx, "llama.ggml");
+
+    llama_print_timings(ctx);
+    llama_free(ctx);
+
+    return 0;
+}
diff --git a/llama.cpp b/llama.cpp
index 5a19316b39127..9dccf0ed15362 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1189,17 +1189,19 @@ static bool llama_model_load(
 
 // evaluate the transformer
 //
-// - lctx:      llama context
-// - tokens:    new batch of tokens to process
-// - n_past:    the context size so far
-// - n_threads: number of threads to use
+// - lctx:         llama context
+// - tokens:       new batch of tokens to process
+// - n_past:       the context size so far
+// - n_threads:    number of threads to use
+// - cgraph_fname: filename of the exported computation graph (TODO: TMP!!!)
 //
 static bool llama_eval_internal(
-        llama_context & lctx,
-    const llama_token * tokens,
-            const int   n_tokens,
-            const int   n_past,
-            const int   n_threads) {
+         llama_context & lctx,
+     const llama_token * tokens,
+             const int   n_tokens,
+             const int   n_past,
+             const int   n_threads,
+            const char * cgraph_fname) {
 
     // enforce that the first token is BOS
     if (n_past == 0 && tokens[0] != llama_token_bos()) {
@@ -1422,6 +1424,10 @@ static bool llama_eval_internal(
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute       (ctx0, &gf);
 
+    if (cgraph_fname) {
+        ggml_graph_export(&gf, cgraph_fname);
+    }
+
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
     // requires GGML_PERF to be defined
@@ -2899,7 +2905,7 @@ int llama_eval(
                          int   n_tokens,
                          int   n_past,
                          int   n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
+    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -2914,6 +2920,24 @@ int llama_eval(
     return 0;
 }
 
+int llama_eval_export(struct llama_context * ctx, const char * fname) {
+    // these values determine the maximum inference sizes of the exported computation graph
+    // TODO: TMP !!!
+    //const int n_ctx   = ctx->model.hparams.n_ctx;
+    //const int n_batch = 512;
+    const int n_ctx   = 128;
+    const int n_batch = 32;
+
+    const std::vector<llama_token> tmp(n_batch, llama_token_bos());
+
+    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    return 0;
+}
+
 int llama_tokenize(
         struct llama_context * ctx,
                   const char * text,
diff --git a/llama.h b/llama.h
index c6b0a2889f8de..3ba0775bd8a38 100644
--- a/llama.h
+++ b/llama.h
@@ -173,6 +173,10 @@ extern "C" {
                            int   n_past,
                            int   n_threads);
 
+    // Export a computation graph for model inference
+    // TODO: very likely to change
+    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens

From 98c267fc77fe811082f672538fc91bcfc9072d63 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 29 May 2023 20:57:24 +0300
Subject: [PATCH 02/49] ci : disable temporary

---
 .github/workflows/editorconfig.yml | 17 -----------------
 .github/workflows/tidy-post.yml    | 20 --------------------
 .github/workflows/tidy-review.yml  | 23 -----------------------
 3 files changed, 60 deletions(-)
 delete mode 100644 .github/workflows/editorconfig.yml
 delete mode 100644 .github/workflows/tidy-post.yml
 delete mode 100644 .github/workflows/tidy-review.yml

diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml
deleted file mode 100644
index b4e535acf1f64..0000000000000
--- a/.github/workflows/editorconfig.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-name: EditorConfig Checker
-
-on:
-  push:
-    branches:
-      - master
-  pull_request:
-    branches:
-      - master
-
-jobs:
-  editorconfig:
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v3
-    - uses: editorconfig-checker/action-editorconfig-checker@main
-    - run: editorconfig-checker
diff --git a/.github/workflows/tidy-post.yml b/.github/workflows/tidy-post.yml
deleted file mode 100644
index a58da0cd6493d..0000000000000
--- a/.github/workflows/tidy-post.yml
+++ /dev/null
@@ -1,20 +0,0 @@
-name: clang-tidy review post comments
-
-on:
-  workflow_run:
-    workflows: ["clang-tidy-review"]
-    types:
-      - completed
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: ZedThree/clang-tidy-review/post@v0.13.0
-        # lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup
-        with:
-          # adjust options as necessary
-          lgtm_comment_body: ''
-          annotations: false
-          max_comments: 25
diff --git a/.github/workflows/tidy-review.yml b/.github/workflows/tidy-review.yml
deleted file mode 100644
index a4bc8d976560e..0000000000000
--- a/.github/workflows/tidy-review.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: clang-tidy-review
-
-on:
-  pull_request:
-    branches:
-      - master
-
-jobs:
-  clang-tidy-review:
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v3
-
-      - uses: ZedThree/clang-tidy-review@v0.13.0
-        id: review
-        with:
-          lgtm_comment_body: ''
-          build_dir: build
-          cmake_command: cmake .
-B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on - split_workflow: true - - - uses: ZedThree/clang-tidy-review/upload@v0.13.0 From b23fe8c9c78d066461c81447566844ecf22a4a8e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 29 May 2023 21:09:47 +0300 Subject: [PATCH 03/49] mtl : adapt the MNIST example as starter --- examples/mtl/CMakeLists.txt | 22 +++ examples/mtl/mtl.cpp | 51 ++++++ examples/mtl/mtl.h | 28 +++ examples/mtl/mtl.m | 357 ++++++++++++++++++++++++++++++++++++ 4 files changed, 458 insertions(+) create mode 100644 examples/mtl/mtl.cpp create mode 100644 examples/mtl/mtl.h create mode 100644 examples/mtl/mtl.m diff --git a/examples/mtl/CMakeLists.txt b/examples/mtl/CMakeLists.txt index 4dc0bc596bfe5..a8923405f4c86 100644 --- a/examples/mtl/CMakeLists.txt +++ b/examples/mtl/CMakeLists.txt @@ -2,6 +2,28 @@ set(TARGET mtl-export) add_executable(${TARGET} mtl-export.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) + if(TARGET BUILD_INFO) add_dependencies(${TARGET} BUILD_INFO) endif() + +if (APPLE) + # + # mtl + + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED) + + set(TEST_TARGET mtl) + add_executable(${TEST_TARGET} mtl.cpp mtl.h mtl.m) + target_link_libraries(${TEST_TARGET} PRIVATE + ggml + ${FOUNDATION_LIBRARY} + ${METAL_FRAMEWORK} + ${METALKIT_FRAMEWORK} + ${METALPERFORMANCE_FRAMEWORK} + ) +endif() + diff --git a/examples/mtl/mtl.cpp b/examples/mtl/mtl.cpp new file mode 100644 index 0000000000000..68e828d4eb616 --- /dev/null +++ b/examples/mtl/mtl.cpp @@ -0,0 +1,51 @@ +#include "ggml.h" +#include "mtl.h" + +#include +#include +#include + +int main(int argc, char ** argv) { + ggml_time_init(); + + if (argc != 2) { + fprintf(stderr, "Usage: %s llama.ggml\n", argv[0]); + return -1; + } + + const char * fname_cgraph = argv[1]; + + // load the compute graph + struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_eval = NULL; + + struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); + gf.n_threads = 1; + + // allocate work context + static size_t buf_size = gf.work_size; // TODO + static void * buf = malloc(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx_work = ggml_init(params); + + // this allocates all Metal resources and memory buffers + auto * ctx_mtl = llama_mtl_init(ctx_data, ctx_eval, ctx_work, &gf); + + // the actual inference happens here + llama_mtl_eval(ctx_mtl, &gf); + + llama_mtl_free(ctx_mtl); + + ggml_free(ctx_work); + ggml_free(ctx_data); + ggml_free(ctx_eval); + + return 0; +} + diff --git a/examples/mtl/mtl.h b/examples/mtl/mtl.h new file mode 100644 index 0000000000000..a40d5711100fd --- /dev/null +++ b/examples/mtl/mtl.h @@ -0,0 +1,28 @@ +#pragma once + +struct ggml_context; +struct ggml_cgraph; + +#ifdef __cplusplus +extern "C" { +#endif + +struct ggml_mtl_context; + +struct ggml_mtl_context * llama_mtl_init( + struct ggml_context * ctx_data, + struct ggml_context * ctx_eval, + struct ggml_context * ctx_work, + struct ggml_cgraph * gf); + +void llama_mtl_free(struct ggml_mtl_context * ctx); + +// return 0 on success +int llama_mtl_eval( + struct ggml_mtl_context * ctx, + struct ggml_cgraph * gf); + +#ifdef __cplusplus +} +#endif + diff --git 
a/examples/mtl/mtl.m b/examples/mtl/mtl.m new file mode 100644 index 0000000000000..58f1f0371e6bf --- /dev/null +++ b/examples/mtl/mtl.m @@ -0,0 +1,357 @@ +#import "mtl.h" + +#import "ggml.h" + +#import +#import +#import + +struct ggml_mtl_context { + struct ggml_context * ctx_data; + struct ggml_context * ctx_eval; + struct ggml_context * ctx_work; + + id device; + id queue; + id library; + + id buffer_data; + id buffer_eval; + + id out; + + // custom kernels + id function_add; + id pipeline_add; + + id function_relu; + id pipeline_relu; + + id function_soft_max; + id pipeline_soft_max; +}; + +// MSL code +NSString * const msl_library_llama = @"\ +#include \n\ +using namespace metal; \n\ + \n\ +#define MAX(x, y) ((x) > (y) ? (x) : (y)) \n\ + \n\ +constant int k_digits [[function_constant(0)]]; \n\ + \n\ +kernel void kernel_add( \n\ + device const float * src0, \n\ + device const float * src1, \n\ + device float * dst, \n\ + uint gid[[thread_position_in_grid]]) { \n\ + dst[gid] = src0[gid] + src1[gid]; \n\ +} \n\ + \n\ +kernel void kernel_relu( \n\ + device const float * src, \n\ + device float * dst, \n\ + uint gid[[thread_position_in_grid]]) { \n\ + dst[gid] = max(0.0f, src[gid]); \n\ +} \n\ + \n\ +kernel void kernel_soft_max( \n\ + device const float * src, \n\ + device float * dst, \n\ + uint gid[[thread_position_in_grid]]) { \n\ + float max = 0.0f; \n\ + for (int i = 0; i < k_digits; i++) { \n\ + max = MAX(max, src[i]); \n\ + } \n\ + float sum = 0.0f; \n\ + for (int i = 0; i < k_digits; i++) { \n\ + dst[i] = exp(src[i] - max); \n\ + sum += dst[i]; \n\ + } \n\ + for (int i = 0; i < k_digits; i++) { \n\ + dst[i] /= sum; \n\ + } \n\ +} \n\ +"; + +struct ggml_mtl_context * llama_mtl_init( + struct ggml_context * ctx_data, + struct ggml_context * ctx_eval, + struct ggml_context * ctx_work, + struct ggml_cgraph * gf) { + fprintf(stderr, "%s: allocating\n", __func__); + + struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context)); + + ctx->ctx_data = ctx_data; + ctx->ctx_eval = ctx_eval; + ctx->ctx_work = ctx_work; + + ctx->device = MTLCreateSystemDefaultDevice(); + ctx->queue = [ctx->device newCommandQueue]; + + // determine if we can use MPS + if (MPSSupportsMTLDevice(ctx->device)) { + fprintf(stderr, "%s: using MPS\n", __func__); + } else { + fprintf(stderr, "%s: not using MPS\n", __func__); + GGML_ASSERT(false && "MPS not supported"); + } + + // compile from source string and show compile log + { + NSError * error = nil; + ctx->library = [ctx->device newLibraryWithSource:msl_library_llama options:nil error:&error]; + if (error) { + fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + exit(1); + } + } + + // load kernels + { + const int k_digits = 123; + + MTLFunctionConstantValues * constants = [MTLFunctionConstantValues new]; + [constants setConstantValue:&k_digits type:MTLDataTypeInt withName:@"k_digits"]; + + ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"]; + ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil]; + fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, ctx->pipeline_add); + + ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; + ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; + fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, ctx->pipeline_relu); + + ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil]; + 
ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil]; + fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, ctx->pipeline_soft_max); + } + + // MTLBuffer approach + + // pin ctx_data memory to GPU + // use MTLStorageModeShared to allow us to initialize the weights from the CPU + // TODO: how to use MTLStorageModeManaged? + // TODO: see if we can avoid this copy somehow + { + const void * mem_buffer = ggml_get_mem_buffer(ctx_data); + const size_t mem_size = ggml_get_mem_size(ctx_data); + + ctx->buffer_data = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared]; + + fprintf(stderr, "%s: allocated data buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0); + } + + // pin ctx_eval memory to GPU + // this buffer will be used for the intermediate results of the evaluation + { + const size_t mem_size = ggml_get_mem_size(ctx_eval); + + ctx->buffer_eval = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModePrivate]; + + fprintf(stderr, "%s: allocated eval buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0); + } + + // allocate buffer for result extraction + { + const size_t mem_size = ggml_nbytes(gf->nodes[gf->n_nodes - 1]); + + ctx->out = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; + + fprintf(stderr, "%s: allocated out buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0); + } + + return ctx; +} + +void llama_mtl_free(struct ggml_mtl_context * ctx) { + fprintf(stderr, "%s: deallocating\n", __func__); + + free(ctx); +} + +// get data / eval buffer + offset +id llama_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) { + const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data); + const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval); + + const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval); + + const size_t t_size = ggml_nbytes(t); + const size_t t_offs = is_data ? 
offs_data : offs_eval; + + id result; + + if (is_data) { + fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = ctx->buffer_data; + } else { + fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = ctx->buffer_eval; + } + + if (result == nil) { + fprintf(stderr, "%s: error: buffer is nil\n", __func__); + GGML_ASSERT(false); + } + + if (offs != nil) { + *offs = t_offs; + } + + return result; +} + +int llama_mtl_eval( + struct ggml_mtl_context * ctx, + struct ggml_cgraph * gf) { + fprintf(stderr, "%s: evaluating\n", __func__); + + id command_buffer = [ctx->queue commandBuffer]; + id encoder = nil; + + size_t offs_src0; + size_t offs_src1; + size_t offs_dst; + + // copy the input data to the GPU + { + struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "input"); + + id id_dst = llama_mtl_get_buffer(ctx, inp, &offs_src0); + + memcpy(id_dst.contents + offs_src0, inp->data, ggml_nbytes(inp)); + } + + for (int i = 0; i < gf->n_nodes; ++i) { + fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + + switch (gf->nodes[i]->op) { + case GGML_OP_ADD: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_src1 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); + id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + [encoder setComputePipelineState:ctx->pipeline_add]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + + const int64_t n = ggml_nelements(gf->nodes[i]); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_RELU: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + [encoder setComputePipelineState:ctx->pipeline_relu]; + [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(gf->nodes[i]); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SOFT_MAX: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + [encoder setComputePipelineState:ctx->pipeline_soft_max]; + [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_MUL_MAT: + { + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + + // use MPSMatrixMultiplication + id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_src1 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); + id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + const int64_t ncols0 = gf->nodes[i]->src0->ne[0]; + const int64_t nrows0 = gf->nodes[i]->src0->ne[1]; + + const int64_t ncols1 = gf->nodes[i]->src1->ne[0]; + const 
int64_t nrows1 = gf->nodes[i]->src1->ne[1]; + + const int64_t ncols2 = gf->nodes[i]->ne[0]; + const int64_t nrows2 = gf->nodes[i]->ne[1]; + + GGML_ASSERT(ncols0 == ncols1); + + MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor + matrixDescriptorWithRows:nrows0 columns:ncols0 rowBytes:gf->nodes[i]->src0->nb[1] dataType:MPSDataTypeFloat32]; + MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor + matrixDescriptorWithRows:nrows1 columns:ncols1 rowBytes:gf->nodes[i]->src1->nb[1] dataType:MPSDataTypeFloat32]; + MPSMatrixDescriptor * desc2 = [MPSMatrixDescriptor + matrixDescriptorWithRows:nrows2 columns:ncols2 rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32]; + + MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0 descriptor:desc0]; + MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1 descriptor:desc1]; + MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst descriptor:desc2]; + + MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc] initWithDevice:ctx->device + transposeLeft:false transposeRight:true resultRows:nrows1 resultColumns:nrows0 interiorColumns:ncols0 alpha:1.0 beta:0.0]; + + [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst]; + } break; + default: + fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + GGML_ASSERT(false); + return -1; + } + } + + // extract results from the GPU + { + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + + struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1]; + + id id_src = llama_mtl_get_buffer(ctx, out, &offs_src0); + id id_dst = ctx->out; + + id encoder_blit = [command_buffer blitCommandEncoder]; + [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(out)]; + [encoder_blit endEncoding]; + } + + [command_buffer commit]; + [command_buffer waitUntilCompleted]; + + { + const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime]; + fprintf(stderr, "%s: time elapsed = %f\n", __func__, time_elapsed); + } + + // TODO + const float * logits = ctx->out.contents; + + return 0; +} From a792cbd0fc74bb214c8fb3c339e0e92ff6969898 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 29 May 2023 21:28:59 +0300 Subject: [PATCH 04/49] mtl : no need for mtl-export tool, add cli arg for main instead --- examples/common.cpp | 3 +++ examples/common.h | 1 + examples/main/main.cpp | 7 +++++++ examples/mtl/CMakeLists.txt | 9 --------- examples/mtl/mtl-export.cpp | 25 ------------------------- 5 files changed, 11 insertions(+), 34 deletions(-) delete mode 100644 examples/mtl/mtl-export.cpp diff --git a/examples/common.cpp b/examples/common.cpp index 32247cef77f59..b5810f28f4901 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -299,6 +299,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.use_mmap = false; } else if (arg == "--mtest") { params.mem_test = true; + } else if (arg == "--export") { + params.export_cgraph = true; } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { @@ -438,6 +440,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " number of layers to store in VRAM\n"); #endif fprintf(stderr, " --mtest compute maximum memory usage\n"); + fprintf(stderr, " --export export the computation 
graph to 'llama.ggml'\n"); fprintf(stderr, " --verbose-prompt print prompt before generation\n"); fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); diff --git a/examples/common.h b/examples/common.h index fea9aa81a355a..66bdeb5e9287d 100644 --- a/examples/common.h +++ b/examples/common.h @@ -71,6 +71,7 @@ struct gpt_params { bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool mem_test = false; // compute maximum memory usage + bool export_cgraph = false; // export the computation graph bool verbose_prompt = false; // print prompt tokens before generation }; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 6131f5b467304..552f8b38d6042 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -134,6 +134,13 @@ int main(int argc, char ** argv) { return 0; } + // export the cgraph and exit + if (params.export_cgraph) { + llama_eval_export(ctx, "llama.ggml"); + llama_free(ctx); + + return 0; + } std::string path_session = params.path_prompt_cache; std::vector session_tokens; diff --git a/examples/mtl/CMakeLists.txt b/examples/mtl/CMakeLists.txt index a8923405f4c86..c532a5582e2ea 100644 --- a/examples/mtl/CMakeLists.txt +++ b/examples/mtl/CMakeLists.txt @@ -1,12 +1,3 @@ -set(TARGET mtl-export) -add_executable(${TARGET} mtl-export.cpp) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) - -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() - if (APPLE) # # mtl diff --git a/examples/mtl/mtl-export.cpp b/examples/mtl/mtl-export.cpp deleted file mode 100644 index 7872182a111bb..0000000000000 --- a/examples/mtl/mtl-export.cpp +++ /dev/null @@ -1,25 +0,0 @@ -#include "common.h" -#include "llama.h" - -int main(int argc, char ** argv) { - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - return 1; - } - - llama_init_backend(); - - llama_context * ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return 1; - } - - llama_eval_export(ctx, "llama.ggml"); - - llama_print_timings(ctx); - llama_free(ctx); - - return 0; -} From 897d6d8e8ff94f9dbd4d02a1fb4e13a43b2f9530 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 29 May 2023 21:40:05 +0300 Subject: [PATCH 05/49] mtl : export just a small part of the graph for now to make it easier --- examples/mtl/mtl.m | 12 ++++++------ llama.cpp | 20 +++++++++++++++++--- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 58f1f0371e6bf..47bbdb4ad0183 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -117,15 +117,15 @@ kernel void kernel_soft_max( ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"]; ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil]; - fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, ctx->pipeline_add); + fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, (void *) ctx->pipeline_add); ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; - fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, ctx->pipeline_relu); + fprintf(stderr, "%s: loaded 
kernel_relu: %p\n", __func__, (void *) ctx->pipeline_relu); ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil]; ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil]; - fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, ctx->pipeline_soft_max); + fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, (void *) ctx->pipeline_soft_max); } // MTLBuffer approach @@ -217,11 +217,11 @@ int llama_mtl_eval( // copy the input data to the GPU { - struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "input"); + struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd"); - id id_dst = llama_mtl_get_buffer(ctx, inp, &offs_src0); + id id_dst = llama_mtl_get_buffer(ctx, embd, &offs_src0); - memcpy(id_dst.contents + offs_src0, inp->data, ggml_nbytes(inp)); + memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd)); } for (int i = 0; i < gf->n_nodes; ++i) { diff --git a/llama.cpp b/llama.cpp index 9dccf0ed15362..e6d544615f3be 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1243,6 +1243,10 @@ static bool llama_eval_internal( ggml_cgraph gf = {}; gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + // TODO: TMP !!! + ggml_cgraph gf_export = {}; + gf_export.n_threads = 1; + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); ggml_set_name(embd, "embd"); memcpy(embd->data, tokens, N*ggml_element_size(embd)); @@ -1264,6 +1268,11 @@ static bool llama_eval_internal( cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm); } + // TODO: TMP !!!! + if (il == 0) { + ggml_set_name(cur, "mtl-check"); + } + // self-attention { // compute Q and K and RoPE them @@ -1420,12 +1429,17 @@ static bool llama_eval_internal( // logits -> probs //inpL = ggml_soft_max_inplace(ctx0, inpL); + // TODO: TMP !!!!!!!!!!!!!!!!!!!! 
// run the computation - ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + //ggml_build_forward_expand(&gf, inpL); + //ggml_graph_compute (ctx0, &gf); + + // lets export a smaller graph to get things rolling -- baby steps first + ggml_build_forward_expand(&gf_export, ggml_get_tensor(ctx0, "mtl-check")); if (cgraph_fname) { - ggml_graph_export(&gf, cgraph_fname); + //ggml_graph_export(&gf, cgraph_fname); + ggml_graph_export(&gf_export, cgraph_fname); } #ifdef GGML_PERF From 248a8c3379f211373b7a74d5f7eb2127cbce5d18 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 29 May 2023 22:26:40 +0300 Subject: [PATCH 06/49] mtl : move MSL code into separate file for easy editing --- examples/mtl/CMakeLists.txt | 19 +++++++-- examples/mtl/mtl.m | 79 ++++++++++++++++++------------------- examples/mtl/mtl.metal | 40 +++++++++++++++++++ 3 files changed, 94 insertions(+), 44 deletions(-) create mode 100644 examples/mtl/mtl.metal diff --git a/examples/mtl/CMakeLists.txt b/examples/mtl/CMakeLists.txt index c532a5582e2ea..1de83a1b62fd4 100644 --- a/examples/mtl/CMakeLists.txt +++ b/examples/mtl/CMakeLists.txt @@ -2,9 +2,9 @@ if (APPLE) # # mtl - find_library(FOUNDATION_LIBRARY Foundation REQUIRED) - find_library(METAL_FRAMEWORK Metal REQUIRED) - find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED) set(TEST_TARGET mtl) @@ -16,5 +16,18 @@ if (APPLE) ${METALKIT_FRAMEWORK} ${METALPERFORMANCE_FRAMEWORK} ) + + # TODO: temporary until the kernels are ready + # custom command to build mtl.metal into a library + # depends on the mtl.metal file + add_custom_target(mtl.metallib-tmp ALL DEPENDS ${CMAKE_BINARY_DIR}/mtl.metallib) + + add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/mtl.metallib + COMMAND xcrun -sdk macosx metal -c ${CMAKE_CURRENT_SOURCE_DIR}/mtl.metal -o ${CMAKE_BINARY_DIR}/mtl.air + COMMAND xcrun -sdk macosx metallib ${CMAKE_BINARY_DIR}/mtl.air -o ${CMAKE_BINARY_DIR}/mtl.metallib + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/mtl.metal + COMMENT "Building mtl.metallib" + ) endif() diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 47bbdb4ad0183..86e0b0c784f91 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -32,47 +32,9 @@ }; // MSL code -NSString * const msl_library_llama = @"\ -#include \n\ -using namespace metal; \n\ - \n\ -#define MAX(x, y) ((x) > (y) ? 
(x) : (y)) \n\ - \n\ -constant int k_digits [[function_constant(0)]]; \n\ - \n\ -kernel void kernel_add( \n\ - device const float * src0, \n\ - device const float * src1, \n\ - device float * dst, \n\ - uint gid[[thread_position_in_grid]]) { \n\ - dst[gid] = src0[gid] + src1[gid]; \n\ -} \n\ - \n\ -kernel void kernel_relu( \n\ - device const float * src, \n\ - device float * dst, \n\ - uint gid[[thread_position_in_grid]]) { \n\ - dst[gid] = max(0.0f, src[gid]); \n\ -} \n\ - \n\ -kernel void kernel_soft_max( \n\ - device const float * src, \n\ - device float * dst, \n\ - uint gid[[thread_position_in_grid]]) { \n\ - float max = 0.0f; \n\ - for (int i = 0; i < k_digits; i++) { \n\ - max = MAX(max, src[i]); \n\ - } \n\ - float sum = 0.0f; \n\ - for (int i = 0; i < k_digits; i++) { \n\ - dst[i] = exp(src[i] - max); \n\ - sum += dst[i]; \n\ - } \n\ - for (int i = 0; i < k_digits; i++) { \n\ - dst[i] /= sum; \n\ - } \n\ -} \n\ -"; +// TODO: move the contents here when ready +// for now it is easier to work in a separate file +NSString * const msl_library_llama = @"see mtl.metal"; struct ggml_mtl_context * llama_mtl_init( struct ggml_context * ctx_data, @@ -98,15 +60,50 @@ kernel void kernel_soft_max( GGML_ASSERT(false && "MPS not supported"); } +#if 0 // compile from source string and show compile log { NSError * error = nil; + ctx->library = [ctx->device newLibraryWithSource:msl_library_llama options:nil error:&error]; if (error) { fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); exit(1); } } +#elif 0 + // this does not work !?!?! + + // load library from "mtl.metallib" + { + NSError * error = nil; + + NSString * path = [[NSBundle mainBundle] pathForResource:@"./mtl" ofType:@"metallib"]; + ctx->library = [ctx->device newLibraryWithFile:path error:&error]; + if (error) { + fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + exit(1); + } + } +#else + // read the source from "../examples/mtl/mtl.metal" into a string and use newLibraryWithSource + { + NSError * error = nil; + + NSString * path = [[NSBundle mainBundle] pathForResource:@"../examples/mtl/mtl" ofType:@"metal"]; + NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; + if (error) { + fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + exit(1); + } + + ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; + if (error) { + fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + exit(1); + } + } +#endif // load kernels { diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal new file mode 100644 index 0000000000000..e9597336c2a80 --- /dev/null +++ b/examples/mtl/mtl.metal @@ -0,0 +1,40 @@ +#include + +using namespace metal; + +#define MAX(x, y) ((x) > (y) ? 
(x) : (y)) + +constant int k_digits [[function_constant(0)]]; + +kernel void kernel_add( + device const float * src0, + device const float * src1, + device float * dst, + uint gid[[thread_position_in_grid]]) { + dst[gid] = src0[gid] + src1[gid]; +} + +kernel void kernel_relu( + device const float * src, + device float * dst, + uint gid[[thread_position_in_grid]]) { + dst[gid] = max(0.0f, src[gid]); +} + +kernel void kernel_soft_max( + device const float * src, + device float * dst, + uint gid[[thread_position_in_grid]]) { + float max = 0.0f; + for (int i = 0; i < k_digits; i++) { + max = MAX(max, src[i]); + } + float sum = 0.0f; + for (int i = 0; i < k_digits; i++) { + dst[i] = exp(src[i] - max); + sum += dst[i]; + } + for (int i = 0; i < k_digits; i++) { + dst[i] /= sum; + } +} From a8fd9dc12870c0c828c200f599e005ee1989148f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 29 May 2023 23:12:19 +0300 Subject: [PATCH 07/49] mtl : initial get_rows_q4_0 kernel --- examples/mtl/mtl.m | 41 ++++++++++++++++++++++++++--- examples/mtl/mtl.metal | 59 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 90 insertions(+), 10 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 86e0b0c784f91..822edec207238 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -29,6 +29,9 @@ id function_soft_max; id pipeline_soft_max; + + id function_get_rows_q4_0; + id pipeline_get_rows_q4_0; }; // MSL code @@ -90,7 +93,7 @@ { NSError * error = nil; - NSString * path = [[NSBundle mainBundle] pathForResource:@"../examples/mtl/mtl" ofType:@"metal"]; + NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/mtl/mtl" ofType:@"metal"]; NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; if (error) { fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); @@ -107,10 +110,7 @@ // load kernels { - const int k_digits = 123; - MTLFunctionConstantValues * constants = [MTLFunctionConstantValues new]; - [constants setConstantValue:&k_digits type:MTLDataTypeInt withName:@"k_digits"]; ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"]; ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil]; @@ -123,6 +123,10 @@ ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil]; ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil]; fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, (void *) ctx->pipeline_soft_max); + + ctx->function_get_rows_q4_0 = [ctx->library newFunctionWithName:@"kernel_get_rows_q4_0"]; + ctx->pipeline_get_rows_q4_0 = [ctx->device newComputePipelineStateWithFunction:ctx->function_get_rows_q4_0 error:nil]; + fprintf(stderr, "%s: loaded kernel_get_rows_q4_0: %p\n", __func__, (void *) ctx->pipeline_get_rows_q4_0); } // MTLBuffer approach @@ -315,6 +319,35 @@ int llama_mtl_eval( [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst]; } break; + case GGML_OP_GET_ROWS: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_src1 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); + id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + switch (gf->nodes[i]->src0->type) { + case GGML_TYPE_Q4_0: [encoder 
setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; + default: { + // not implemented + fprintf(stderr, "%s: node %3d, op = %8s, type = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op), ggml_type_name(gf->nodes[i]->src0->type)); + } + } + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&(gf->nodes[i]->src0->ne[0]) length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&(gf->nodes[i]->src0->nb[1]) length:sizeof(uint64_t) atIndex:4]; + [encoder setBytes:&(gf->nodes[i]->nb[1]) length:sizeof(uint64_t) atIndex:5]; + + const int64_t n = ggml_nelements(gf->nodes[i]->src1); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); GGML_ASSERT(false); diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index e9597336c2a80..33370fd6a6c01 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -4,7 +4,35 @@ using namespace metal; #define MAX(x, y) ((x) > (y) ? (x) : (y)) -constant int k_digits [[function_constant(0)]]; +#define QK4_0 32 +#define QR4_0 2 +typedef struct { + half d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; + +static void dequantize_row_q4_0(device const block_q4_0 * x, device float * y, int k) { + const int qk = QK4_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const half d = x[i].d; + + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0x0F) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; + } + } +} + +// TODO: not needed +constant int nsoftmax [[function_constant(0)]]; kernel void kernel_add( device const float * src0, @@ -21,20 +49,39 @@ kernel void kernel_relu( dst[gid] = max(0.0f, src[gid]); } +// TODO: broken kernel void kernel_soft_max( device const float * src, - device float * dst, - uint gid[[thread_position_in_grid]]) { + device float * dst) { float max = 0.0f; - for (int i = 0; i < k_digits; i++) { + for (int i = 0; i < nsoftmax; i++) { max = MAX(max, src[i]); } float sum = 0.0f; - for (int i = 0; i < k_digits; i++) { + for (int i = 0; i < nsoftmax; i++) { dst[i] = exp(src[i] - max); sum += dst[i]; } - for (int i = 0; i < k_digits; i++) { + for (int i = 0; i < nsoftmax; i++) { dst[i] /= sum; } } + +// TODO: not tested +kernel void kernel_get_rows_q4_0( + device const void * src0, + device const int * src1, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant uint64_t & nb1, + uint gid[[thread_position_in_grid]]) { + device const block_q4_0 * src = (device const block_q4_0 *)src0; + + const int i = gid; + const int r = ((device int32_t *) src1)[i]; + + dequantize_row_q4_0( + (device const block_q4_0 *) ((device char *) src0 + r*nb01), + (device float *) ((device char *) dst + i*nb1), ne00); +} From 794704e409a17f98755ff2ae83f106e3e2d069b7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 30 May 2023 18:41:21 +0300 Subject: [PATCH 08/49] mtl : confirmed get_rows_q4_0 is working correctly --- examples/mtl/mtl.cpp | 13 +++++++++++++ examples/mtl/mtl.m | 23 +++++++++++++++++++++++ examples/mtl/mtl.metal | 1 - llama.cpp | 27 ++++++++++++++++++++++++--- 4 files changed, 60 insertions(+), 4 deletions(-) diff --git a/examples/mtl/mtl.cpp 
b/examples/mtl/mtl.cpp index 68e828d4eb616..e15a1b02e36ca 100644 --- a/examples/mtl/mtl.cpp +++ b/examples/mtl/mtl.cpp @@ -5,6 +5,8 @@ #include #include +#include // tmp + int main(int argc, char ** argv) { ggml_time_init(); @@ -37,6 +39,17 @@ int main(int argc, char ** argv) { // this allocates all Metal resources and memory buffers auto * ctx_mtl = llama_mtl_init(ctx_data, ctx_eval, ctx_work, &gf); + // TODO: tmp to match the input used when creating the cgraph + { + const int n_ctx = 128; + const int n_batch = 32; + + const std::vector tmp(n_batch, 1); // BOS + + struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd"); + memcpy(input->data, tmp.data(), tmp.size() * sizeof(int)); + } + // the actual inference happens here llama_mtl_eval(ctx_mtl, &gf); diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 822edec207238..b3f21f34708f3 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -357,6 +357,8 @@ int llama_mtl_eval( // extract results from the GPU { + fprintf(stderr, "%s: extract results from the GPU\n", __func__); + if (encoder != nil) { [encoder endEncoding]; encoder = nil; @@ -367,6 +369,8 @@ int llama_mtl_eval( id id_src = llama_mtl_get_buffer(ctx, out, &offs_src0); id id_dst = ctx->out; + printf("XXXXX n = %d\n", ggml_nelements(out)); + id encoder_blit = [command_buffer blitCommandEncoder]; [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(out)]; [encoder_blit endEncoding]; @@ -383,5 +387,24 @@ int llama_mtl_eval( // TODO const float * logits = ctx->out.contents; + { + struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check"); + float * data = (float *) ctx->out.contents; + printf("data: "); + int n = t->ne[0]; + if (n > 10) { + n = 10; + } + for (int i = 0; i < n; i++) { + printf("%f ", data[i]); + } + printf("\n"); + double sum = 0.0; + for (int i = 0; i < ggml_nelements(t); i++) { + sum += data[i]; + } + printf("sum: %f\n", sum); + } + return 0; } diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index 33370fd6a6c01..01ffec018a839 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -67,7 +67,6 @@ kernel void kernel_soft_max( } } -// TODO: not tested kernel void kernel_get_rows_q4_0( device const void * src0, device const int * src1, diff --git a/llama.cpp b/llama.cpp index e6d544615f3be..c5ea19ac9df17 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1252,6 +1252,7 @@ static bool llama_eval_internal( memcpy(embd->data, tokens, N*ggml_element_size(embd)); struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); + ggml_set_name(inpL, "mtl-check"); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -1269,9 +1270,9 @@ static bool llama_eval_internal( } // TODO: TMP !!!! 
- if (il == 0) { - ggml_set_name(cur, "mtl-check"); - } + //if (il == 0) { + // ggml_set_name(cur, "mtl-check"); + //} // self-attention { @@ -1437,6 +1438,26 @@ static bool llama_eval_internal( // lets export a smaller graph to get things rolling -- baby steps first ggml_build_forward_expand(&gf_export, ggml_get_tensor(ctx0, "mtl-check")); + // print + { + auto print_t = [&](struct ggml_tensor * t) { + float * data = (float *)t->data; + printf("data: "); + for (int i = 0; i < std::min((int) t->ne[0], 10); i++) { + printf("%f ", data[i]); + } + printf("\n"); + double sum = 0.0; + for (int i = 0; i < ggml_nelements(t); i++) { + sum += data[i]; + } + printf("sum: %f\n", sum); + }; + + ggml_graph_compute(ctx0, &gf_export); + print_t(ggml_get_tensor(ctx0, "mtl-check")); + } + if (cgraph_fname) { //ggml_graph_export(&gf, cgraph_fname); ggml_graph_export(&gf_export, cgraph_fname); From 72256ebd2ba04198938c3439d1a939cbc5a833e0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 30 May 2023 19:03:04 +0300 Subject: [PATCH 09/49] mtl : add rms_norm kernel + confirm working --- examples/mtl/mtl.m | 31 +++++++++++++++++++++++++++++++ examples/mtl/mtl.metal | 35 ++++++++++++++++++++++++++++------- ggml.c | 4 ++-- ggml.h | 1 + llama.cpp | 10 ++++------ 5 files changed, 66 insertions(+), 15 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index b3f21f34708f3..ade0719baf6c6 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -32,6 +32,9 @@ id function_get_rows_q4_0; id pipeline_get_rows_q4_0; + + id function_rms_norm; + id pipeline_rms_norm; }; // MSL code @@ -127,6 +130,10 @@ ctx->function_get_rows_q4_0 = [ctx->library newFunctionWithName:@"kernel_get_rows_q4_0"]; ctx->pipeline_get_rows_q4_0 = [ctx->device newComputePipelineStateWithFunction:ctx->function_get_rows_q4_0 error:nil]; fprintf(stderr, "%s: loaded kernel_get_rows_q4_0: %p\n", __func__, (void *) ctx->pipeline_get_rows_q4_0); + + ctx->function_rms_norm = [ctx->library newFunctionWithName:@"kernel_rms_norm"]; + ctx->pipeline_rms_norm = [ctx->device newComputePipelineStateWithFunction:ctx->function_rms_norm error:nil]; + fprintf(stderr, "%s: loaded kernel_rms_norm: %p\n", __func__, (void *) ctx->pipeline_rms_norm); } // MTLBuffer approach @@ -348,6 +355,30 @@ int llama_mtl_eval( [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case GGML_OP_RMS_NORM: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + const int64_t ne00 = gf->nodes[i]->src0->ne[0]; + const uint64_t nb01 = gf->nodes[i]->src0->nb[1]; + const float eps = 1e-6f; + + [encoder setComputePipelineState:ctx->pipeline_rms_norm]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + + const int64_t nrows = ggml_nrows(gf->nodes[i]->src0); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); GGML_ASSERT(false); diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index 01ffec018a839..6a736446b863a 100644 --- 
a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -43,23 +43,23 @@ kernel void kernel_add( } kernel void kernel_relu( - device const float * src, + device const float * src0, device float * dst, uint gid[[thread_position_in_grid]]) { - dst[gid] = max(0.0f, src[gid]); + dst[gid] = max(0.0f, src0[gid]); } // TODO: broken kernel void kernel_soft_max( - device const float * src, + device const float * src0, device float * dst) { float max = 0.0f; for (int i = 0; i < nsoftmax; i++) { - max = MAX(max, src[i]); + max = MAX(max, src0[i]); } float sum = 0.0f; for (int i = 0; i < nsoftmax; i++) { - dst[i] = exp(src[i] - max); + dst[i] = exp(src0[i] - max); sum += dst[i]; } for (int i = 0; i < nsoftmax; i++) { @@ -75,8 +75,6 @@ kernel void kernel_get_rows_q4_0( constant uint64_t & nb01, constant uint64_t & nb1, uint gid[[thread_position_in_grid]]) { - device const block_q4_0 * src = (device const block_q4_0 *)src0; - const int i = gid; const int r = ((device int32_t *) src1)[i]; @@ -84,3 +82,26 @@ kernel void kernel_get_rows_q4_0( (device const block_q4_0 *) ((device char *) src0 + r*nb01), (device float *) ((device char *) dst + i*nb1), ne00); } + +kernel void kernel_rms_norm( + device const void * src0, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant float & eps, + uint gid[[thread_position_in_grid]]) { + device const float * x = (device const float *) ((device const char *) src0 + gid*nb01); + + float sum = 0.0f; + for (int i00 = 0; i00 < ne00; i00++) { + sum += x[i00] * x[i00]; + } + + const float mean = sum/ne00; + const float scale = 1.0f/sqrt(mean + eps); + + device float * y = dst + gid*ne00; + for (int i00 = 0; i00 < ne00; i00++) { + y[i00] = x[i00] * scale; + } +} diff --git a/ggml.c b/ggml.c index 4cd0d72114a60..823d904eee870 100644 --- a/ggml.c +++ b/ggml.c @@ -3723,7 +3723,7 @@ int64_t ggml_nelements(const struct ggml_tensor * tensor) { return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -int ggml_nrows(const struct ggml_tensor * tensor) { +int64_t ggml_nrows(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; @@ -9245,7 +9245,7 @@ static void ggml_compute_forward_rms_norm_f32( sum += (ggml_float)(x[i00] * x[i00]); } - float mean = sum/ne00; + const float mean = sum/ne00; float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); diff --git a/ggml.h b/ggml.h index 60c0ad8bfa1c0..1f033b4920af6 100644 --- a/ggml.h +++ b/ggml.h @@ -425,6 +425,7 @@ extern "C" { GGML_API void ggml_print_objects(const struct ggml_context * ctx); GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor); + GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); GGML_API int ggml_blck_size (enum ggml_type type); diff --git a/llama.cpp b/llama.cpp index c5ea19ac9df17..3ee170e4c84ff 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1252,7 +1252,6 @@ static bool llama_eval_internal( memcpy(embd->data, tokens, N*ggml_element_size(embd)); struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); - ggml_set_name(inpL, "mtl-check"); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -1264,16 +1263,15 @@ static bool llama_eval_internal( // norm { cur = ggml_rms_norm(ctx0, inpL); + // TODO: TMP !!!! 
+ if (il == 0) { + ggml_set_name(cur, "mtl-check"); + } // cur = cur*attention_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm); } - // TODO: TMP !!!! - //if (il == 0) { - // ggml_set_name(cur, "mtl-check"); - //} - // self-attention { // compute Q and K and RoPE them From 64afc0b53af82adb3a81e02ecf7574c51356664d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 30 May 2023 19:15:38 +0300 Subject: [PATCH 10/49] mtl : add mul kernel + confirm working --- examples/mtl/mtl.m | 31 ++++++++++++++++++++++++++++++- examples/mtl/mtl.metal | 11 +++++++++++ llama.cpp | 6 +++--- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index ade0719baf6c6..f13c0077629cd 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -24,6 +24,9 @@ id function_add; id pipeline_add; + id function_mul; + id pipeline_mul; + id function_relu; id pipeline_relu; @@ -119,6 +122,10 @@ ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil]; fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, (void *) ctx->pipeline_add); + ctx->function_mul = [ctx->library newFunctionWithName:@"kernel_mul"]; + ctx->pipeline_mul = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul error:nil]; + fprintf(stderr, "%s: loaded kernel_mul: %p\n", __func__, (void *) ctx->pipeline_mul); + ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, (void *) ctx->pipeline_relu); @@ -253,6 +260,28 @@ int llama_mtl_eval( const int64_t n = ggml_nelements(gf->nodes[i]); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_MUL: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_src1 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); + id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + const int64_t ne00 = gf->nodes[i]->src0->ne[0]; + + [encoder setComputePipelineState:ctx->pipeline_mul]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + + const int64_t n = ggml_nelements(gf->nodes[i]); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_RELU: @@ -373,7 +402,7 @@ int llama_mtl_eval( [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; - [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; const int64_t nrows = ggml_nrows(gf->nodes[i]->src0); diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index 6a736446b863a..78dfbe011da3f 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -42,6 +42,17 @@ kernel void kernel_add( dst[gid] = src0[gid] + src1[gid]; } +// assumption: src1 is a row +// broadcast src1 into src0 +kernel void kernel_mul( + device const float * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + uint gid[[thread_position_in_grid]]) 
{ + dst[gid] = src0[gid] * src1[gid % ne00]; +} + kernel void kernel_relu( device const float * src0, device float * dst, diff --git a/llama.cpp b/llama.cpp index 3ee170e4c84ff..3ddfeff017c09 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1263,13 +1263,13 @@ static bool llama_eval_internal( // norm { cur = ggml_rms_norm(ctx0, inpL); + + // cur = cur*attention_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm); // TODO: TMP !!!! if (il == 0) { ggml_set_name(cur, "mtl-check"); } - - // cur = cur*attention_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm); } // self-attention From 2a24994badb709c5833c1126974af4d3677a4f06 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 30 May 2023 22:02:54 +0300 Subject: [PATCH 11/49] mtl : initial mul_mat Q4 kernel (wrong results) --- examples/mtl/mtl.m | 48 ++++++++++++++++++- examples/mtl/mtl.metal | 102 +++++++++++++++++++++++++++++++++++------ llama.cpp | 13 ++++-- 3 files changed, 144 insertions(+), 19 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index f13c0077629cd..bd424c23d8dc1 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -38,6 +38,9 @@ id function_rms_norm; id pipeline_rms_norm; + + id function_mul_mat_q4_0; + id pipeline_mul_mat_q4_0; }; // MSL code @@ -141,6 +144,10 @@ ctx->function_rms_norm = [ctx->library newFunctionWithName:@"kernel_rms_norm"]; ctx->pipeline_rms_norm = [ctx->device newComputePipelineStateWithFunction:ctx->function_rms_norm error:nil]; fprintf(stderr, "%s: loaded kernel_rms_norm: %p\n", __func__, (void *) ctx->pipeline_rms_norm); + + ctx->function_mul_mat_q4_0 = [ctx->library newFunctionWithName:@"kernel_mul_mat_q4_0"]; + ctx->pipeline_mul_mat_q4_0 = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul_mat_q4_0 error:nil]; + fprintf(stderr, "%s: loaded kernel_mul_mat_q4_0: %p\n", __func__, (void *) ctx->pipeline_mul_mat_q4_0); } // MTLBuffer approach @@ -317,7 +324,9 @@ int llama_mtl_eval( [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_MUL_MAT: - { + if (gf->nodes[i]->src0->type == GGML_TYPE_F32) { + // for F32 x F32 we use MPS + if (encoder != nil) { [encoder endEncoding]; encoder = nil; @@ -354,6 +363,43 @@ int llama_mtl_eval( transposeLeft:false transposeRight:true resultRows:nrows1 resultColumns:nrows0 interiorColumns:ncols0 alpha:1.0 beta:0.0]; [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst]; + } else { + // for Q4 x F32 we use custom kernel + + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + GGML_ASSERT(gf->nodes[i]->src0->ne[2] == 1); + GGML_ASSERT(gf->nodes[i]->src1->ne[2] == 1); + + id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_src1 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); + id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + const int64_t ncols0 = gf->nodes[i]->src0->ne[0]; + const int64_t nrows0 = gf->nodes[i]->src0->ne[1]; + + const int64_t ncols1 = gf->nodes[i]->src1->ne[0]; + const int64_t nrows1 = gf->nodes[i]->src1->ne[1]; + + const int64_t ncols = gf->nodes[i]->ne[0]; + const int64_t nrows = gf->nodes[i]->ne[1]; + + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ncols0 
length:sizeof(ncols0) atIndex:3]; + [encoder setBytes:&nrows0 length:sizeof(nrows0) atIndex:4]; + [encoder setBytes:&ncols1 length:sizeof(ncols1) atIndex:5]; + [encoder setBytes:&nrows1 length:sizeof(nrows1) atIndex:6]; + [encoder setBytes:&ncols length:sizeof(ncols) atIndex:7]; + [encoder setBytes:&nrows length:sizeof(nrows) atIndex:8]; + + printf("mul_mat: %lldx%lld * %lldx%lld -> %lldx%lld\n", ncols0, nrows0, ncols1, nrows1, ncols, nrows); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows0, nrows1, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; } break; case GGML_OP_GET_ROWS: { diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index 78dfbe011da3f..f67d24f7108dd 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -7,8 +7,8 @@ using namespace metal; #define QK4_0 32 #define QR4_0 2 typedef struct { - half d; // delta - uint8_t qs[QK4_0 / 2]; // nibbles / quants + half d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants } block_q4_0; static void dequantize_row_q4_0(device const block_q4_0 * x, device float * y, int k) { @@ -38,8 +38,8 @@ kernel void kernel_add( device const float * src0, device const float * src1, device float * dst, - uint gid[[thread_position_in_grid]]) { - dst[gid] = src0[gid] + src1[gid]; + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] + src1[tpig]; } // assumption: src1 is a row @@ -49,15 +49,15 @@ kernel void kernel_mul( device const float * src1, device float * dst, constant int64_t & ne00, - uint gid[[thread_position_in_grid]]) { - dst[gid] = src0[gid] * src1[gid % ne00]; + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] * src1[tpig % ne00]; } kernel void kernel_relu( device const float * src0, device float * dst, - uint gid[[thread_position_in_grid]]) { - dst[gid] = max(0.0f, src0[gid]); + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = max(0.0f, src0[tpig]); } // TODO: broken @@ -85,8 +85,8 @@ kernel void kernel_get_rows_q4_0( constant int64_t & ne00, constant uint64_t & nb01, constant uint64_t & nb1, - uint gid[[thread_position_in_grid]]) { - const int i = gid; + uint tpig[[thread_position_in_grid]]) { + const int i = tpig; const int r = ((device int32_t *) src1)[i]; dequantize_row_q4_0( @@ -100,8 +100,8 @@ kernel void kernel_rms_norm( constant int64_t & ne00, constant uint64_t & nb01, constant float & eps, - uint gid[[thread_position_in_grid]]) { - device const float * x = (device const float *) ((device const char *) src0 + gid*nb01); + uint tpig[[thread_position_in_grid]]) { + device const float * x = (device const float *) ((device const char *) src0 + tpig*nb01); float sum = 0.0f; for (int i00 = 0; i00 < ne00; i00++) { @@ -111,8 +111,84 @@ kernel void kernel_rms_norm( const float mean = sum/ne00; const float scale = 1.0f/sqrt(mean + eps); - device float * y = dst + gid*ne00; + device float * y = dst + tpig*ne00; for (int i00 = 0; i00 < ne00; i00++) { y[i00] = x[i00] * scale; } } + +kernel void kernel_mul_mat_q4_0( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne0, + constant int64_t & ne1, + uint2 tgpig[[threadgroup_position_in_grid]], + uint2 tpig[[thread_position_in_grid]], + uint2 tpitg[[thread_position_in_threadgroup]], + uint2 tptg[[threads_per_threadgroup]]) { + const int64_t r0 = tgpig.x; + const int64_t r1 = tgpig.y; + + const int qk = QK4_0; + const int nb = ne00/qk; + + device const block_q4_0 * x = (device 
const block_q4_0 *) (src0) + r0*nb; + device const float * y = (device const float *) (src1) + r1*ne10; + + threadgroup float sum[32]; // TODO: should be equal to threadgroup size + sum[tpitg.x] = 0.0f; + + for (int i = 0; i < nb; i += tptg.x) { + device const uint4 * x0p = (device const uint4 *) (x + i); + device const float4 * y0p = (device const float4 *) (y + i*qk); + + const uint4 x0 = *x0p; + + const uint4 x0l = x0 & uint4(0x0F0F0F0F); + const uint4 x0h = x0 >> 4; + + const int4 x0ls = as_type(x0l) - int4(8); + const int4 x0hs = as_type(x0h) - int4(8); + + thread const uchar * x0lsb = (thread const uchar *) &x0ls; + thread const uchar * x0hsb = (thread const uchar *) &x0hs; + + const float4 y00 = *(y0p + 0); + const float4 y01 = *(y0p + 1); + const float4 y02 = *(y0p + 2); + const float4 y03 = *(y0p + 3); + const float4 y04 = *(y0p + 4); + const float4 y05 = *(y0p + 5); + const float4 y06 = *(y0p + 6); + const float4 y07 = *(y0p + 7); + + const float d = (x + i)->d; + + sum[tpitg.x] += ( + x0lsb[ 0]*y00[0] + x0lsb[ 1]*y00[1] + x0lsb[ 2]*y00[2] + x0lsb[ 3]*y00[3] + + x0lsb[ 4]*y01[0] + x0lsb[ 5]*y01[1] + x0lsb[ 6]*y01[2] + x0lsb[ 7]*y01[3] + + x0lsb[ 8]*y02[0] + x0lsb[ 9]*y02[1] + x0lsb[10]*y02[2] + x0lsb[11]*y02[3] + + x0lsb[12]*y03[0] + x0lsb[13]*y03[1] + x0lsb[14]*y03[2] + x0lsb[15]*y03[3] + + x0hsb[ 0]*y04[0] + x0hsb[ 1]*y04[1] + x0hsb[ 2]*y04[2] + x0hsb[ 3]*y04[3] + + x0hsb[ 4]*y05[0] + x0hsb[ 5]*y05[1] + x0hsb[ 6]*y05[2] + x0hsb[ 7]*y05[3] + + x0hsb[ 8]*y06[0] + x0hsb[ 9]*y06[1] + x0hsb[10]*y06[2] + x0hsb[11]*y06[3] + + x0hsb[12]*y07[0] + x0hsb[13]*y07[1] + x0hsb[14]*y07[2] + x0hsb[15]*y07[3] + ) * d; + } + + // accumulate the sum from all threads in the threadgroup + threadgroup_barrier(mem_flags::mem_threadgroup); + for (uint i = tptg.x/2; i > 0; i /= 2) { + if (tpitg.x < i) { + sum[tpitg.x] += sum[tpitg.x + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + dst[r1*ne0 + r0] = sum[0]; +} diff --git a/llama.cpp b/llama.cpp index 3ddfeff017c09..caf74bfd17705 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1266,16 +1266,19 @@ static bool llama_eval_internal( // cur = cur*attention_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm); - // TODO: TMP !!!! - if (il == 0) { - ggml_set_name(cur, "mtl-check"); - } } // self-attention { + auto * x = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + // TODO: TMP !!!! 
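// note: the exported debug graph appears to be built only up to the tensor named "mtl-check"
// (see the gf_export expansion later in this function), so moving this marker forward one op
// at a time is how each newly ported Metal kernel gets compared against the CPU result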
+ if (il == 0) { + ggml_set_name(x, "mtl-check"); + } + // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + //struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, x, n_embd/n_head, n_head, N), n_past, n_rot, 0); struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); ggml_set_name(Qcur, "Qcur"); ggml_set_name(Kcur, "Kcur"); From 96d005225fdb7803b1e1465623b935c2c878d8fb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 30 May 2023 22:13:43 +0300 Subject: [PATCH 12/49] mtl : mul_mat fixes (still wrong) --- examples/mtl/mtl.m | 30 ++++++++++++++---------------- examples/mtl/mtl.metal | 31 ++++++++++++++----------------- 2 files changed, 28 insertions(+), 33 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index bd424c23d8dc1..8985a0d7453b5 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -377,29 +377,27 @@ int llama_mtl_eval( id id_src1 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); - const int64_t ncols0 = gf->nodes[i]->src0->ne[0]; - const int64_t nrows0 = gf->nodes[i]->src0->ne[1]; - - const int64_t ncols1 = gf->nodes[i]->src1->ne[0]; - const int64_t nrows1 = gf->nodes[i]->src1->ne[1]; - - const int64_t ncols = gf->nodes[i]->ne[0]; - const int64_t nrows = gf->nodes[i]->ne[1]; + const int64_t ne00 = gf->nodes[i]->src0->ne[0]; + const int64_t ne01 = gf->nodes[i]->src0->ne[1]; + const int64_t ne10 = gf->nodes[i]->src1->ne[0]; + const int64_t ne11 = gf->nodes[i]->src1->ne[1]; + const int64_t ne0 = gf->nodes[i]->ne[0]; + const int64_t ne1 = gf->nodes[i]->ne[1]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ncols0 length:sizeof(ncols0) atIndex:3]; - [encoder setBytes:&nrows0 length:sizeof(nrows0) atIndex:4]; - [encoder setBytes:&ncols1 length:sizeof(ncols1) atIndex:5]; - [encoder setBytes:&nrows1 length:sizeof(nrows1) atIndex:6]; - [encoder setBytes:&ncols length:sizeof(ncols) atIndex:7]; - [encoder setBytes:&nrows length:sizeof(nrows) atIndex:8]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:4]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:5]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:6]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:7]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:8]; - printf("mul_mat: %lldx%lld * %lldx%lld -> %lldx%lld\n", ncols0, nrows0, ncols1, nrows1, ncols, nrows); + printf("mul_mat: %lldx%lld * %lldx%lld -> %lldx%lld\n", ne00, ne01, ne10, ne11, ne0, ne1); - [encoder dispatchThreadgroups:MTLSizeMake(nrows0, nrows1, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; } break; case GGML_OP_GET_ROWS: { diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index f67d24f7108dd..348a432ab8028 100644 --- 
a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -144,19 +144,16 @@ kernel void kernel_mul_mat_q4_0( sum[tpitg.x] = 0.0f; for (int i = 0; i < nb; i += tptg.x) { - device const uint4 * x0p = (device const uint4 *) (x + i); + device const uint4 * x0p = (device const uint4 *) (x + i)->qs; device const float4 * y0p = (device const float4 *) (y + i*qk); const uint4 x0 = *x0p; - const uint4 x0l = x0 & uint4(0x0F0F0F0F); - const uint4 x0h = x0 >> 4; + const uint4 x0l = (x0 & uint4(0x0F0F0F0F)); + const uint4 x0h = (x0 & uint4(0xF0F0F0F0)) >> 4; - const int4 x0ls = as_type(x0l) - int4(8); - const int4 x0hs = as_type(x0h) - int4(8); - - thread const uchar * x0lsb = (thread const uchar *) &x0ls; - thread const uchar * x0hsb = (thread const uchar *) &x0hs; + thread const char * x0lsb = (thread const char *) &x0l; + thread const char * x0hsb = (thread const char *) &x0h; const float4 y00 = *(y0p + 0); const float4 y01 = *(y0p + 1); @@ -167,17 +164,17 @@ kernel void kernel_mul_mat_q4_0( const float4 y06 = *(y0p + 6); const float4 y07 = *(y0p + 7); - const float d = (x + i)->d; + const half d = (x + i)->d; sum[tpitg.x] += ( - x0lsb[ 0]*y00[0] + x0lsb[ 1]*y00[1] + x0lsb[ 2]*y00[2] + x0lsb[ 3]*y00[3] + - x0lsb[ 4]*y01[0] + x0lsb[ 5]*y01[1] + x0lsb[ 6]*y01[2] + x0lsb[ 7]*y01[3] + - x0lsb[ 8]*y02[0] + x0lsb[ 9]*y02[1] + x0lsb[10]*y02[2] + x0lsb[11]*y02[3] + - x0lsb[12]*y03[0] + x0lsb[13]*y03[1] + x0lsb[14]*y03[2] + x0lsb[15]*y03[3] + - x0hsb[ 0]*y04[0] + x0hsb[ 1]*y04[1] + x0hsb[ 2]*y04[2] + x0hsb[ 3]*y04[3] + - x0hsb[ 4]*y05[0] + x0hsb[ 5]*y05[1] + x0hsb[ 6]*y05[2] + x0hsb[ 7]*y05[3] + - x0hsb[ 8]*y06[0] + x0hsb[ 9]*y06[1] + x0hsb[10]*y06[2] + x0hsb[11]*y06[3] + - x0hsb[12]*y07[0] + x0hsb[13]*y07[1] + x0hsb[14]*y07[2] + x0hsb[15]*y07[3] + (x0lsb[ 0] - 8)*y00[0] + (x0lsb[ 1] - 8)*y00[1] + (x0lsb[ 2] - 8)*y00[2] + (x0lsb[ 3] - 8)*y00[3] + + (x0lsb[ 4] - 8)*y01[0] + (x0lsb[ 5] - 8)*y01[1] + (x0lsb[ 6] - 8)*y01[2] + (x0lsb[ 7] - 8)*y01[3] + + (x0lsb[ 8] - 8)*y02[0] + (x0lsb[ 9] - 8)*y02[1] + (x0lsb[10] - 8)*y02[2] + (x0lsb[11] - 8)*y02[3] + + (x0lsb[12] - 8)*y03[0] + (x0lsb[13] - 8)*y03[1] + (x0lsb[14] - 8)*y03[2] + (x0lsb[15] - 8)*y03[3] + + (x0hsb[ 0] - 8)*y04[0] + (x0hsb[ 1] - 8)*y04[1] + (x0hsb[ 2] - 8)*y04[2] + (x0hsb[ 3] - 8)*y04[3] + + (x0hsb[ 4] - 8)*y05[0] + (x0hsb[ 5] - 8)*y05[1] + (x0hsb[ 6] - 8)*y05[2] + (x0hsb[ 7] - 8)*y05[3] + + (x0hsb[ 8] - 8)*y06[0] + (x0hsb[ 9] - 8)*y06[1] + (x0hsb[10] - 8)*y06[2] + (x0hsb[11] - 8)*y06[3] + + (x0hsb[12] - 8)*y07[0] + (x0hsb[13] - 8)*y07[1] + (x0hsb[14] - 8)*y07[2] + (x0hsb[15] - 8)*y07[3] ) * d; } From 29bec00ba06eb25fcd5948ca63f200780816ff1e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 30 May 2023 22:31:07 +0300 Subject: [PATCH 13/49] mtl : another mul_mat Q4 (still does not work) --- examples/mtl/mtl.metal | 87 +++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 34 deletions(-) diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index 348a432ab8028..7eb9259e288ba 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -137,45 +137,64 @@ kernel void kernel_mul_mat_q4_0( const int qk = QK4_0; const int nb = ne00/qk; - device const block_q4_0 * x = (device const block_q4_0 *) (src0) + r0*nb; - device const float * y = (device const float *) (src1) + r1*ne10; + device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb; + device const float * y = (device const float *) src1 + r1*ne10; threadgroup float sum[32]; // TODO: should be equal to threadgroup size sum[tpitg.x] = 0.0f; for (int 
i = 0; i < nb; i += tptg.x) { - device const uint4 * x0p = (device const uint4 *) (x + i)->qs; - device const float4 * y0p = (device const float4 *) (y + i*qk); - - const uint4 x0 = *x0p; - - const uint4 x0l = (x0 & uint4(0x0F0F0F0F)); - const uint4 x0h = (x0 & uint4(0xF0F0F0F0)) >> 4; - - thread const char * x0lsb = (thread const char *) &x0l; - thread const char * x0hsb = (thread const char *) &x0h; - - const float4 y00 = *(y0p + 0); - const float4 y01 = *(y0p + 1); - const float4 y02 = *(y0p + 2); - const float4 y03 = *(y0p + 3); - const float4 y04 = *(y0p + 4); - const float4 y05 = *(y0p + 5); - const float4 y06 = *(y0p + 6); - const float4 y07 = *(y0p + 7); - - const half d = (x + i)->d; - - sum[tpitg.x] += ( - (x0lsb[ 0] - 8)*y00[0] + (x0lsb[ 1] - 8)*y00[1] + (x0lsb[ 2] - 8)*y00[2] + (x0lsb[ 3] - 8)*y00[3] + - (x0lsb[ 4] - 8)*y01[0] + (x0lsb[ 5] - 8)*y01[1] + (x0lsb[ 6] - 8)*y01[2] + (x0lsb[ 7] - 8)*y01[3] + - (x0lsb[ 8] - 8)*y02[0] + (x0lsb[ 9] - 8)*y02[1] + (x0lsb[10] - 8)*y02[2] + (x0lsb[11] - 8)*y02[3] + - (x0lsb[12] - 8)*y03[0] + (x0lsb[13] - 8)*y03[1] + (x0lsb[14] - 8)*y03[2] + (x0lsb[15] - 8)*y03[3] + - (x0hsb[ 0] - 8)*y04[0] + (x0hsb[ 1] - 8)*y04[1] + (x0hsb[ 2] - 8)*y04[2] + (x0hsb[ 3] - 8)*y04[3] + - (x0hsb[ 4] - 8)*y05[0] + (x0hsb[ 5] - 8)*y05[1] + (x0hsb[ 6] - 8)*y05[2] + (x0hsb[ 7] - 8)*y05[3] + - (x0hsb[ 8] - 8)*y06[0] + (x0hsb[ 9] - 8)*y06[1] + (x0hsb[10] - 8)*y06[2] + (x0hsb[11] - 8)*y06[3] + - (x0hsb[12] - 8)*y07[0] + (x0hsb[13] - 8)*y07[1] + (x0hsb[14] - 8)*y07[2] + (x0hsb[15] - 8)*y07[3] - ) * d; + //device const uint4 * x0p = (device const uint4 *) (x + i)->qs; + //device const float4 * y0p = (device const float4 *) (y + i*qk); + + //const uint4 x0 = *x0p; + + //const uint4 x0l = (x0 & uint4(0x0F0F0F0F)); + //const uint4 x0h = (x0 & uint4(0xF0F0F0F0)) >> 4; + + //thread const char * x0lsb = (thread const char *) &x0l; + //thread const char * x0hsb = (thread const char *) &x0h; + + //const float4 y00 = *(y0p + 0); + //const float4 y01 = *(y0p + 1); + //const float4 y02 = *(y0p + 2); + //const float4 y03 = *(y0p + 3); + //const float4 y04 = *(y0p + 4); + //const float4 y05 = *(y0p + 5); + //const float4 y06 = *(y0p + 6); + //const float4 y07 = *(y0p + 7); + + //const half d = (x + i)->d; + + //sum[tpitg.x] += ( + // (x0lsb[ 0] - 8)*y00[0] + (x0lsb[ 1] - 8)*y00[1] + (x0lsb[ 2] - 8)*y00[2] + (x0lsb[ 3] - 8)*y00[3] + + // (x0lsb[ 4] - 8)*y01[0] + (x0lsb[ 5] - 8)*y01[1] + (x0lsb[ 6] - 8)*y01[2] + (x0lsb[ 7] - 8)*y01[3] + + // (x0lsb[ 8] - 8)*y02[0] + (x0lsb[ 9] - 8)*y02[1] + (x0lsb[10] - 8)*y02[2] + (x0lsb[11] - 8)*y02[3] + + // (x0lsb[12] - 8)*y03[0] + (x0lsb[13] - 8)*y03[1] + (x0lsb[14] - 8)*y03[2] + (x0lsb[15] - 8)*y03[3] + + // (x0hsb[ 0] - 8)*y04[0] + (x0hsb[ 1] - 8)*y04[1] + (x0hsb[ 2] - 8)*y04[2] + (x0hsb[ 3] - 8)*y04[3] + + // (x0hsb[ 4] - 8)*y05[0] + (x0hsb[ 5] - 8)*y05[1] + (x0hsb[ 6] - 8)*y05[2] + (x0hsb[ 7] - 8)*y05[3] + + // (x0hsb[ 8] - 8)*y06[0] + (x0hsb[ 9] - 8)*y06[1] + (x0hsb[10] - 8)*y06[2] + (x0hsb[11] - 8)*y06[3] + + // (x0hsb[12] - 8)*y07[0] + (x0hsb[13] - 8)*y07[1] + (x0hsb[14] - 8)*y07[2] + (x0hsb[15] - 8)*y07[3] + // ) * d; + + device const uchar * x0p = (device const uchar *) (x + i)->qs; + device const float * y0p = (device const float *) (y + i*qk); + + float acc = 0.0f; + + for (int j = 0; j < 16; ++j) { + const uchar x0v = *(x0p + j); + + const int x0 = x0v & 0x0F; + const int x1 = x0v >> 4; + + const float y0 = *(y0p + j); + const float y1 = *(y0p + j + 16); + + acc += (x0 - 8)*y0 + (x1 - 8)*y1; + } + + sum[tpitg.x] += acc * (x + i)->d; } 
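// the reduction below assumes each thread accumulated a disjoint slice of the row, but the
// loop above still starts every thread at block 0, so all 32 threads walk the same blocks;
// the next commit changes it to "for (int i = tpitg.x; i < nb; i += tptg.x)" (among other
// fixes) so that a row's blocks really are split across the threadgroup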
// accumulate the sum from all threads in the threadgroup From b2fd06c6aa00490d16ef206a76c04dfc9149e60b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 30 May 2023 23:06:49 +0300 Subject: [PATCH 14/49] mtl : working mul_mat q4 --- examples/mtl/mtl.m | 7 ++++--- examples/mtl/mtl.metal | 8 +++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 8985a0d7453b5..e447dfcf623ec 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -389,11 +389,12 @@ int llama_mtl_eval( [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:4]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:5]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:5]; [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:6]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:7]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:8]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; printf("mul_mat: %lldx%lld * %lldx%lld -> %lldx%lld\n", ne00, ne01, ne10, ne11, ne0, ne1); @@ -446,7 +447,7 @@ int llama_mtl_eval( [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; - [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; const int64_t nrows = ggml_nrows(gf->nodes[i]->src0); diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index 7eb9259e288ba..0cd93df7ffe8e 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -127,6 +127,7 @@ kernel void kernel_mul_mat_q4_0( constant int64_t & ne11, constant int64_t & ne0, constant int64_t & ne1, + threadgroup float * sum [[threadgroup(0)]], uint2 tgpig[[threadgroup_position_in_grid]], uint2 tpig[[thread_position_in_grid]], uint2 tpitg[[thread_position_in_threadgroup]], @@ -140,10 +141,9 @@ kernel void kernel_mul_mat_q4_0( device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb; device const float * y = (device const float *) src1 + r1*ne10; - threadgroup float sum[32]; // TODO: should be equal to threadgroup size sum[tpitg.x] = 0.0f; - for (int i = 0; i < nb; i += tptg.x) { + for (int i = tpitg.x; i < nb; i += tptg.x) { //device const uint4 * x0p = (device const uint4 *) (x + i)->qs; //device const float4 * y0p = (device const float4 *) (y + i*qk); @@ -206,5 +206,7 @@ kernel void kernel_mul_mat_q4_0( threadgroup_barrier(mem_flags::mem_threadgroup); } - dst[r1*ne0 + r0] = sum[0]; + if (tpitg.x == 0) { + dst[r1*ne0 + r0] = sum[0]; + } } From 6af6a0566361b32f8ded79d693e17de107fbb6a9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 31 May 2023 22:28:15 +0300 Subject: [PATCH 15/49] ggml : fix handling of "view" ops in ggml_graph_import() --- ggml.c | 78 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/ggml.c b/ggml.c index 823d904eee870..fe1bc35f537a3 100644 --- a/ggml.c +++ b/ggml.c @@ -11156,7 +11156,7 @@ static void ggml_compute_forward_rope_f32( theta *= theta_scale; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + float * dst_data = (float *)((char 
*) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); const float x0 = src[0]; const float x1 = src[1]; @@ -11177,7 +11177,7 @@ static void ggml_compute_forward_rope_f32( const int64_t i0 = ib*n_dims + ic/2; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); const float x0 = src[0]; const float x1 = src[n_dims/2]; @@ -14970,6 +14970,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** op = *(const uint32_t *) ptr; ptr += sizeof(op); n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + enum ggml_op eop = (enum ggml_op) op; + int64_t ne[GGML_MAX_DIMS]; size_t nb[GGML_MAX_DIMS]; @@ -14984,42 +14986,62 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** nb[j] = nb_cur; } - struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used - tensor->op = (enum ggml_op) op; + const char * ptr_name = ptr; ptr += GGML_MAX_NAME; - uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); + const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t); - memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - tensor->nb[j] = nb[j]; - } + struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; // parse args - { - struct ggml_tensor ** args[2 + GGML_MAX_OPT] = { - &tensor->src0, - &tensor->src1, - }; + for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + const int32_t arg_idx = ptr_arg_idx[j]; - for (int j = 0; j < GGML_MAX_OPT; ++j) { - args[2 + j] = &tensor->opt[j]; + if (arg_idx == -1) { + continue; } - for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { - const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx); + if (arg_idx < GGML_MAX_NODES) { + args[j] = result.leafs[arg_idx]; + } else { + args[j] = result.nodes[arg_idx - GGML_MAX_NODES]; + } + } - if (arg_idx == -1) { - continue; - } + // create the tensor + // "view" operations are handled differently - if (arg_idx < GGML_MAX_NODES) { - *args[j] = result.leafs[arg_idx]; - } else { - *args[j] = result.nodes[arg_idx - GGML_MAX_NODES]; - } - } + struct ggml_tensor * tensor = NULL; + + switch (eop) { + // TODO: implement other view ops + case GGML_OP_RESHAPE: + { + // TODO: implement other dims + tensor = ggml_reshape_3d(*ctx_eval, args[0], ne[0], ne[1], ne[2]); + } break; + default: + { + tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = eop; + } break; + } + + + memcpy(tensor->name, ptr_name, GGML_MAX_NAME); + + // TODO: double-check this is needed + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + tensor->src0 = args[0]; + tensor->src1 = args[1]; + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + tensor->opt[j] = args[2 + j]; } result.nodes[i] = tensor; From 1213af76ceae9e839e1da440f95604c0a013d68d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 31 May 2023 22:28:59 +0300 Subject: [PATCH 16/49] mtl : add rope kernel --- examples/mtl/mtl.m | 75 +++++++++++++++++++++++++++++++++++++++++- examples/mtl/mtl.metal | 55 +++++++++++++++++++++++++++++++ llama.cpp | 24 +++++++++----- 3 files changed, 145 insertions(+), 9 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m 
index e447dfcf623ec..a114841dd0b4a 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -41,6 +41,9 @@ id function_mul_mat_q4_0; id pipeline_mul_mat_q4_0; + + id function_rope; + id pipeline_rope; }; // MSL code @@ -148,6 +151,10 @@ ctx->function_mul_mat_q4_0 = [ctx->library newFunctionWithName:@"kernel_mul_mat_q4_0"]; ctx->pipeline_mul_mat_q4_0 = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul_mat_q4_0 error:nil]; fprintf(stderr, "%s: loaded kernel_mul_mat_q4_0: %p\n", __func__, (void *) ctx->pipeline_mul_mat_q4_0); + + ctx->function_rope = [ctx->library newFunctionWithName:@"kernel_rope"]; + ctx->pipeline_rope = [ctx->device newComputePipelineStateWithFunction:ctx->function_rope error:nil]; + fprintf(stderr, "%s: loaded kernel_rope: %p\n", __func__, (void *) ctx->pipeline_rope); } // MTLBuffer approach @@ -250,6 +257,10 @@ int llama_mtl_eval( fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); switch (gf->nodes[i]->op) { + case GGML_OP_RESHAPE: + { + // noop + } break; case GGML_OP_ADD: { if (encoder == nil) { @@ -453,6 +464,68 @@ int llama_mtl_eval( [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case GGML_OP_ROPE: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + const int64_t ne00 = gf->nodes[i]->src0->ne[0]; + const int64_t ne01 = gf->nodes[i]->src0->ne[1]; + const int64_t ne02 = gf->nodes[i]->src0->ne[2]; + const int64_t ne03 = gf->nodes[i]->src0->ne[3]; + + const uint64_t nb00 = gf->nodes[i]->src0->nb[0]; + const uint64_t nb01 = gf->nodes[i]->src0->nb[1]; + const uint64_t nb02 = gf->nodes[i]->src0->nb[2]; + const uint64_t nb03 = gf->nodes[i]->src0->nb[3]; + + const int64_t ne0 = gf->nodes[i]->ne[0]; + const int64_t ne1 = gf->nodes[i]->ne[1]; + const int64_t ne2 = gf->nodes[i]->ne[2]; + const int64_t ne3 = gf->nodes[i]->ne[3]; + + const uint64_t nb0 = gf->nodes[i]->nb[0]; + const uint64_t nb1 = gf->nodes[i]->nb[1]; + const uint64_t nb2 = gf->nodes[i]->nb[2]; + const uint64_t nb3 = gf->nodes[i]->nb[3]; + + const int n_past = ((int32_t *) gf->nodes[i]->src1->data)[0]; // TODO: TMP !!!!! 
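// the rope parameters (n_past, n_dims, mode) are read back from the op's src1 tensor, where
// ggml stores them as three int32 values; pulling n_past out of a frozen graph is only a
// stopgap - a later patch gives llama_mtl_eval() an explicit n_past argument, presumably so
// the value can come from the caller instead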
+ const int n_dims = ((int32_t *) gf->nodes[i]->src1->data)[1]; + const int mode = ((int32_t *) gf->nodes[i]->src1->data)[2]; + + printf("rope: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); + printf("rope: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); + printf("rope: n_past = %d, n_dims = %d, mode = %d\n", n_past, n_dims, mode); + + [encoder setComputePipelineState:ctx->pipeline_rope]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; + [encoder setBytes:&mode length:sizeof( int) atIndex:20]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); GGML_ASSERT(false); @@ -486,7 +559,7 @@ int llama_mtl_eval( { const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime]; - fprintf(stderr, "%s: time elapsed = %f\n", __func__, time_elapsed); + fprintf(stderr, "%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0); } // TODO diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index 0cd93df7ffe8e..a46d016fb84d8 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -210,3 +210,58 @@ kernel void kernel_mul_mat_q4_0( dst[r1*ne0 + r0] = sum[0]; } } + +kernel void kernel_rope( + device const void * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant int & n_past, + constant int & n_dims, + constant int & mode, + uint3 tpig[[thread_position_in_grid]]) { + const int64_t i3 = tpig[2]; + const int64_t i2 = tpig[1]; + const int64_t i1 = tpig[0]; + + const bool is_neox = mode & 2; + const float theta_scale = pow(10000.0, -2.0f/n_dims); + + const int64_t p = ((mode & 1) == 0 ? 
n_past + i2 : i2); + + float theta = (float)p; + + if (!is_neox) { + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float cos_theta = cos(theta); + const float sin_theta = sin(theta); + + theta *= theta_scale; + + device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[1]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[1] = x0*sin_theta + x1*cos_theta; + } + } else { + // TODO: implement + } +} diff --git a/llama.cpp b/llama.cpp index caf74bfd17705..88cfe26ec977f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1270,19 +1270,20 @@ static bool llama_eval_internal( // self-attention { - auto * x = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - // TODO: TMP !!!! - if (il == 0) { - ggml_set_name(x, "mtl-check"); - } + //auto * x = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + //struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, x, n_embd/n_head, n_head, N), n_past, n_rot, 0); // compute Q and K and RoPE them - //struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, x, n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); ggml_set_name(Qcur, "Qcur"); ggml_set_name(Kcur, "Kcur"); + // TODO: TMP !!!! 
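// with the rope kernel in place the check marker moves to Qcur, so the Metal path now has to
// reproduce mul_mat (wq) -> reshape -> rope before it matches the CPU output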
+ if (il == 0) { + ggml_set_name(Qcur, "mtl-check"); + } + // store key and value to memory { // compute the transposed [N, n_embd] V matrix @@ -1437,7 +1438,14 @@ static bool llama_eval_internal( //ggml_graph_compute (ctx0, &gf); // lets export a smaller graph to get things rolling -- baby steps first - ggml_build_forward_expand(&gf_export, ggml_get_tensor(ctx0, "mtl-check")); + { + struct ggml_tensor * t = ggml_get_tensor(ctx0, "mtl-check"); + if (!t) { + fprintf(stderr, "%s: failed to find tensor 'mtl-check'\n", __func__); + exit(1); + } + ggml_build_forward_expand(&gf_export, t); + } // print { From 7ca81e9e6594f769c166a4f5fe92efe0b13832f6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 31 May 2023 22:38:40 +0300 Subject: [PATCH 17/49] mtl : add reshape and transpose handling --- examples/mtl/mtl.m | 1 + ggml.c | 8 ++++++-- llama.cpp | 9 ++++----- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index a114841dd0b4a..7e48e2b95a0ce 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -258,6 +258,7 @@ int llama_mtl_eval( switch (gf->nodes[i]->op) { case GGML_OP_RESHAPE: + case GGML_OP_TRANSPOSE: { // noop } break; diff --git a/ggml.c b/ggml.c index fe1bc35f537a3..71a0d4f158c2b 100644 --- a/ggml.c +++ b/ggml.c @@ -15011,6 +15011,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** // create the tensor // "view" operations are handled differently + // TODO: handle inplac ops - currentl a copy is always made struct ggml_tensor * tensor = NULL; @@ -15018,8 +15019,11 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** // TODO: implement other view ops case GGML_OP_RESHAPE: { - // TODO: implement other dims - tensor = ggml_reshape_3d(*ctx_eval, args[0], ne[0], ne[1], ne[2]); + tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]); + } break; + case GGML_OP_TRANSPOSE: + { + tensor = ggml_transpose(*ctx_eval, args[0]); } break; default: { diff --git a/llama.cpp b/llama.cpp index 88cfe26ec977f..fdbbca69f472f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1279,15 +1279,14 @@ static bool llama_eval_internal( ggml_set_name(Qcur, "Qcur"); ggml_set_name(Kcur, "Kcur"); - // TODO: TMP !!!! - if (il == 0) { - ggml_set_name(Qcur, "mtl-check"); - } - // store key and value to memory { // compute the transposed [N, n_embd] V matrix struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N)); + // TODO: TMP !!!! 
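// Vcur is a transpose of a reshape of a mul_mat, so naming it exercises the "view" ops:
// the Metal encoder treats RESHAPE/TRANSPOSE as no-ops and ggml_graph_import() now recreates
// them via ggml_reshape_4d/ggml_transpose instead of allocating raw tensors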
+ if (il == 0) { + ggml_set_name(Vcur, "mtl-check"); + } struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, From 94ea9e7bfecc1116afc830606cf886f1b3620257 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 1 Jun 2023 19:21:08 +0300 Subject: [PATCH 18/49] ggml : store offset as opt arg for ggml_view_xd() operators --- ggml.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/ggml.c b/ggml.c index 71a0d4f158c2b..7a3f74771f99a 100644 --- a/ggml.c +++ b/ggml.c @@ -5802,10 +5802,18 @@ struct ggml_tensor * ggml_view_1d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); + ggml_scratch_save(ctx); + + struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + memcpy(offs->data, &offset, 2*sizeof(int32_t)); + + ggml_scratch_load(ctx); + result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; + result->opt[0] = offs; if (is_node) { memcpy(result->padding, &offset, sizeof(offset)); @@ -5834,6 +5842,13 @@ struct ggml_tensor * ggml_view_2d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); + ggml_scratch_save(ctx); + + struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + memcpy(offs->data, &offset, 2*sizeof(int32_t)); + + ggml_scratch_load(ctx); + result->nb[1] = nb1; result->nb[2] = result->nb[1]*ne1; result->nb[3] = result->nb[2]; @@ -5842,6 +5857,7 @@ struct ggml_tensor * ggml_view_2d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; + result->opt[0] = offs; if (is_node) { memcpy(result->padding, &offset, sizeof(offset)); @@ -5872,6 +5888,13 @@ struct ggml_tensor * ggml_view_3d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); + ggml_scratch_save(ctx); + + struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + memcpy(offs->data, &offset, 2*sizeof(int32_t)); + + ggml_scratch_load(ctx); + result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = result->nb[2]*ne2; @@ -5880,6 +5903,7 @@ struct ggml_tensor * ggml_view_3d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; + result->opt[0] = offs; if (is_node) { memcpy(result->padding, &offset, sizeof(offset)); @@ -5912,6 +5936,13 @@ struct ggml_tensor * ggml_view_4d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset); + ggml_scratch_save(ctx); + + struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + memcpy(offs->data, &offset, 2*sizeof(int32_t)); + + ggml_scratch_load(ctx); + result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = nb3; @@ -5920,6 +5951,7 @@ struct ggml_tensor * ggml_view_4d( result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; + result->opt[0] = offs; if (is_node) { memcpy(result->padding, &offset, sizeof(offset)); @@ -15021,6 +15053,16 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** { tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]); } break; + case GGML_OP_VIEW: + { + tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); + + uint64_t offs; + memcpy(&offs, args[2]->data, sizeof(offs)); + + tensor->data = ((char *) tensor->data) + offs; + printf("xxxxxx offs: %zu\n", offs); + } break; case GGML_OP_TRANSPOSE: { tensor = ggml_transpose(*ctx_eval, args[0]); From 948fcfde7e74dc770687da9f0ea738195b782ac4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 1 Jun 2023 19:21:28 +0300 Subject: [PATCH 19/49] mtl : add cpy kernel + handle view ops --- examples/mtl/mtl.m | 126 ++++++++++++++++++++++++++++++++++++----- examples/mtl/mtl.metal | 42 ++++++++++++++ llama.cpp | 44 +++++++++++--- 3 files changed, 191 insertions(+), 21 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 7e48e2b95a0ce..6d509a2abd943 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -44,6 +44,9 @@ id function_rope; id pipeline_rope; + + id function_cpy_f32_f16; + id pipeline_cpy_f32_f16; }; // MSL code @@ -155,6 +158,10 @@ ctx->function_rope = [ctx->library newFunctionWithName:@"kernel_rope"]; ctx->pipeline_rope = [ctx->device newComputePipelineStateWithFunction:ctx->function_rope error:nil]; fprintf(stderr, "%s: loaded kernel_rope: %p\n", __func__, (void *) ctx->pipeline_rope); + + ctx->function_cpy_f32_f16 = [ctx->library newFunctionWithName:@"kernel_cpy_f32_f16"]; + ctx->pipeline_cpy_f32_f16 = [ctx->device newComputePipelineStateWithFunction:ctx->function_cpy_f32_f16 error:nil]; + fprintf(stderr, "%s: loaded kernel_cpy_f32_f16: %p\n", __func__, (void *) ctx->pipeline_cpy_f32_f16); } // MTLBuffer approach @@ -258,6 +265,7 @@ int llama_mtl_eval( switch (gf->nodes[i]->op) { case GGML_OP_RESHAPE: + case GGML_OP_VIEW: case GGML_OP_TRANSPOSE: { // noop @@ -527,6 +535,76 @@ int llama_mtl_eval( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case GGML_OP_CPY: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + const int64_t ne00 = gf->nodes[i]->src0->ne[0]; + const int64_t ne01 = gf->nodes[i]->src0->ne[1]; + const int64_t ne02 = gf->nodes[i]->src0->ne[2]; + const int64_t ne03 = gf->nodes[i]->src0->ne[3]; + + const uint64_t nb00 = gf->nodes[i]->src0->nb[0]; + const uint64_t nb01 = gf->nodes[i]->src0->nb[1]; + const uint64_t nb02 = gf->nodes[i]->src0->nb[2]; + const uint64_t nb03 = gf->nodes[i]->src0->nb[3]; + + const int64_t ne0 = gf->nodes[i]->ne[0]; + const int64_t ne1 = gf->nodes[i]->ne[1]; + const int64_t ne2 = gf->nodes[i]->ne[2]; + const int64_t ne3 = gf->nodes[i]->ne[3]; + + const uint64_t nb0 = gf->nodes[i]->nb[0]; + const uint64_t nb1 = gf->nodes[i]->nb[1]; + const uint64_t nb2 = gf->nodes[i]->nb[2]; + const uint64_t nb3 = gf->nodes[i]->nb[3]; + + const enum ggml_type src0t = gf->nodes[i]->src0->type; + const enum ggml_type dstt = gf->nodes[i]->type; + + printf("cpy: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); + printf("cpy: %lld x %lld x %lld x %lld\n", nb00, nb01, nb02, nb03); + printf("cpy: %lld 
x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); + printf("cpy: %lld x %lld x %lld x %lld\n", nb0, nb1, nb2, nb3); + printf("cpy: %s -> %s\n", ggml_type_name(src0t), ggml_type_name(dstt)); + + switch (src0t) { + case GGML_TYPE_F32: + { + switch (dstt) { + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break; + default: GGML_ASSERT(false && "not implemented"); + }; + } break; + default: GGML_ASSERT(false && "not implemented"); + } + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; + } break; default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); GGML_ASSERT(false); @@ -568,21 +646,41 @@ int llama_mtl_eval( { struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check"); - float * data = (float *) ctx->out.contents; - printf("data: "); - int n = t->ne[0]; - if (n > 10) { - n = 10; - } - for (int i = 0; i < n; i++) { - printf("%f ", data[i]); - } - printf("\n"); - double sum = 0.0; - for (int i = 0; i < ggml_nelements(t); i++) { - sum += data[i]; + if (t->type == GGML_TYPE_F32) { + const const float * data = (float *) ctx->out.contents; + printf("data: "); + int n = ggml_nelements(t); + if (n > 10) { + n = 10; + } + for (int i = 0; i < n; i++) { + printf("%f ", data[i]); + } + printf("\n"); + double sum = 0.0; + for (int i = 0; i < ggml_nelements(t); i++) { + sum += data[i]; + } + printf("sum: %f\n", sum); + } else if (t->type == GGML_TYPE_F16) { + const ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents; + printf("data: "); + int n = ggml_nelements(t); + if (n > 10) { + n = 10; + } + for (int i = 0; i < n; i++) { + printf("%f ", ggml_fp16_to_fp32(data[i])); + } + printf("\n"); + double sum = 0.0; + for (int i = 0; i < ggml_nelements(t); i++) { + sum += ggml_fp16_to_fp32(data[i]); + } + printf("sum: %f\n", sum); + } else { + GGML_ASSERT(false && "not implemented"); } - printf("sum: %f\n", sum); } return 0; diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index a46d016fb84d8..7e5c3aad42769 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -265,3 +265,45 @@ kernel void kernel_rope( // TODO: implement } } + +kernel void kernel_cpy_f32_f16( + device const float * src0, + device half * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & 
nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + + device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + dst_data[i00] = src[0]; + } +} diff --git a/llama.cpp b/llama.cpp index fdbbca69f472f..5e7c3db861513 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1283,18 +1283,21 @@ static bool llama_eval_internal( { // compute the transposed [N, n_embd] V matrix struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N)); - // TODO: TMP !!!! - if (il == 0) { - ggml_set_name(Vcur, "mtl-check"); - } struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + struct ggml_tensor * t = ggml_cpy(ctx0, Kcur, k); + // TODO: TMP !!!! + if (il == 0) { + ggml_set_name(t, "mtl-check"); + } + // important: storing RoPE-ed version of K in the KV cache! 
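// (routing the copy through the named tensor t means the exported graph now ends in the new
//  f32 -> f16 cpy kernel writing into a ggml_view_1d of the f16 KV cache, which also exercises
//  the view-offset opt[0] argument added in the previous patch)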
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + //ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf, t); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); } @@ -1448,7 +1451,7 @@ static bool llama_eval_internal( // print { - auto print_t = [&](struct ggml_tensor * t) { + auto print_t_f32 = [&](struct ggml_tensor * t) { float * data = (float *)t->data; printf("data: "); for (int i = 0; i < std::min((int) t->ne[0], 10); i++) { @@ -1461,9 +1464,36 @@ static bool llama_eval_internal( } printf("sum: %f\n", sum); }; + auto print_t_f16 = [&](struct ggml_tensor * t) { + ggml_fp16_t * data = (ggml_fp16_t *)t->data; + printf("data: "); + for (int i = 0; i < std::min((int) t->ne[0], 10); i++) { + printf("%f ", ggml_fp16_to_fp32(data[i])); + } + printf("\n"); + double sum = 0.0; + for (int i = 0; i < ggml_nelements(t); i++) { + sum += ggml_fp16_to_fp32(data[i]); + } + printf("sum: %f\n", sum); + }; ggml_graph_compute(ctx0, &gf_export); - print_t(ggml_get_tensor(ctx0, "mtl-check")); + + { + auto * t = ggml_get_tensor(ctx0, "mtl-check"); + switch (t->type) { + case GGML_TYPE_F32: + print_t_f32(t); + break; + case GGML_TYPE_F16: + print_t_f16(t); + break; + default: + fprintf(stderr, "%s: unsupported type\n", __func__); + exit(1); + } + } } if (cgraph_fname) { From 51efb59437cf26b2b77b8a07869462d4074cbe4e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 1 Jun 2023 19:45:36 +0300 Subject: [PATCH 20/49] mtl : confirm f16 x f32 attention mul mat --- examples/mtl/mtl.m | 145 ++++++++++++++++++++++++++------------------- ggml.c | 8 ++- llama.cpp | 24 +++++--- 3 files changed, 106 insertions(+), 71 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 6d509a2abd943..1327de0b4eec2 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -267,6 +267,7 @@ int llama_mtl_eval( case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: { // noop } break; @@ -344,81 +345,101 @@ int llama_mtl_eval( [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_MUL_MAT: - if (gf->nodes[i]->src0->type == GGML_TYPE_F32) { - // for F32 x F32 we use MPS - - if (encoder != nil) { - [encoder endEncoding]; - encoder = nil; - } - - // use MPSMatrixMultiplication - id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); - id id_src1 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); - id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); - - const int64_t ncols0 = gf->nodes[i]->src0->ne[0]; - const int64_t nrows0 = gf->nodes[i]->src0->ne[1]; - - const int64_t ncols1 = gf->nodes[i]->src1->ne[0]; - const int64_t nrows1 = gf->nodes[i]->src1->ne[1]; - - const int64_t ncols2 = gf->nodes[i]->ne[0]; - const int64_t nrows2 = gf->nodes[i]->ne[1]; - - GGML_ASSERT(ncols0 == ncols1); - - MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor - matrixDescriptorWithRows:nrows0 columns:ncols0 rowBytes:gf->nodes[i]->src0->nb[1] dataType:MPSDataTypeFloat32]; - MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor - matrixDescriptorWithRows:nrows1 columns:ncols1 rowBytes:gf->nodes[i]->src1->nb[1] dataType:MPSDataTypeFloat32]; - MPSMatrixDescriptor * desc2 = [MPSMatrixDescriptor - matrixDescriptorWithRows:nrows2 columns:ncols2 rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32]; - - MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0 descriptor:desc0]; - MPSMatrix * mat_src1 = [[MPSMatrix alloc] 
initWithBuffer:id_src1 offset:offs_src1 descriptor:desc1]; - MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst descriptor:desc2]; - - MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc] initWithDevice:ctx->device - transposeLeft:false transposeRight:true resultRows:nrows1 resultColumns:nrows0 interiorColumns:ncols0 alpha:1.0 beta:0.0]; - - [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst]; - } else { - // for Q4 x F32 we use custom kernel - - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoder]; - } - - GGML_ASSERT(gf->nodes[i]->src0->ne[2] == 1); - GGML_ASSERT(gf->nodes[i]->src1->ne[2] == 1); - + { id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); id id_src1 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); const int64_t ne00 = gf->nodes[i]->src0->ne[0]; const int64_t ne01 = gf->nodes[i]->src0->ne[1]; + const int64_t ne02 = gf->nodes[i]->src0->ne[2]; + + //const uint64_t nb00 = gf->nodes[i]->src0->nb[0]; + //const uint64_t nb01 = gf->nodes[i]->src0->nb[1]; + const uint64_t nb02 = gf->nodes[i]->src0->nb[2]; + const int64_t ne10 = gf->nodes[i]->src1->ne[0]; const int64_t ne11 = gf->nodes[i]->src1->ne[1]; + const int64_t ne12 = gf->nodes[i]->src1->ne[2]; + + //const uint64_t nb10 = gf->nodes[i]->src1->nb[0]; + //const uint64_t nb11 = gf->nodes[i]->src1->nb[1]; + const uint64_t nb12 = gf->nodes[i]->src1->nb[2]; + const int64_t ne0 = gf->nodes[i]->ne[0]; const int64_t ne1 = gf->nodes[i]->ne[1]; + const int64_t ne2 = gf->nodes[i]->ne[2]; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:5]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:6]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:7]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:8]; - [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; + //const uint64_t nb0 = gf->nodes[i]->nb[0]; + //const uint64_t nb1 = gf->nodes[i]->nb[1]; + const uint64_t nb2 = gf->nodes[i]->nb[2]; - printf("mul_mat: %lldx%lld * %lldx%lld -> %lldx%lld\n", ne00, ne01, ne10, ne11, ne0, ne1); + const enum ggml_type src0t = gf->nodes[i]->src0->type; + const enum ggml_type src1t = gf->nodes[i]->src1->type; + const enum ggml_type dstt = gf->nodes[i]->type; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; + printf("mul_mat: src0 - %s[%lld, %lld, %lld]\n", ggml_type_name(src0t), ne00, ne01, ne02); + printf("mul_mat: src1 - %s[%lld, %lld, %lld]\n", ggml_type_name(src1t), ne10, ne11, ne12); + printf("mul_mat: dst - %s[%lld, %lld, %lld]\n", ggml_type_name(dstt), ne0, ne1, ne2); + printf("mul_mat: %s * %s -> %s\n", ggml_type_name(src0t), ggml_type_name(src1t), ggml_type_name(dstt)); + + GGML_ASSERT(ne00 == ne10); + GGML_ASSERT(ne02 == ne12); + + if (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) { + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + + MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16; + MPSDataType src1dt = src1t == GGML_TYPE_F32 ? 
MPSDataTypeFloat32 : MPSDataTypeFloat16; + + // for F32 x F32 we use MPS + MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor + matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:gf->nodes[i]->src0->nb[1] dataType:src0dt]; + + MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor + matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:gf->nodes[i]->src1->nb[1] dataType:src1dt]; + + MPSMatrixDescriptor * desc = [MPSMatrixDescriptor + matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32]; + + MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc] + initWithDevice:ctx->device transposeLeft:false transposeRight:true + resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0]; + + for (int64_t i02 = 0; i02 < ne02; ++i02) { + size_t offs_src0_cur = offs_src0 + i02*nb02; + size_t offs_src1_cur = offs_src1 + i02*nb12; + size_t offs_dst_cur = offs_dst + i02*nb2; + + MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0]; + MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1]; + MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst_cur descriptor:desc ]; + + [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst]; + } + } else { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + // for Q4 x F32 we use custom kernel + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:5]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:6]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:7]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:8]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; + } } break; case GGML_OP_GET_ROWS: { diff --git a/ggml.c b/ggml.c index 7a3f74771f99a..1141361228299 100644 --- a/ggml.c +++ b/ggml.c @@ -14613,7 +14613,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n", + fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %32s\n", ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, @@ -14627,7 +14627,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n", + fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %32s\n", arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), @@ -15067,6 +15067,10 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** { tensor = ggml_transpose(*ctx_eval, args[0]); } break; + case GGML_OP_PERMUTE: + { + tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); + } break; default: { 
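// ops that reach this default branch are recreated as plain tensors: the op is set just below,
// and the serialized nb[] strides and src/opt arguments are copied on after the switch (that
// same stride copy is presumably what lets PERMUTE above get away with a plain 4d view)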
tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); diff --git a/llama.cpp b/llama.cpp index 5e7c3db861513..f6d93bd93060b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1289,16 +1289,22 @@ static bool llama_eval_internal( ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - struct ggml_tensor * t = ggml_cpy(ctx0, Kcur, k); - // TODO: TMP !!!! - if (il == 0) { - ggml_set_name(t, "mtl-check"); - } + //struct ggml_tensor * t = ggml_cpy(ctx0, Vcur, v); + //// TODO: TMP !!!! + //if (il == 0) { + // ggml_set_name(t, "mtl-check"); + //} // important: storing RoPE-ed version of K in the KV cache! - //ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, t); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + //ggml_build_forward_expand(&gf, t); + + // TODO: TMP !!!!!!!!!! + if (il == 0) { + ggml_build_forward_expand(&gf_export, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf_export, ggml_cpy(ctx0, Vcur, v)); + } } struct ggml_tensor * Q = @@ -1318,6 +1324,10 @@ static bool llama_eval_internal( // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); ggml_set_name(KQ, "KQ"); + // TODO: TMP !!!! + if (il == 0) { + ggml_set_name(KQ, "mtl-check"); + } // KQ_scaled = KQ / sqrt(n_embd/n_head) struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)); From 0f1c580860e2acbee7c095b113256f69e93869b5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 1 Jun 2023 19:52:32 +0300 Subject: [PATCH 21/49] mtl : add scale kernel --- examples/mtl/mtl.m | 27 +++++++++++++++++++++++++++ examples/mtl/mtl.metal | 8 ++++++++ llama.cpp | 8 ++++---- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 1327de0b4eec2..8f55f8467d338 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -27,6 +27,9 @@ id function_mul; id pipeline_mul; + id function_scale; + id pipeline_scale; + id function_relu; id pipeline_relu; @@ -135,6 +138,10 @@ ctx->pipeline_mul = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul error:nil]; fprintf(stderr, "%s: loaded kernel_mul: %p\n", __func__, (void *) ctx->pipeline_mul); + ctx->function_scale = [ctx->library newFunctionWithName:@"kernel_scale"]; + ctx->pipeline_scale = [ctx->device newComputePipelineStateWithFunction:ctx->function_scale error:nil]; + fprintf(stderr, "%s: loaded kernel_scale: %p\n", __func__, (void *) ctx->pipeline_scale); + ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, (void *) ctx->pipeline_relu); @@ -310,6 +317,26 @@ int llama_mtl_eval( const int64_t n = ggml_nelements(gf->nodes[i]); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SCALE: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + const float scale = *(const float *) gf->nodes[i]->src1->data; + + [encoder setComputePipelineState:ctx->pipeline_scale]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder 
setBytes:&scale length:sizeof(scale) atIndex:2]; + + const int64_t n = ggml_nelements(gf->nodes[i]); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_RELU: diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index 7e5c3aad42769..b132be15e9da0 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -53,6 +53,14 @@ kernel void kernel_mul( dst[tpig] = src0[tpig] * src1[tpig % ne00]; } +kernel void kernel_scale( + device const float * src0, + device float * dst, + constant float & scale, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] * scale; +} + kernel void kernel_relu( device const float * src0, device float * dst, diff --git a/llama.cpp b/llama.cpp index f6d93bd93060b..28d489016042f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1324,10 +1324,6 @@ static bool llama_eval_internal( // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); ggml_set_name(KQ, "KQ"); - // TODO: TMP !!!! - if (il == 0) { - ggml_set_name(KQ, "mtl-check"); - } // KQ_scaled = KQ / sqrt(n_embd/n_head) struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)); @@ -1336,6 +1332,10 @@ static bool llama_eval_internal( // KQ_scaled shape [n_past + N, N, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); ggml_set_name(KQ_scaled, "KQ_scaled"); + // TODO: TMP !!!! + if (il == 0) { + ggml_set_name(KQ_scaled, "mtl-check"); + } // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); From 17a70362a69c415bc3b70e4f2dfab83afb26005a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 1 Jun 2023 20:41:54 +0300 Subject: [PATCH 22/49] mtl : add diag_mask_inf kernel --- examples/mtl/mtl.cpp | 9 +++----- examples/mtl/mtl.h | 5 ++++- examples/mtl/mtl.m | 51 ++++++++++++++++++++++++++++++++++-------- examples/mtl/mtl.metal | 36 +++++++++++++++++++++++++++++ ggml.c | 1 - llama.cpp | 12 +++++----- 6 files changed, 92 insertions(+), 22 deletions(-) diff --git a/examples/mtl/mtl.cpp b/examples/mtl/mtl.cpp index e15a1b02e36ca..40e8fbceea39a 100644 --- a/examples/mtl/mtl.cpp +++ b/examples/mtl/mtl.cpp @@ -41,18 +41,15 @@ int main(int argc, char ** argv) { // TODO: tmp to match the input used when creating the cgraph { - const int n_ctx = 128; + const int n_past = 128; const int n_batch = 32; const std::vector tmp(n_batch, 1); // BOS - struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd"); - memcpy(input->data, tmp.data(), tmp.size() * sizeof(int)); + // the actual inference happens here + llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past); } - // the actual inference happens here - llama_mtl_eval(ctx_mtl, &gf); - llama_mtl_free(ctx_mtl); ggml_free(ctx_work); diff --git a/examples/mtl/mtl.h b/examples/mtl/mtl.h index a40d5711100fd..a6a336eaac5d6 100644 --- a/examples/mtl/mtl.h +++ b/examples/mtl/mtl.h @@ -20,7 +20,10 @@ void llama_mtl_free(struct ggml_mtl_context * ctx); // return 0 on success int llama_mtl_eval( struct ggml_mtl_context * ctx, - struct ggml_cgraph * gf); + struct ggml_cgraph * gf, + const int * tokens, + int n_tokens, + int n_past); #ifdef __cplusplus } diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 8f55f8467d338..06d8961eea020 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -36,6 +36,9 @@ id function_soft_max; id pipeline_soft_max; + id function_diag_mask_inf; + id pipeline_diag_mask_inf; + id function_get_rows_q4_0; id pipeline_get_rows_q4_0; @@ 
-150,6 +153,10 @@ ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil]; fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, (void *) ctx->pipeline_soft_max); + ctx->function_diag_mask_inf = [ctx->library newFunctionWithName:@"kernel_diag_mask_inf" constantValues:constants error:nil]; + ctx->pipeline_diag_mask_inf = [ctx->device newComputePipelineStateWithFunction:ctx->function_diag_mask_inf error:nil]; + fprintf(stderr, "%s: loaded kernel_diag_mask_inf: %p\n", __func__, (void *) ctx->pipeline_diag_mask_inf); + ctx->function_get_rows_q4_0 = [ctx->library newFunctionWithName:@"kernel_get_rows_q4_0"]; ctx->pipeline_get_rows_q4_0 = [ctx->device newComputePipelineStateWithFunction:ctx->function_get_rows_q4_0 error:nil]; fprintf(stderr, "%s: loaded kernel_get_rows_q4_0: %p\n", __func__, (void *) ctx->pipeline_get_rows_q4_0); @@ -248,8 +255,14 @@ void llama_mtl_free(struct ggml_mtl_context * ctx) { int llama_mtl_eval( struct ggml_mtl_context * ctx, - struct ggml_cgraph * gf) { - fprintf(stderr, "%s: evaluating\n", __func__); + struct ggml_cgraph * gf, + const int * tokens, + int n_tokens, + int n_past) { + fprintf(stderr, "%s: evaluating, n_tokens = %d, n_past = %d\n", __func__, n_tokens, n_past); + + struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd"); + memcpy(input->data, tokens, n_tokens * sizeof(int)); id command_buffer = [ctx->queue commandBuffer]; id encoder = nil; @@ -371,6 +384,28 @@ int llama_mtl_eval( [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case GGML_OP_DIAG_MASK_INF: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + const int64_t ne00 = gf->nodes[i]->src0->ne[0]; + const int64_t ne01 = gf->nodes[i]->src0->ne[1]; + const int64_t ne02 = gf->nodes[i]->src0->ne[2]; + + [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; + [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&n_past length:sizeof(int) atIndex:4]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; case GGML_OP_MUL_MAT: { id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); @@ -550,7 +585,7 @@ int llama_mtl_eval( const uint64_t nb2 = gf->nodes[i]->nb[2]; const uint64_t nb3 = gf->nodes[i]->nb[3]; - const int n_past = ((int32_t *) gf->nodes[i]->src1->data)[0]; // TODO: TMP !!!!! + //const int n_past = ((int32_t *) gf->nodes[i]->src1->data)[0]; // TODO: TMP !!!!! 
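// the ROPE node packs its integer parameters into src1 as [n_past, n_dims, mode];
// after this change n_past comes from the llama_mtl_eval() argument instead of the
// value baked into the exported graph, so the same graph can be evaluated at
// different positions in the context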
const int n_dims = ((int32_t *) gf->nodes[i]->src1->data)[1]; const int mode = ((int32_t *) gf->nodes[i]->src1->data)[2]; @@ -697,17 +732,15 @@ int llama_mtl_eval( if (t->type == GGML_TYPE_F32) { const const float * data = (float *) ctx->out.contents; printf("data: "); - int n = ggml_nelements(t); - if (n > 10) { - n = 10; - } - for (int i = 0; i < n; i++) { + for (int i = 0; i < (int) t->ne[0]; i++) { printf("%f ", data[i]); } printf("\n"); double sum = 0.0; for (int i = 0; i < ggml_nelements(t); i++) { - sum += data[i]; + double cur = data[i]; + if (isinf(cur)) continue; + sum += cur; } printf("sum: %f\n", sum); } else if (t->type == GGML_TYPE_F16) { diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index b132be15e9da0..ef2b690c1e6aa 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -86,6 +86,42 @@ kernel void kernel_soft_max( } } +//const int n = ggml_nrows(src0); +//const int nc = src0->ne[0]; +//const int nr = src0->ne[1]; +//const int nz = n/nr; +// +//assert( dst->nb[0] == sizeof(float)); +//assert(src0->nb[0] == sizeof(float)); +// +//for (int k = 0; k < nz; k++) { +// for (int j = ith; j < nr; j += nth) { +// for (int i = n_past; i < nc; i++) { +// if (i > n_past + j) { +// *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value; +// } +// } +// } +//} + +kernel void kernel_diag_mask_inf( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int & n_past, + uint3 tpig[[thread_position_in_grid]]) { + const int64_t i02 = tpig[2]; + const int64_t i01 = tpig[1]; + const int64_t i00 = tpig[0]; + + if (i00 > n_past + i01) { + dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY; + } else { + dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00]; + } +} + kernel void kernel_get_rows_q4_0( device const void * src0, device const int * src1, diff --git a/ggml.c b/ggml.c index 1141361228299..fc3bdcf6b8eba 100644 --- a/ggml.c +++ b/ggml.c @@ -15061,7 +15061,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** memcpy(&offs, args[2]->data, sizeof(offs)); tensor->data = ((char *) tensor->data) + offs; - printf("xxxxxx offs: %zu\n", offs); } break; case GGML_OP_TRANSPOSE: { diff --git a/llama.cpp b/llama.cpp index 28d489016042f..ff4268ed655bc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1332,14 +1332,14 @@ static bool llama_eval_internal( // KQ_scaled shape [n_past + N, N, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); ggml_set_name(KQ_scaled, "KQ_scaled"); - // TODO: TMP !!!! - if (il == 0) { - ggml_set_name(KQ_scaled, "mtl-check"); - } // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); ggml_set_name(KQ_masked, "KQ_masked"); + // TODO: TMP !!!! 
+ if (il == 0) { + ggml_set_name(KQ_masked, "mtl-check"); + } // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); @@ -1464,12 +1464,14 @@ static bool llama_eval_internal( auto print_t_f32 = [&](struct ggml_tensor * t) { float * data = (float *)t->data; printf("data: "); - for (int i = 0; i < std::min((int) t->ne[0], 10); i++) { + for (int i = 0; i < (int) t->ne[0]; i++) { printf("%f ", data[i]); } printf("\n"); double sum = 0.0; for (int i = 0; i < ggml_nelements(t); i++) { + double cur = data[i]; + if (isinf(cur)) continue; sum += data[i]; } printf("sum: %f\n", sum); From 17930fbcb7b3ea0b45942fca5a137e725862e3e1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 1 Jun 2023 20:48:24 +0300 Subject: [PATCH 23/49] mtl : fix soft_max kernel --- examples/mtl/mtl.m | 10 ++++++++- examples/mtl/mtl.metal | 49 ++++++++++++++++-------------------------- llama.cpp | 8 +++---- 3 files changed, 32 insertions(+), 35 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 06d8961eea020..bb0074a4c6828 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -378,11 +378,19 @@ int llama_mtl_eval( id id_src = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + const int64_t ne00 = gf->nodes[i]->src0->ne[0]; + const int64_t ne01 = gf->nodes[i]->src0->ne[1]; + const int64_t ne02 = gf->nodes[i]->src0->ne[2]; + const int64_t ne03 = gf->nodes[i]->src0->ne[3]; + [encoder setComputePipelineState:ctx->pipeline_soft_max]; [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_DIAG_MASK_INF: { diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index ef2b690c1e6aa..32e850297f024 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -31,9 +31,6 @@ static void dequantize_row_q4_0(device const block_q4_0 * x, device float * y, i } } -// TODO: not needed -constant int nsoftmax [[function_constant(0)]]; - kernel void kernel_add( device const float * src0, device const float * src1, @@ -68,42 +65,34 @@ kernel void kernel_relu( dst[tpig] = max(0.0f, src0[tpig]); } -// TODO: broken kernel void kernel_soft_max( device const float * src0, - device float * dst) { + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + uint3 tpig[[thread_position_in_grid]]) { + const int64_t i03 = tpig[2]; + const int64_t i02 = tpig[1]; + const int64_t i01 = tpig[0]; + + device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + float max = 0.0f; - for (int i = 0; i < nsoftmax; i++) { - max = MAX(max, src0[i]); + for (int i = 0; i < ne00; i++) { + max = MAX(max, psrc0[i]); } float sum = 0.0f; - for (int i = 0; i < nsoftmax; i++) { - dst[i] = exp(src0[i] - max); - sum += dst[i]; + for (int i = 0; i < ne00; i++) { + pdst[i] = exp(psrc0[i] - max); + sum += pdst[i]; } - for (int i = 0; i < nsoftmax; i++) { - dst[i] /= sum; + for (int i = 0; i < ne00; i++) { + pdst[i] /= sum; } } 
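// each (i01, i02, i03) row is handled by a single thread here, using the standard
// numerically stable softmax:
//
//   m    = max_i x[i]
//   y[i] = exp(x[i] - m)
//   y[i] = y[i] / sum_i y[i]
//
// (a serial per-row sketch of the computation; a threadgroup-parallel version with
// shared-memory reductions is introduced in a later patch of this series)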
-//const int n = ggml_nrows(src0); -//const int nc = src0->ne[0]; -//const int nr = src0->ne[1]; -//const int nz = n/nr; -// -//assert( dst->nb[0] == sizeof(float)); -//assert(src0->nb[0] == sizeof(float)); -// -//for (int k = 0; k < nz; k++) { -// for (int j = ith; j < nr; j += nth) { -// for (int i = n_past; i < nc; i++) { -// if (i > n_past + j) { -// *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value; -// } -// } -// } -//} - kernel void kernel_diag_mask_inf( device const float * src0, device float * dst, diff --git a/llama.cpp b/llama.cpp index ff4268ed655bc..6825636c840bb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1336,15 +1336,15 @@ static bool llama_eval_internal( // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); ggml_set_name(KQ_masked, "KQ_masked"); - // TODO: TMP !!!! - if (il == 0) { - ggml_set_name(KQ_masked, "mtl-check"); - } // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); ggml_set_name(KQ_soft_max, "KQ_soft_max"); + // TODO: TMP !!!! + if (il == 0) { + ggml_set_name(KQ_soft_max, "mtl-check"); + } // split cached V into n_head heads struct ggml_tensor * V = From f67c2d8cabb21ae1088e9bbe827eee8298fa1b5c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 1 Jun 2023 21:27:03 +0300 Subject: [PATCH 24/49] ggml : update ggml_nbytes() to handle non-contiguous tensors --- ggml.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index fc3bdcf6b8eba..330a896ca2c2d 100644 --- a/ggml.c +++ b/ggml.c @@ -3732,7 +3732,14 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) { size_t ggml_nbytes(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]; + // this should handle cases where the tensor is not contiguous in memory + // probaby just: + // + // return tensor->ne[3]*tensor->nb[3] + // + // is enough, but just in case, adding the second part + + return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]); } int ggml_blck_size(enum ggml_type type) { From a266c26de2030b94f608510ba0e70888a9881b76 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 1 Jun 2023 21:27:24 +0300 Subject: [PATCH 25/49] mtl : verify V tensor contents --- examples/mtl/mtl.m | 23 +++++++++++++++-------- llama.cpp | 27 +++++++++++++++++++-------- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index bb0074a4c6828..24f9479ced97b 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -752,19 +752,26 @@ int llama_mtl_eval( } printf("sum: %f\n", sum); } else if (t->type == GGML_TYPE_F16) { - const ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents; + ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents; printf("data: "); - int n = ggml_nelements(t); - if (n > 10) { - n = 10; - } - for (int i = 0; i < n; i++) { + for (int i = 0; i < (int) t->ne[0]; i++) { printf("%f ", ggml_fp16_to_fp32(data[i])); } printf("\n"); double sum = 0.0; - for (int i = 0; i < ggml_nelements(t); i++) { - sum += ggml_fp16_to_fp32(data[i]); + printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]); + for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) { + for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) { + for (int64_t i1 = 0; i1 < 
t->ne[1]; ++i1) { + for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) { + const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0]; + const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs)); + const float curf = ggml_fp16_to_fp32(cur); + if (isinf(curf)) continue; + sum += curf; + } + } + } } printf("sum: %f\n", sum); } else { diff --git a/llama.cpp b/llama.cpp index 6825636c840bb..2cf5a36fcaf22 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1341,11 +1341,6 @@ static bool llama_eval_internal( struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); ggml_set_name(KQ_soft_max, "KQ_soft_max"); - // TODO: TMP !!!! - if (il == 0) { - ggml_set_name(KQ_soft_max, "mtl-check"); - } - // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, @@ -1355,6 +1350,11 @@ static bool llama_eval_internal( il*n_ctx*ggml_element_size(kv_self.v)*n_embd); ggml_set_name(V, "V"); + // TODO: TMP !!!! + if (il == 0) { + ggml_set_name(V, "mtl-check"); + } + #if 1 struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); ggml_set_name(KQV, "KQV"); @@ -1479,13 +1479,24 @@ static bool llama_eval_internal( auto print_t_f16 = [&](struct ggml_tensor * t) { ggml_fp16_t * data = (ggml_fp16_t *)t->data; printf("data: "); - for (int i = 0; i < std::min((int) t->ne[0], 10); i++) { + for (int i = 0; i < (int) t->ne[0]; i++) { printf("%f ", ggml_fp16_to_fp32(data[i])); } printf("\n"); double sum = 0.0; - for (int i = 0; i < ggml_nelements(t); i++) { - sum += ggml_fp16_to_fp32(data[i]); + printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]); + for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) { + for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) { + for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) { + for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) { + const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0]; + const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs)); + const float curf = ggml_fp16_to_fp32(cur); + if (isinf(curf)) continue; + sum += curf; + } + } + } } printf("sum: %f\n", sum); }; From a0cc3de59ad9026079b7ab6d58da1c3b0cdfdd55 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 1 Jun 2023 21:30:33 +0300 Subject: [PATCH 26/49] mtl : add f32 -> f32 cpy kernel --- examples/mtl/mtl.m | 8 ++++++++ examples/mtl/mtl.metal | 42 ++++++++++++++++++++++++++++++++++++++++++ llama.cpp | 10 +++++----- 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 24f9479ced97b..c617f44019062 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -53,6 +53,9 @@ id function_cpy_f32_f16; id pipeline_cpy_f32_f16; + + id function_cpy_f32_f32; + id pipeline_cpy_f32_f32; }; // MSL code @@ -176,6 +179,10 @@ ctx->function_cpy_f32_f16 = [ctx->library newFunctionWithName:@"kernel_cpy_f32_f16"]; ctx->pipeline_cpy_f32_f16 = [ctx->device newComputePipelineStateWithFunction:ctx->function_cpy_f32_f16 error:nil]; fprintf(stderr, "%s: loaded kernel_cpy_f32_f16: %p\n", __func__, (void *) ctx->pipeline_cpy_f32_f16); + + ctx->function_cpy_f32_f32 = [ctx->library newFunctionWithName:@"kernel_cpy_f32_f32"]; + ctx->pipeline_cpy_f32_f32 = [ctx->device newComputePipelineStateWithFunction:ctx->function_cpy_f32_f32 error:nil]; + fprintf(stderr, "%s: loaded kernel_cpy_f32_f32: %p\n", __func__, (void *) ctx->pipeline_cpy_f32_f32); } // MTLBuffer approach @@ -669,6 +676,7 @@ int llama_mtl_eval( { switch (dstt) { case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break; + case 
GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break; default: GGML_ASSERT(false && "not implemented"); }; } break; diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index 32e850297f024..172a0fa7e820a 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -340,3 +340,45 @@ kernel void kernel_cpy_f32_f16( dst_data[i00] = src[0]; } } + +kernel void kernel_cpy_f32_f32( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + + device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + dst_data[i00] = src[0]; + } +} diff --git a/llama.cpp b/llama.cpp index 2cf5a36fcaf22..40292305e051f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1350,11 +1350,6 @@ static bool llama_eval_internal( il*n_ctx*ggml_element_size(kv_self.v)*n_embd); ggml_set_name(V, "V"); - // TODO: TMP !!!! - if (il == 0) { - ggml_set_name(V, "mtl-check"); - } - #if 1 struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); ggml_set_name(KQV, "KQV"); @@ -1376,6 +1371,11 @@ static bool llama_eval_internal( ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); ggml_set_name(cur, "KQV_merged_contiguous"); + // TODO: TMP !!!! 
+ if (il == 0) { + ggml_set_name(cur, "mtl-check"); + } + // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, From 42dca4004cbe392cad5d454728711b62351a78a7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 1 Jun 2023 21:35:11 +0300 Subject: [PATCH 27/49] mtl : add silu kernel --- examples/mtl/mtl.m | 24 ++++++++++++++++++++++++ examples/mtl/mtl.metal | 8 ++++++++ llama.cpp | 10 +++++----- 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index c617f44019062..7ad1722c0c1b9 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -30,6 +30,9 @@ id function_scale; id pipeline_scale; + id function_silu; + id pipeline_silu; + id function_relu; id pipeline_relu; @@ -148,6 +151,10 @@ ctx->pipeline_scale = [ctx->device newComputePipelineStateWithFunction:ctx->function_scale error:nil]; fprintf(stderr, "%s: loaded kernel_scale: %p\n", __func__, (void *) ctx->pipeline_scale); + ctx->function_silu = [ctx->library newFunctionWithName:@"kernel_silu"]; + ctx->pipeline_silu = [ctx->device newComputePipelineStateWithFunction:ctx->function_silu error:nil]; + fprintf(stderr, "%s: loaded kernel_silu: %p\n", __func__, (void *) ctx->pipeline_silu); + ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, (void *) ctx->pipeline_relu); @@ -357,6 +364,23 @@ int llama_mtl_eval( const int64_t n = ggml_nelements(gf->nodes[i]); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SILU: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + [encoder setComputePipelineState:ctx->pipeline_silu]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(gf->nodes[i]); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_RELU: diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index 172a0fa7e820a..2c6386990417f 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -58,6 +58,14 @@ kernel void kernel_scale( dst[tpig] = src0[tpig] * scale; } +kernel void kernel_silu( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + float x = src0[tpig]; + dst[tpig] = x / (1.0f + exp(-x)); +} + kernel void kernel_relu( device const float * src0, device float * dst, diff --git a/llama.cpp b/llama.cpp index 40292305e051f..52f91ae29f500 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1371,11 +1371,6 @@ static bool llama_eval_internal( ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); ggml_set_name(cur, "KQV_merged_contiguous"); - // TODO: TMP !!!! - if (il == 0) { - ggml_set_name(cur, "mtl-check"); - } - // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, @@ -1407,6 +1402,11 @@ static bool llama_eval_internal( // SILU activation cur = ggml_silu(ctx0, cur); + // TODO: TMP !!!! 
+ if (il == 0) { + ggml_set_name(cur, "mtl-check"); + } + cur = ggml_mul(ctx0, cur, tmp); cur = ggml_mul_mat(ctx0, From fbd3f6258de75d7b00e469c43a82b29e8728ea83 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 1 Jun 2023 21:40:53 +0300 Subject: [PATCH 28/49] mtl : add non-broadcast mul kernel --- examples/mtl/mtl.m | 17 ++++++++++++++++- examples/mtl/mtl.metal | 10 +++++++++- llama.cpp | 4 ++-- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 7ad1722c0c1b9..2de105640d7fa 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -27,6 +27,10 @@ id function_mul; id pipeline_mul; + // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast + id function_mul_row; + id pipeline_mul_row; + id function_scale; id pipeline_scale; @@ -147,6 +151,10 @@ ctx->pipeline_mul = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul error:nil]; fprintf(stderr, "%s: loaded kernel_mul: %p\n", __func__, (void *) ctx->pipeline_mul); + ctx->function_mul_row = [ctx->library newFunctionWithName:@"kernel_mul_row"]; + ctx->pipeline_mul_row = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul_row error:nil]; + fprintf(stderr, "%s: loaded kernel_mul_row: %p\n", __func__, (void *) ctx->pipeline_mul_row); + ctx->function_scale = [ctx->library newFunctionWithName:@"kernel_scale"]; ctx->pipeline_scale = [ctx->device newComputePipelineStateWithFunction:ctx->function_scale error:nil]; fprintf(stderr, "%s: loaded kernel_scale: %p\n", __func__, (void *) ctx->pipeline_scale); @@ -336,7 +344,14 @@ int llama_mtl_eval( const int64_t ne00 = gf->nodes[i]->src0->ne[0]; - [encoder setComputePipelineState:ctx->pipeline_mul]; + const int64_t ne10 = gf->nodes[i]->src1->ne[0]; + + if (ggml_nelements(gf->nodes[i]->src1) == ne10) { + // src1 is a row + [encoder setComputePipelineState:ctx->pipeline_mul_row]; + } else { + [encoder setComputePipelineState:ctx->pipeline_mul]; + } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index 2c6386990417f..9ab51963f7e70 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -39,9 +39,17 @@ kernel void kernel_add( dst[tpig] = src0[tpig] + src1[tpig]; } +kernel void kernel_mul( + device const float * src0, + device const float * src1, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] * src1[tpig]; +} + // assumption: src1 is a row // broadcast src1 into src0 -kernel void kernel_mul( +kernel void kernel_mul_row( device const float * src0, device const float * src1, device float * dst, diff --git a/llama.cpp b/llama.cpp index 52f91ae29f500..81d998c1898fd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1402,13 +1402,13 @@ static bool llama_eval_internal( // SILU activation cur = ggml_silu(ctx0, cur); + cur = ggml_mul(ctx0, cur, tmp); + // TODO: TMP !!!! 
if (il == 0) { ggml_set_name(cur, "mtl-check"); } - cur = ggml_mul(ctx0, cur, tmp); - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); From 9665429e94d2fea77e8f28d196868f01c105ceec Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 1 Jun 2023 21:50:01 +0300 Subject: [PATCH 29/49] mtl : full GPU inference of the computation graph --- examples/mtl/mtl.m | 86 ++++++++++++------------ llama.cpp | 160 ++++++++++++++++++++------------------------- 2 files changed, 115 insertions(+), 131 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 2de105640d7fa..85003ebdd075f 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -782,49 +782,49 @@ int llama_mtl_eval( // TODO const float * logits = ctx->out.contents; - { - struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check"); - if (t->type == GGML_TYPE_F32) { - const const float * data = (float *) ctx->out.contents; - printf("data: "); - for (int i = 0; i < (int) t->ne[0]; i++) { - printf("%f ", data[i]); - } - printf("\n"); - double sum = 0.0; - for (int i = 0; i < ggml_nelements(t); i++) { - double cur = data[i]; - if (isinf(cur)) continue; - sum += cur; - } - printf("sum: %f\n", sum); - } else if (t->type == GGML_TYPE_F16) { - ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents; - printf("data: "); - for (int i = 0; i < (int) t->ne[0]; i++) { - printf("%f ", ggml_fp16_to_fp32(data[i])); - } - printf("\n"); - double sum = 0.0; - printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]); - for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) { - for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) { - for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) { - for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) { - const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0]; - const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs)); - const float curf = ggml_fp16_to_fp32(cur); - if (isinf(curf)) continue; - sum += curf; - } - } - } - } - printf("sum: %f\n", sum); - } else { - GGML_ASSERT(false && "not implemented"); - } - } + //{ + // struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check"); + // if (t->type == GGML_TYPE_F32) { + // const const float * data = (float *) ctx->out.contents; + // printf("data: "); + // for (int i = 0; i < (int) t->ne[0]; i++) { + // printf("%f ", data[i]); + // } + // printf("\n"); + // double sum = 0.0; + // for (int i = 0; i < ggml_nelements(t); i++) { + // double cur = data[i]; + // if (isinf(cur)) continue; + // sum += cur; + // } + // printf("sum: %f\n", sum); + // } else if (t->type == GGML_TYPE_F16) { + // ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents; + // printf("data: "); + // for (int i = 0; i < (int) t->ne[0]; i++) { + // printf("%f ", ggml_fp16_to_fp32(data[i])); + // } + // printf("\n"); + // double sum = 0.0; + // printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]); + // for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) { + // for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) { + // for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) { + // for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) { + // const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0]; + // const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs)); + // const float curf = ggml_fp16_to_fp32(cur); + // if (isinf(curf)) continue; + // sum += curf; + // } + // } + // } + // } + // printf("sum: %f\n", sum); + // } else { + // GGML_ASSERT(false && "not implemented"); + // } + //} return 0; } diff --git a/llama.cpp b/llama.cpp index 
81d998c1898fd..e0fbc6f73839d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1243,10 +1243,6 @@ static bool llama_eval_internal( ggml_cgraph gf = {}; gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; - // TODO: TMP !!! - ggml_cgraph gf_export = {}; - gf_export.n_threads = 1; - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); ggml_set_name(embd, "embd"); memcpy(embd->data, tokens, N*ggml_element_size(embd)); @@ -1299,12 +1295,6 @@ static bool llama_eval_internal( ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); //ggml_build_forward_expand(&gf, t); - - // TODO: TMP !!!!!!!!!! - if (il == 0) { - ggml_build_forward_expand(&gf_export, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf_export, ggml_cpy(ctx0, Vcur, v)); - } } struct ggml_tensor * Q = @@ -1404,11 +1394,6 @@ static bool llama_eval_internal( cur = ggml_mul(ctx0, cur, tmp); - // TODO: TMP !!!! - if (il == 0) { - ggml_set_name(cur, "mtl-check"); - } - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); @@ -1444,84 +1429,83 @@ static bool llama_eval_internal( // logits -> probs //inpL = ggml_soft_max_inplace(ctx0, inpL); - // TODO: TMP !!!!!!!!!!!!!!!!!!!! // run the computation - //ggml_build_forward_expand(&gf, inpL); - //ggml_graph_compute (ctx0, &gf); - - // lets export a smaller graph to get things rolling -- baby steps first - { - struct ggml_tensor * t = ggml_get_tensor(ctx0, "mtl-check"); - if (!t) { - fprintf(stderr, "%s: failed to find tensor 'mtl-check'\n", __func__); - exit(1); - } - ggml_build_forward_expand(&gf_export, t); - } + ggml_build_forward_expand(&gf, inpL); + ggml_graph_compute (ctx0, &gf); + + // TODO: not needed anymore, keeping for a bit + //// lets export a smaller graph to get things rolling -- baby steps first + //{ + // struct ggml_tensor * t = ggml_get_tensor(ctx0, "mtl-check"); + // if (!t) { + // fprintf(stderr, "%s: failed to find tensor 'mtl-check'\n", __func__); + // exit(1); + // } + // ggml_build_forward_expand(&gf, t); + //} // print - { - auto print_t_f32 = [&](struct ggml_tensor * t) { - float * data = (float *)t->data; - printf("data: "); - for (int i = 0; i < (int) t->ne[0]; i++) { - printf("%f ", data[i]); - } - printf("\n"); - double sum = 0.0; - for (int i = 0; i < ggml_nelements(t); i++) { - double cur = data[i]; - if (isinf(cur)) continue; - sum += data[i]; - } - printf("sum: %f\n", sum); - }; - auto print_t_f16 = [&](struct ggml_tensor * t) { - ggml_fp16_t * data = (ggml_fp16_t *)t->data; - printf("data: "); - for (int i = 0; i < (int) t->ne[0]; i++) { - printf("%f ", ggml_fp16_to_fp32(data[i])); - } - printf("\n"); - double sum = 0.0; - printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]); - for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) { - for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) { - for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) { - for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) { - const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0]; - const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs)); - const float curf = ggml_fp16_to_fp32(cur); - if (isinf(curf)) continue; - sum += curf; - } - } - } - } - printf("sum: %f\n", sum); - }; - - ggml_graph_compute(ctx0, &gf_export); - - { - auto * t = ggml_get_tensor(ctx0, "mtl-check"); - switch (t->type) { - case GGML_TYPE_F32: - print_t_f32(t); - break; - case GGML_TYPE_F16: - print_t_f16(t); - break; - default: - fprintf(stderr, "%s: unsupported type\n", __func__); - exit(1); - 
} - } - } + //{ + // auto print_t_f32 = [&](struct ggml_tensor * t) { + // float * data = (float *)t->data; + // printf("data: "); + // for (int i = 0; i < (int) t->ne[0]; i++) { + // printf("%f ", data[i]); + // } + // printf("\n"); + // double sum = 0.0; + // for (int i = 0; i < ggml_nelements(t); i++) { + // double cur = data[i]; + // if (isinf(cur)) continue; + // sum += data[i]; + // } + // printf("sum: %f\n", sum); + // }; + // auto print_t_f16 = [&](struct ggml_tensor * t) { + // ggml_fp16_t * data = (ggml_fp16_t *)t->data; + // printf("data: "); + // for (int i = 0; i < (int) t->ne[0]; i++) { + // printf("%f ", ggml_fp16_to_fp32(data[i])); + // } + // printf("\n"); + // double sum = 0.0; + // printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]); + // for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) { + // for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) { + // for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) { + // for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) { + // const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0]; + // const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs)); + // const float curf = ggml_fp16_to_fp32(cur); + // if (isinf(curf)) continue; + // sum += curf; + // } + // } + // } + // } + // printf("sum: %f\n", sum); + // }; + + // ggml_graph_compute(ctx0, &gf); + + // { + // auto * t = ggml_get_tensor(ctx0, "mtl-check"); + // switch (t->type) { + // case GGML_TYPE_F32: + // print_t_f32(t); + // break; + // case GGML_TYPE_F16: + // print_t_f16(t); + // break; + // default: + // fprintf(stderr, "%s: unsupported type\n", __func__); + // exit(1); + // } + // } + //} if (cgraph_fname) { - //ggml_graph_export(&gf, cgraph_fname); - ggml_graph_export(&gf_export, cgraph_fname); + ggml_graph_export(&gf, cgraph_fname); } #ifdef GGML_PERF From f0196a7e7a2ceddf66a0524b5818c02f0453243e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 1 Jun 2023 22:51:42 +0300 Subject: [PATCH 30/49] mtl : optimize rms_norm and soft_max kernels --- examples/mtl/mtl.cpp | 8 +-- examples/mtl/mtl.m | 59 ++++++++++++++------- examples/mtl/mtl.metal | 117 +++++++++++++++++++++++++++++++++-------- ggml.c | 4 +- llama.cpp | 25 +++++++-- 5 files changed, 166 insertions(+), 47 deletions(-) diff --git a/examples/mtl/mtl.cpp b/examples/mtl/mtl.cpp index 40e8fbceea39a..7f52453d830c1 100644 --- a/examples/mtl/mtl.cpp +++ b/examples/mtl/mtl.cpp @@ -41,13 +41,15 @@ int main(int argc, char ** argv) { // TODO: tmp to match the input used when creating the cgraph { - const int n_past = 128; - const int n_batch = 32; + const int n_batch = 1; + const int n_past = 512 - n_batch; const std::vector tmp(n_batch, 1); // BOS // the actual inference happens here - llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past); + for (int i = 0; i < 10; ++i) { + llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past); + } } llama_mtl_free(ctx_mtl); diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 85003ebdd075f..ff1adf6dfebd5 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -429,14 +429,17 @@ int llama_mtl_eval( const int64_t ne02 = gf->nodes[i]->src0->ne[2]; const int64_t ne03 = gf->nodes[i]->src0->ne[3]; + const int nth = 32; + [encoder setComputePipelineState:ctx->pipeline_soft_max]; [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + 
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_DIAG_MASK_INF: { @@ -494,10 +497,10 @@ int llama_mtl_eval( const enum ggml_type src1t = gf->nodes[i]->src1->type; const enum ggml_type dstt = gf->nodes[i]->type; - printf("mul_mat: src0 - %s[%lld, %lld, %lld]\n", ggml_type_name(src0t), ne00, ne01, ne02); - printf("mul_mat: src1 - %s[%lld, %lld, %lld]\n", ggml_type_name(src1t), ne10, ne11, ne12); - printf("mul_mat: dst - %s[%lld, %lld, %lld]\n", ggml_type_name(dstt), ne0, ne1, ne2); - printf("mul_mat: %s * %s -> %s\n", ggml_type_name(src0t), ggml_type_name(src1t), ggml_type_name(dstt)); + fprintf(stderr, "mul_mat: src0 - %s[%lld, %lld, %lld]\n", ggml_type_name(src0t), ne00, ne01, ne02); + fprintf(stderr, "mul_mat: src1 - %s[%lld, %lld, %lld]\n", ggml_type_name(src1t), ne10, ne11, ne12); + fprintf(stderr, "mul_mat: dst - %s[%lld, %lld, %lld]\n", ggml_type_name(dstt), ne0, ne1, ne2); + fprintf(stderr, "mul_mat: %s * %s -> %s\n", ggml_type_name(src0t), ggml_type_name(src1t), ggml_type_name(dstt)); GGML_ASSERT(ne00 == ne10); GGML_ASSERT(ne02 == ne12); @@ -599,16 +602,19 @@ int llama_mtl_eval( const uint64_t nb01 = gf->nodes[i]->src0->nb[1]; const float eps = 1e-6f; + const int nth = 32; + [encoder setComputePipelineState:ctx->pipeline_rms_norm]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; const int64_t nrows = ggml_nrows(gf->nodes[i]->src0); - [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_ROPE: { @@ -643,9 +649,9 @@ int llama_mtl_eval( const int n_dims = ((int32_t *) gf->nodes[i]->src1->data)[1]; const int mode = ((int32_t *) gf->nodes[i]->src1->data)[2]; - printf("rope: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); - printf("rope: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); - printf("rope: n_past = %d, n_dims = %d, mode = %d\n", n_past, n_dims, mode); + fprintf(stderr, "rope: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); + fprintf(stderr, "rope: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); + fprintf(stderr, "rope: n_past = %d, n_dims = %d, mode = %d\n", n_past, n_dims, mode); [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -704,11 +710,13 @@ int llama_mtl_eval( const enum ggml_type src0t = gf->nodes[i]->src0->type; const enum ggml_type dstt = gf->nodes[i]->type; - printf("cpy: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); - printf("cpy: %lld x %lld x %lld x %lld\n", nb00, nb01, nb02, nb03); - printf("cpy: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); - printf("cpy: %lld x %lld x %lld x %lld\n", nb0, nb1, nb2, nb3); - printf("cpy: %s -> %s\n", ggml_type_name(src0t), ggml_type_name(dstt)); + const int nth = 32; + + fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); + fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", nb00, nb01, 
nb02, nb03); + fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); + fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", nb0, nb1, nb2, nb3); + fprintf(stderr, "cpy: %s -> %s\n", ggml_type_name(src0t), ggml_type_name(dstt)); switch (src0t) { case GGML_TYPE_F32: @@ -741,7 +749,7 @@ int llama_mtl_eval( [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); @@ -764,8 +772,6 @@ int llama_mtl_eval( id id_src = llama_mtl_get_buffer(ctx, out, &offs_src0); id id_dst = ctx->out; - printf("XXXXX n = %d\n", ggml_nelements(out)); - id encoder_blit = [command_buffer blitCommandEncoder]; [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(out)]; [encoder_blit endEncoding]; @@ -776,12 +782,29 @@ int llama_mtl_eval( { const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime]; - fprintf(stderr, "%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0); + printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0); } // TODO const float * logits = ctx->out.contents; + printf("logits: "); + for (int i = 0; i < 100; i++) { + printf("%8.4f ", logits[i]); + } + printf("\n"); + double sum = 0.0; + int imax = 0; + double vmax = -INFINITY; + for (int i = 0; i < 32000; i++) { + sum += (double) logits[i]; + if (logits[i] > vmax) { + vmax = logits[i]; + imax = i; + } + } + printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax); + //{ // struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check"); // if (t->type == GGML_TYPE_F32) { diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index 9ab51963f7e70..f8446d17f0042 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -87,25 +87,80 @@ kernel void kernel_soft_max( constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, - uint3 tpig[[thread_position_in_grid]]) { - const int64_t i03 = tpig[2]; - const int64_t i02 = tpig[1]; - const int64_t i01 = tpig[0]; + threadgroup float * buf [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; - float max = 0.0f; - for (int i = 0; i < ne00; i++) { - max = MAX(max, psrc0[i]); + //float max = 0.0f; + //for (int i = 0; i < ne00; i++) { + // max = MAX(max, psrc0[i]); + //} + //float sum = 0.0f; + //for (int i = 0; i < ne00; i++) { + // pdst[i] = exp(psrc0[i] - max); + // sum += pdst[i]; + //} + //for (int i = 0; i < ne00; i++) { + // pdst[i] /= sum; + //} + + // parallel max + buf[tpitg[0]] = -INFINITY; + for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { + buf[tpitg[0]] = MAX(buf[tpitg[0]], psrc0[i00]); + } + + // reduce + threadgroup_barrier(mem_flags::mem_threadgroup); + for (uint i = ntg[0]/2; i > 0; i /= 2) { + if (tpitg[0] < i) { + buf[tpitg[0]] = MAX(buf[tpitg[0]], buf[tpitg[0] + i]); + } + 
threadgroup_barrier(mem_flags::mem_threadgroup); } - float sum = 0.0f; - for (int i = 0; i < ne00; i++) { - pdst[i] = exp(psrc0[i] - max); - sum += pdst[i]; + + // broadcast + if (tpitg[0] == 0) { + buf[0] = buf[0]; } - for (int i = 0; i < ne00; i++) { - pdst[i] /= sum; + + threadgroup_barrier(mem_flags::mem_threadgroup); + + const float max = buf[0]; + + // parallel sum + buf[tpitg[0]] = 0.0f; + for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { + buf[tpitg[0]] += exp(psrc0[i00] - max); + } + + // reduce + threadgroup_barrier(mem_flags::mem_threadgroup); + for (uint i = ntg[0]/2; i > 0; i /= 2) { + if (tpitg[0] < i) { + buf[tpitg[0]] += buf[tpitg[0] + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + // broadcast + if (tpitg[0] == 0) { + buf[0] = buf[0]; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + const float sum = buf[0]; + + for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { + pdst[i00] = exp(psrc0[i00] - max) / sum; } } @@ -149,19 +204,39 @@ kernel void kernel_rms_norm( constant int64_t & ne00, constant uint64_t & nb01, constant float & eps, - uint tpig[[thread_position_in_grid]]) { - device const float * x = (device const float *) ((device const char *) src0 + tpig*nb01); + threadgroup float * sum [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01); + + // parallel sum + sum[tpitg] = 0.0f; + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + sum[tpitg] += x[i00] * x[i00]; + } - float sum = 0.0f; - for (int i00 = 0; i00 < ne00; i00++) { - sum += x[i00] * x[i00]; + // reduce + threadgroup_barrier(mem_flags::mem_threadgroup); + for (uint i = ntg/2; i > 0; i /= 2) { + if (tpitg < i) { + sum[tpitg] += sum[tpitg + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); } - const float mean = sum/ne00; + // broadcast + if (tpitg == 0) { + sum[0] /= ne00; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + const float mean = sum[0]; const float scale = 1.0f/sqrt(mean + eps); - device float * y = dst + tpig*ne00; - for (int i00 = 0; i00 < ne00; i00++) { + device float * y = dst + tgpig*ne00; + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { y[i00] = x[i00] * scale; } } diff --git a/ggml.c b/ggml.c index 330a896ca2c2d..1c9bb4e61cb14 100644 --- a/ggml.c +++ b/ggml.c @@ -14647,8 +14647,8 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char } void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { - assert(cgraph->work == NULL); - assert(cgraph->work_size == 0); + //assert(cgraph->work == NULL); + //assert(cgraph->work_size == 0); uint64_t size_eval = 0; diff --git a/llama.cpp b/llama.cpp index e0fbc6f73839d..c998a77fb7dba 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1506,6 +1506,25 @@ static bool llama_eval_internal( if (cgraph_fname) { ggml_graph_export(&gf, cgraph_fname); + + float * logits = (float *) ggml_get_data(inpL); + + printf("logits: "); + for (int i = 0; i < 10; i++) { + printf("%8.4f ", logits[i]); + } + printf("\n"); + double sum = 0.0; + int imax = 0; + double vmax = -INFINITY; + for (int i = 0; i < 32000; i++) { + sum += (double) logits[i]; + if (logits[i] > vmax) { + vmax = logits[i]; + imax = i; + } + } + printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax); } #ifdef GGML_PERF @@ -3002,11 +3021,11 @@ int llama_eval( int llama_eval_export(struct llama_context * ctx, const char * 
fname) { // these values determine the maximum inference sizes of the exported computation graph - // TODO: TMP !!! + // TODO: need to increase buffers to support the full context //const int n_ctx = ctx->model.hparams.n_ctx; //const int n_batch = 512; - const int n_ctx = 128; - const int n_batch = 32; + const int n_batch = 1; + const int n_ctx = 512 - n_batch; const std::vector tmp(n_batch, llama_token_bos()); From e55f7b0bdb576567c48c3374d425651e3a4a5d07 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 1 Jun 2023 23:37:49 +0300 Subject: [PATCH 31/49] mtl : add f16 mat x f32 vec multiplication kernel --- examples/mtl/mtl.m | 36 ++++++++++++++------ examples/mtl/mtl.metal | 76 +++++++++++++++++++++++------------------- 2 files changed, 68 insertions(+), 44 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index ff1adf6dfebd5..372396047c5ea 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -52,8 +52,11 @@ id function_rms_norm; id pipeline_rms_norm; - id function_mul_mat_q4_0; - id pipeline_mul_mat_q4_0; + id function_mul_mat_q4_0_f32; + id pipeline_mul_mat_q4_0_f32; + + id function_mul_mat_f16_f32; + id pipeline_mul_mat_f16_f32; id function_rope; id pipeline_rope; @@ -183,9 +186,13 @@ ctx->pipeline_rms_norm = [ctx->device newComputePipelineStateWithFunction:ctx->function_rms_norm error:nil]; fprintf(stderr, "%s: loaded kernel_rms_norm: %p\n", __func__, (void *) ctx->pipeline_rms_norm); - ctx->function_mul_mat_q4_0 = [ctx->library newFunctionWithName:@"kernel_mul_mat_q4_0"]; - ctx->pipeline_mul_mat_q4_0 = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul_mat_q4_0 error:nil]; - fprintf(stderr, "%s: loaded kernel_mul_mat_q4_0: %p\n", __func__, (void *) ctx->pipeline_mul_mat_q4_0); + ctx->function_mul_mat_q4_0_f32 = [ctx->library newFunctionWithName:@"kernel_mul_mat_q4_0_f32"]; + ctx->pipeline_mul_mat_q4_0_f32 = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul_mat_q4_0_f32 error:nil]; + fprintf(stderr, "%s: loaded kernel_mul_mat_q4_0_f32: %p\n", __func__, (void *) ctx->pipeline_mul_mat_q4_0_f32); + + ctx->function_mul_mat_f16_f32 = [ctx->library newFunctionWithName:@"kernel_mul_mat_f16_f32"]; + ctx->pipeline_mul_mat_f16_f32 = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul_mat_f16_f32 error:nil]; + fprintf(stderr, "%s: loaded kernel_mul_mat_f16_f32: %p\n", __func__, (void *) ctx->pipeline_mul_mat_f16_f32); ctx->function_rope = [ctx->library newFunctionWithName:@"kernel_rope"]; ctx->pipeline_rope = [ctx->device newComputePipelineStateWithFunction:ctx->function_rope error:nil]; @@ -493,6 +500,8 @@ int llama_mtl_eval( //const uint64_t nb1 = gf->nodes[i]->nb[1]; const uint64_t nb2 = gf->nodes[i]->nb[2]; + const int nth = 16; + const enum ggml_type src0t = gf->nodes[i]->src0->type; const enum ggml_type src1t = gf->nodes[i]->src1->type; const enum ggml_type dstt = gf->nodes[i]->type; @@ -505,7 +514,7 @@ int llama_mtl_eval( GGML_ASSERT(ne00 == ne10); GGML_ASSERT(ne02 == ne12); - if (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) { + if ((src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) { if (encoder != nil) { [encoder endEncoding]; encoder = nil; @@ -528,6 +537,8 @@ int llama_mtl_eval( initWithDevice:ctx->device transposeLeft:false transposeRight:true resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0]; + // we need to do ne02 multiplications + // TODO: is there a way to do this in parallel - currently very slow .. 
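// each iteration below wraps one 2D slice of src0/src1/dst in an MPSMatrix at the
// corresponding byte offset and re-encodes the MPSMatrixMultiplication object above
// for that slice -- ne02 separate GEMM encodes on the same command buffer, which is
// what the TODO above would like to fold into a single batched dispatch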
for (int64_t i02 = 0; i02 < ne02; ++i02) { size_t offs_src0_cur = offs_src0 + i02*nb02; size_t offs_src1_cur = offs_src1 + i02*nb12; @@ -544,8 +555,13 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - // for Q4 x F32 we use custom kernel - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0]; + // use custom matrix x vector kernel + switch (src0t) { + case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32]; break; + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; break; + default: GGML_ASSERT(false && "not implemented"); + }; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -555,9 +571,9 @@ int llama_mtl_eval( [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:6]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:7]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:8]; - [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; + [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } } break; case GGML_OP_GET_ROWS: diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index f8446d17f0042..1bada42dd7454 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -241,7 +241,7 @@ kernel void kernel_rms_norm( } } -kernel void kernel_mul_mat_q4_0( +kernel void kernel_mul_mat_q4_0_f32( device const void * src0, device const float * src1, device float * dst, @@ -268,39 +268,6 @@ kernel void kernel_mul_mat_q4_0( sum[tpitg.x] = 0.0f; for (int i = tpitg.x; i < nb; i += tptg.x) { - //device const uint4 * x0p = (device const uint4 *) (x + i)->qs; - //device const float4 * y0p = (device const float4 *) (y + i*qk); - - //const uint4 x0 = *x0p; - - //const uint4 x0l = (x0 & uint4(0x0F0F0F0F)); - //const uint4 x0h = (x0 & uint4(0xF0F0F0F0)) >> 4; - - //thread const char * x0lsb = (thread const char *) &x0l; - //thread const char * x0hsb = (thread const char *) &x0h; - - //const float4 y00 = *(y0p + 0); - //const float4 y01 = *(y0p + 1); - //const float4 y02 = *(y0p + 2); - //const float4 y03 = *(y0p + 3); - //const float4 y04 = *(y0p + 4); - //const float4 y05 = *(y0p + 5); - //const float4 y06 = *(y0p + 6); - //const float4 y07 = *(y0p + 7); - - //const half d = (x + i)->d; - - //sum[tpitg.x] += ( - // (x0lsb[ 0] - 8)*y00[0] + (x0lsb[ 1] - 8)*y00[1] + (x0lsb[ 2] - 8)*y00[2] + (x0lsb[ 3] - 8)*y00[3] + - // (x0lsb[ 4] - 8)*y01[0] + (x0lsb[ 5] - 8)*y01[1] + (x0lsb[ 6] - 8)*y01[2] + (x0lsb[ 7] - 8)*y01[3] + - // (x0lsb[ 8] - 8)*y02[0] + (x0lsb[ 9] - 8)*y02[1] + (x0lsb[10] - 8)*y02[2] + (x0lsb[11] - 8)*y02[3] + - // (x0lsb[12] - 8)*y03[0] + (x0lsb[13] - 8)*y03[1] + (x0lsb[14] - 8)*y03[2] + (x0lsb[15] - 8)*y03[3] + - // (x0hsb[ 0] - 8)*y04[0] + (x0hsb[ 1] - 8)*y04[1] + (x0hsb[ 2] - 8)*y04[2] + (x0hsb[ 3] - 8)*y04[3] + - // (x0hsb[ 4] - 8)*y05[0] + (x0hsb[ 5] - 8)*y05[1] + (x0hsb[ 6] - 8)*y05[2] + (x0hsb[ 7] - 8)*y05[3] + - // (x0hsb[ 8] - 8)*y06[0] + (x0hsb[ 9] - 8)*y06[1] + (x0hsb[10] - 8)*y06[2] + (x0hsb[11] - 8)*y06[3] + - // (x0hsb[12] - 8)*y07[0] + (x0hsb[13] - 8)*y07[1] + (x0hsb[14] - 8)*y07[2] + (x0hsb[15] - 8)*y07[3] - // ) * d; - device const uchar * x0p = (device const uchar *) (x + i)->qs; device const float * y0p = (device const float 
*) (y + i*qk); @@ -335,6 +302,47 @@ kernel void kernel_mul_mat_q4_0( } } +kernel void kernel_mul_mat_f16_f32( + device const half * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne0, + constant int64_t & ne1, + threadgroup float * sum [[threadgroup(0)]], + uint2 tgpig[[threadgroup_position_in_grid]], + uint2 tpig[[thread_position_in_grid]], + uint2 tpitg[[thread_position_in_threadgroup]], + uint2 tptg[[threads_per_threadgroup]]) { + const int64_t r0 = tgpig.x; + const int64_t r1 = tgpig.y; + + device const half * x = src0 + r0*ne00; + device const float * y = src1 + r1*ne10; + + sum[tpitg.x] = 0.0f; + + for (int i = tpitg.x; i < ne00; i += tptg.x) { + sum[tpitg.x] += (float) x[i] * (float) y[i]; + } + + // accumulate the sum from all threads in the threadgroup + threadgroup_barrier(mem_flags::mem_threadgroup); + for (uint i = tptg.x/2; i > 0; i /= 2) { + if (tpitg.x < i) { + sum[tpitg.x] += sum[tpitg.x + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + if (tpitg.x == 0) { + dst[r1*ne0 + r0] = sum[0]; + } +} + kernel void kernel_rope( device const void * src0, device float * dst, From 33671460b0d5b39b470a2e25a0f1a85ac0ac5da6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 2 Jun 2023 18:23:51 +0300 Subject: [PATCH 32/49] mtl : fix bug in f16 x f32 mul mat + speed-up computation --- examples/mtl/mtl.m | 67 +++++++++++++++++++++++++++++------------- examples/mtl/mtl.metal | 44 +++++++++++++++++---------- ggml.c | 4 +-- ggml.h | 3 ++ 4 files changed, 80 insertions(+), 38 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 372396047c5ea..ebd3d6235c51d 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -480,41 +480,41 @@ int llama_mtl_eval( const int64_t ne01 = gf->nodes[i]->src0->ne[1]; const int64_t ne02 = gf->nodes[i]->src0->ne[2]; - //const uint64_t nb00 = gf->nodes[i]->src0->nb[0]; - //const uint64_t nb01 = gf->nodes[i]->src0->nb[1]; + const uint64_t nb00 = gf->nodes[i]->src0->nb[0]; + const uint64_t nb01 = gf->nodes[i]->src0->nb[1]; const uint64_t nb02 = gf->nodes[i]->src0->nb[2]; const int64_t ne10 = gf->nodes[i]->src1->ne[0]; const int64_t ne11 = gf->nodes[i]->src1->ne[1]; const int64_t ne12 = gf->nodes[i]->src1->ne[2]; - //const uint64_t nb10 = gf->nodes[i]->src1->nb[0]; - //const uint64_t nb11 = gf->nodes[i]->src1->nb[1]; + const uint64_t nb10 = gf->nodes[i]->src1->nb[0]; + const uint64_t nb11 = gf->nodes[i]->src1->nb[1]; const uint64_t nb12 = gf->nodes[i]->src1->nb[2]; const int64_t ne0 = gf->nodes[i]->ne[0]; const int64_t ne1 = gf->nodes[i]->ne[1]; const int64_t ne2 = gf->nodes[i]->ne[2]; - //const uint64_t nb0 = gf->nodes[i]->nb[0]; - //const uint64_t nb1 = gf->nodes[i]->nb[1]; + const uint64_t nb0 = gf->nodes[i]->nb[0]; + const uint64_t nb1 = gf->nodes[i]->nb[1]; const uint64_t nb2 = gf->nodes[i]->nb[2]; - const int nth = 16; - const enum ggml_type src0t = gf->nodes[i]->src0->type; const enum ggml_type src1t = gf->nodes[i]->src1->type; const enum ggml_type dstt = gf->nodes[i]->type; - fprintf(stderr, "mul_mat: src0 - %s[%lld, %lld, %lld]\n", ggml_type_name(src0t), ne00, ne01, ne02); - fprintf(stderr, "mul_mat: src1 - %s[%lld, %lld, %lld]\n", ggml_type_name(src1t), ne10, ne11, ne12); + fprintf(stderr, "mul_mat: src0 - %s[%lld, %lld, %lld], %d\n", ggml_type_name(src0t), ne00, ne01, ne02, ggml_is_contiguous(gf->nodes[i]->src0)); + fprintf(stderr, "mul_mat: src1 - %s[%lld, %lld, %lld], %d\n", 
ggml_type_name(src1t), ne10, ne11, ne12, ggml_is_contiguous(gf->nodes[i]->src1)); fprintf(stderr, "mul_mat: dst - %s[%lld, %lld, %lld]\n", ggml_type_name(dstt), ne0, ne1, ne2); fprintf(stderr, "mul_mat: %s * %s -> %s\n", ggml_type_name(src0t), ggml_type_name(src1t), ggml_type_name(dstt)); GGML_ASSERT(ne00 == ne10); GGML_ASSERT(ne02 == ne12); - if ((src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) { + if (ggml_is_contiguous(gf->nodes[i]->src0) && + ggml_is_contiguous(gf->nodes[i]->src1) && + (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) { if (encoder != nil) { [encoder endEncoding]; encoder = nil; @@ -555,25 +555,52 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } + int nth = 32; + // use custom matrix x vector kernel switch (src0t) { - case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32]; break; - case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; break; + case GGML_TYPE_Q4_0: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth = 4; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32]; + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(ne02 == ne12); + + nth = 32; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; + } break; default: GGML_ASSERT(false && "not implemented"); }; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:5]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:6]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:7]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:8]; - [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; + + if (src0t == GGML_TYPE_Q4_0) { + [encoder setThreadgroupMemoryLength:16*nth*sizeof(float) atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth, 16, 1)]; + } else { + [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } } } break; case GGML_OP_GET_ROWS: diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index 1bada42dd7454..2272f9ff3ee12 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -265,7 +265,10 @@ kernel void kernel_mul_mat_q4_0_f32( device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb; device const float * y = (device const float *) src1 + r1*ne10; - sum[tpitg.x] = 0.0f; + const uint nth = tptg.x*tptg.y; + const uint ith = 16*tpitg.x + tpitg.y; + + sum[ith] = 0.0f; for (int i = 
tpitg.x; i < nb; i += tptg.x) { device const uchar * x0p = (device const uchar *) (x + i)->qs; @@ -273,7 +276,9 @@ kernel void kernel_mul_mat_q4_0_f32( float acc = 0.0f; - for (int j = 0; j < 16; ++j) { + //for (int j = 0; j < 16; ++j) { + const int j = tpitg.y; + { const uchar x0v = *(x0p + j); const int x0 = x0v & 0x0F; @@ -285,43 +290,50 @@ kernel void kernel_mul_mat_q4_0_f32( acc += (x0 - 8)*y0 + (x1 - 8)*y1; } - sum[tpitg.x] += acc * (x + i)->d; + sum[ith] += acc * (x + i)->d; } // accumulate the sum from all threads in the threadgroup threadgroup_barrier(mem_flags::mem_threadgroup); - for (uint i = tptg.x/2; i > 0; i /= 2) { - if (tpitg.x < i) { - sum[tpitg.x] += sum[tpitg.x + i]; + for (uint i = nth/2; i > 0; i /= 2) { + if (ith < i) { + sum[ith] += sum[ith + i]; } threadgroup_barrier(mem_flags::mem_threadgroup); } - if (tpitg.x == 0) { + if (ith == 0) { dst[r1*ne0 + r0] = sum[0]; } } kernel void kernel_mul_mat_f16_f32( - device const half * src0, - device const float * src1, + device const char * src0, + device const char * src1, device float * dst, constant int64_t & ne00, constant int64_t & ne01, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, constant int64_t & ne10, constant int64_t & ne11, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, threadgroup float * sum [[threadgroup(0)]], - uint2 tgpig[[threadgroup_position_in_grid]], - uint2 tpig[[thread_position_in_grid]], - uint2 tpitg[[thread_position_in_threadgroup]], - uint2 tptg[[threads_per_threadgroup]]) { + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpig[[thread_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 tptg[[threads_per_threadgroup]]) { const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; + const int64_t im = tgpig.z; - device const half * x = src0 + r0*ne00; - device const float * y = src1 + r1*ne10; + device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02); + device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); sum[tpitg.x] = 0.0f; @@ -339,7 +351,7 @@ kernel void kernel_mul_mat_f16_f32( } if (tpitg.x == 0) { - dst[r1*ne0 + r0] = sum[0]; + dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0]; } } diff --git a/ggml.c b/ggml.c index 1c9bb4e61cb14..b5e6997dd2d71 100644 --- a/ggml.c +++ b/ggml.c @@ -3821,11 +3821,11 @@ size_t ggml_tensor_overhead(void) { return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16; } -static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { +bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } -static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) { +bool ggml_is_contiguous(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return diff --git a/ggml.h b/ggml.h index 1f033b4920af6..7f821cf32e966 100644 --- a/ggml.h +++ b/ggml.h @@ -442,6 +442,9 @@ extern "C" { // TODO: temporary until model loading of ggml examples is refactored GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); + GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); + GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); + // use this to compute the memory overhead of a tensor GGML_API size_t ggml_tensor_overhead(void); From 847bbfe9e6cd9ba99068ebe0c803b4dc74366802 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 2 Jun 2023 18:28:31 +0300 
Subject: [PATCH 33/49] mtl : faster mul_mat_q4_0_f32 kernel --- examples/mtl/mtl.m | 17 +++++++------ examples/mtl/mtl.metal | 54 +++++++++++++++++++++++++++++++++--------- 2 files changed, 53 insertions(+), 18 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index ebd3d6235c51d..4ef1efae41b85 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -555,7 +555,8 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - int nth = 32; + int nth0 = 32; + int nth1 = 1; // use custom matrix x vector kernel switch (src0t) { @@ -564,14 +565,16 @@ int llama_mtl_eval( GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne12 == 1); - nth = 4; + nth0 = 8; + nth1 = 4; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32]; } break; case GGML_TYPE_F16: { GGML_ASSERT(ne02 == ne12); - nth = 32; + nth0 = 32; + nth1 = 1; [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; } break; default: GGML_ASSERT(false && "not implemented"); @@ -595,11 +598,11 @@ int llama_mtl_eval( [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; if (src0t == GGML_TYPE_Q4_0) { - [encoder setThreadgroupMemoryLength:16*nth*sizeof(float) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth, 16, 1)]; + [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else { - [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } } } break; diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index 2272f9ff3ee12..5bbbfecd64427 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -266,31 +266,63 @@ kernel void kernel_mul_mat_q4_0_f32( device const float * y = (device const float *) src1 + r1*ne10; const uint nth = tptg.x*tptg.y; - const uint ith = 16*tpitg.x + tpitg.y; + const uint ith = tptg.y*tpitg.x + tpitg.y; sum[ith] = 0.0f; for (int i = tpitg.x; i < nb; i += tptg.x) { - device const uchar * x0p = (device const uchar *) (x + i)->qs; - device const float * y0p = (device const float *) (y + i*qk); + device const uchar4 * x0p = (device const uchar4 *) (x + i)->qs; + device const float4 * y0p = (device const float4 *) (y + i*qk); + + const float d = (float)((x + i)->d); + + const uchar4 x0v = *(x0p + tpitg.y); + const float4 y0v = *(y0p + tpitg.y + 0); + const float4 y1v = *(y0p + tpitg.y + 4); float acc = 0.0f; - //for (int j = 0; j < 16; ++j) { - const int j = tpitg.y; { - const uchar x0v = *(x0p + j); + const int x0 = x0v[0] & 0x0F; + const int x1 = x0v[0] >> 4; + + const float y0 = y0v[0]; + const float y1 = y1v[0]; + + acc += (x0 - 8)*y0 + (x1 - 8)*y1; + } + + { + const int x0 = x0v[1] & 0x0F; + const int x1 = x0v[1] >> 4; - const int x0 = x0v & 0x0F; - const int x1 = x0v >> 4; + const float y0 = y0v[1]; + const float y1 = y1v[1]; + + acc += (x0 - 8)*y0 + (x1 - 8)*y1; + } + + { + const int x0 = x0v[2] & 0x0F; + const int x1 = x0v[2] >> 4; + + const float y0 = y0v[2]; + const float y1 = y1v[2]; + + acc += (x0 - 8)*y0 + (x1 - 8)*y1; + } + + { + const int x0 = x0v[3] & 0x0F; + const int x1 = x0v[3] >> 4; - const float y0 = *(y0p + j); - const float y1 = *(y0p + j + 16); + const float y0 = 
y0v[3]; + const float y1 = y1v[3]; acc += (x0 - 8)*y0 + (x1 - 8)*y1; } - sum[ith] += acc * (x + i)->d; + sum[ith] += acc*d; } // accumulate the sum from all threads in the threadgroup From 70c3387726f9af4dcd6948e84a9adfe2c4c0f244 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 2 Jun 2023 19:11:39 +0300 Subject: [PATCH 34/49] mtl : fix kernel signature + roll inner loop --- examples/mtl/mtl.m | 1 + examples/mtl/mtl.metal | 53 +++++++++++------------------------------- 2 files changed, 15 insertions(+), 39 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 4ef1efae41b85..c74c28cd953b5 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -598,6 +598,7 @@ int llama_mtl_eval( [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; if (src0t == GGML_TYPE_Q4_0) { + //printf("nb = %d\n", ne00/32); [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else { diff --git a/examples/mtl/mtl.metal b/examples/mtl/mtl.metal index 5bbbfecd64427..53f7f7448b14f 100644 --- a/examples/mtl/mtl.metal +++ b/examples/mtl/mtl.metal @@ -247,8 +247,14 @@ kernel void kernel_mul_mat_q4_0_f32( device float * dst, constant int64_t & ne00, constant int64_t & ne01, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, constant int64_t & ne10, constant int64_t & ne11, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, threadgroup float * sum [[threadgroup(0)]], @@ -256,12 +262,11 @@ kernel void kernel_mul_mat_q4_0_f32( uint2 tpig[[thread_position_in_grid]], uint2 tpitg[[thread_position_in_threadgroup]], uint2 tptg[[threads_per_threadgroup]]) { + const int nb = ne00/QK4_0; + const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - const int qk = QK4_0; - const int nb = ne00/qk; - device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb; device const float * y = (device const float *) src1 + r1*ne10; @@ -272,7 +277,7 @@ kernel void kernel_mul_mat_q4_0_f32( for (int i = tpitg.x; i < nb; i += tptg.x) { device const uchar4 * x0p = (device const uchar4 *) (x + i)->qs; - device const float4 * y0p = (device const float4 *) (y + i*qk); + device const float4 * y0p = (device const float4 *) (y + i*QK4_0); const float d = (float)((x + i)->d); @@ -282,42 +287,12 @@ kernel void kernel_mul_mat_q4_0_f32( float acc = 0.0f; - { - const int x0 = x0v[0] & 0x0F; - const int x1 = x0v[0] >> 4; - - const float y0 = y0v[0]; - const float y1 = y1v[0]; - - acc += (x0 - 8)*y0 + (x1 - 8)*y1; - } - - { - const int x0 = x0v[1] & 0x0F; - const int x1 = x0v[1] >> 4; - - const float y0 = y0v[1]; - const float y1 = y1v[1]; - - acc += (x0 - 8)*y0 + (x1 - 8)*y1; - } - - { - const int x0 = x0v[2] & 0x0F; - const int x1 = x0v[2] >> 4; - - const float y0 = y0v[2]; - const float y1 = y1v[2]; - - acc += (x0 - 8)*y0 + (x1 - 8)*y1; - } - - { - const int x0 = x0v[3] & 0x0F; - const int x1 = x0v[3] >> 4; + for (int j = 0; j < 4; ++j) { + const int x0 = x0v[j] & 0x0F; + const int x1 = x0v[j] >> 4; - const float y0 = y0v[3]; - const float y1 = y1v[3]; + const float y0 = y0v[j]; + const float y1 = y1v[j]; acc += (x0 - 8)*y0 + (x1 - 8)*y1; } From b088e14a7e03104d9e4c027c34b4b7b8b37a124c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 2 Jun 2023 19:26:58 +0300 Subject: [PATCH 35/49] mtl : more threads for rms_norm + better timing --- examples/mtl/mtl.cpp | 13 ++++++++++++- 
examples/mtl/mtl.m | 41 +++++++++++++++++++++-------------------- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/examples/mtl/mtl.cpp b/examples/mtl/mtl.cpp index 7f52453d830c1..b7b84cecf4402 100644 --- a/examples/mtl/mtl.cpp +++ b/examples/mtl/mtl.cpp @@ -46,10 +46,21 @@ int main(int argc, char ** argv) { const std::vector tmp(n_batch, 1); // BOS + // warmup + llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past); + + const int n_iter = 16; + + const int64_t t0 = ggml_time_us(); + // the actual inference happens here - for (int i = 0; i < 10; ++i) { + for (int i = 0; i < n_iter; ++i) { llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past); } + + const int64_t t1 = ggml_time_us(); + + printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter); } llama_mtl_free(ctx_mtl); diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index c74c28cd953b5..2eb874884a07c 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -492,9 +492,9 @@ int llama_mtl_eval( const uint64_t nb11 = gf->nodes[i]->src1->nb[1]; const uint64_t nb12 = gf->nodes[i]->src1->nb[2]; - const int64_t ne0 = gf->nodes[i]->ne[0]; - const int64_t ne1 = gf->nodes[i]->ne[1]; - const int64_t ne2 = gf->nodes[i]->ne[2]; + const int64_t ne0 = gf->nodes[i]->ne[0]; + const int64_t ne1 = gf->nodes[i]->ne[1]; + const int64_t ne2 = gf->nodes[i]->ne[2]; const uint64_t nb0 = gf->nodes[i]->nb[0]; const uint64_t nb1 = gf->nodes[i]->nb[1]; @@ -515,6 +515,7 @@ int llama_mtl_eval( if (ggml_is_contiguous(gf->nodes[i]->src0) && ggml_is_contiguous(gf->nodes[i]->src1) && (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) { + if (encoder != nil) { [encoder endEncoding]; encoder = nil; @@ -649,7 +650,7 @@ int llama_mtl_eval( const uint64_t nb01 = gf->nodes[i]->src0->nb[1]; const float eps = 1e-6f; - const int nth = 32; + const int nth = 256; [encoder setComputePipelineState:ctx->pipeline_rms_norm]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -779,22 +780,22 @@ int llama_mtl_eval( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 
length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; From 627605732c9f28607467deb79d9349c06d17af15 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 2 Jun 2023 19:58:08 +0300 Subject: [PATCH 36/49] mtl : remove printfs from inner loop --- examples/mtl/mtl.m | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 2eb874884a07c..f452979c40d07 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -263,15 +263,15 @@ void llama_mtl_free(struct ggml_mtl_context * ctx) { id result; if (is_data) { - fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); result = ctx->buffer_data; } else { - fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + //fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); result = ctx->buffer_eval; } if (result == nil) { - fprintf(stderr, "%s: error: buffer is nil\n", __func__); + //fprintf(stderr, "%s: error: buffer is nil\n", __func__); GGML_ASSERT(false); } @@ -310,7 +310,7 @@ int llama_mtl_eval( } for (int i = 0; i < gf->n_nodes; ++i) { - fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + //fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); switch (gf->nodes[i]->op) { case GGML_OP_RESHAPE: @@ -504,10 +504,10 @@ int llama_mtl_eval( const enum ggml_type src1t = gf->nodes[i]->src1->type; const enum ggml_type dstt = gf->nodes[i]->type; - fprintf(stderr, "mul_mat: src0 - %s[%lld, %lld, %lld], %d\n", ggml_type_name(src0t), ne00, ne01, ne02, ggml_is_contiguous(gf->nodes[i]->src0)); - fprintf(stderr, "mul_mat: src1 - %s[%lld, %lld, %lld], %d\n", ggml_type_name(src1t), ne10, ne11, ne12, ggml_is_contiguous(gf->nodes[i]->src1)); - fprintf(stderr, "mul_mat: dst - %s[%lld, %lld, %lld]\n", ggml_type_name(dstt), ne0, ne1, ne2); - fprintf(stderr, "mul_mat: %s * %s -> %s\n", ggml_type_name(src0t), ggml_type_name(src1t), ggml_type_name(dstt)); + //fprintf(stderr, "mul_mat: src0 - %s[%lld, %lld, %lld], %d\n", ggml_type_name(src0t), ne00, ne01, ne02, ggml_is_contiguous(gf->nodes[i]->src0)); + //fprintf(stderr, "mul_mat: src1 - %s[%lld, %lld, %lld], %d\n", ggml_type_name(src1t), ne10, ne11, ne12, ggml_is_contiguous(gf->nodes[i]->src1)); + //fprintf(stderr, "mul_mat: dst - %s[%lld, %lld, %lld]\n", ggml_type_name(dstt), ne0, ne1, ne2); + //fprintf(stderr, "mul_mat: %s * %s -> %s\n", ggml_type_name(src0t), ggml_type_name(src1t), ggml_type_name(dstt)); GGML_ASSERT(ne00 == ne10); GGML_ASSERT(ne02 == ne12); @@ -599,7 +599,6 @@ int llama_mtl_eval( [encoder setBytes:&ne1 length:sizeof(ne1) 
atIndex:14]; if (src0t == GGML_TYPE_Q4_0) { - //printf("nb = %d\n", ne00/32); [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else { @@ -697,9 +696,9 @@ int llama_mtl_eval( const int n_dims = ((int32_t *) gf->nodes[i]->src1->data)[1]; const int mode = ((int32_t *) gf->nodes[i]->src1->data)[2]; - fprintf(stderr, "rope: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); - fprintf(stderr, "rope: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); - fprintf(stderr, "rope: n_past = %d, n_dims = %d, mode = %d\n", n_past, n_dims, mode); + //fprintf(stderr, "rope: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); + //fprintf(stderr, "rope: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); + //fprintf(stderr, "rope: n_past = %d, n_dims = %d, mode = %d\n", n_past, n_dims, mode); [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -760,11 +759,11 @@ int llama_mtl_eval( const int nth = 32; - fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); - fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", nb00, nb01, nb02, nb03); - fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); - fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", nb0, nb1, nb2, nb3); - fprintf(stderr, "cpy: %s -> %s\n", ggml_type_name(src0t), ggml_type_name(dstt)); + //fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); + //fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", nb00, nb01, nb02, nb03); + //fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); + //fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", nb0, nb1, nb2, nb3); + //fprintf(stderr, "cpy: %s -> %s\n", ggml_type_name(src0t), ggml_type_name(dstt)); switch (src0t) { case GGML_TYPE_F32: From 03c2d72867fac57d155d8a2a714d0b156f243286 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 2 Jun 2023 20:36:26 +0300 Subject: [PATCH 37/49] mtl : simplify implementation --- examples/mtl/mtl.m | 195 +++++++++++++-------------------------------- 1 file changed, 56 insertions(+), 139 deletions(-) diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index f452979c40d07..89ed45c012520 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -257,7 +257,7 @@ void llama_mtl_free(struct ggml_mtl_context * ctx) { const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval); - const size_t t_size = ggml_nbytes(t); + //const size_t t_size = ggml_nbytes(t); const size_t t_offs = is_data ? offs_data : offs_eval; id result; @@ -271,7 +271,7 @@ void llama_mtl_free(struct ggml_mtl_context * ctx) { } if (result == nil) { - //fprintf(stderr, "%s: error: buffer is nil\n", __func__); + fprintf(stderr, "%s: error: buffer is nil\n", __func__); GGML_ASSERT(false); } @@ -296,9 +296,9 @@ int llama_mtl_eval( id command_buffer = [ctx->queue commandBuffer]; id encoder = nil; - size_t offs_src0; - size_t offs_src1; - size_t offs_dst; + size_t offs_src0 = 0; + size_t offs_src1 = 0; + size_t offs_dst = 0; // copy the input data to the GPU { @@ -312,6 +312,48 @@ int llama_mtl_eval( for (int i = 0; i < gf->n_nodes; ++i) { //fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + struct ggml_tensor * src0 = gf->nodes[i]->src0; + struct ggml_tensor * src1 = gf->nodes[i]->src1; + struct ggml_tensor * dst = gf->nodes[i]; + + const int64_t ne00 = src0 ? 
src0->ne[0] : 0; + const int64_t ne01 = src0 ? src0->ne[1] : 0; + const int64_t ne02 = src0 ? src0->ne[2] : 0; + const int64_t ne03 = src0 ? src0->ne[3] : 0; + + const uint64_t nb00 = src0 ? src0->nb[0] : 0; + const uint64_t nb01 = src0 ? src0->nb[1] : 0; + const uint64_t nb02 = src0 ? src0->nb[2] : 0; + const uint64_t nb03 = src0 ? src0->nb[3] : 0; + + const int64_t ne10 = src1 ? src1->ne[0] : 0; + const int64_t ne11 = src1 ? src1->ne[1] : 0; + const int64_t ne12 = src1 ? src1->ne[2] : 0; + //const int64_t ne13 = src1 ? src1->ne[3] : 0; + + const uint64_t nb10 = src1 ? src1->nb[0] : 0; + const uint64_t nb11 = src1 ? src1->nb[1] : 0; + const uint64_t nb12 = src1 ? src1->nb[2] : 0; + //const uint64_t nb13 = src1 ? src1->nb[3] : 0; + + const int64_t ne0 = dst ? dst->ne[0] : 0; + const int64_t ne1 = dst ? dst->ne[1] : 0; + const int64_t ne2 = dst ? dst->ne[2] : 0; + const int64_t ne3 = dst ? dst->ne[3] : 0; + + const uint64_t nb0 = dst ? dst->nb[0] : 0; + const uint64_t nb1 = dst ? dst->nb[1] : 0; + const uint64_t nb2 = dst ? dst->nb[2] : 0; + const uint64_t nb3 = dst ? dst->nb[3] : 0; + + const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; + const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; + const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; + + id id_src0 = src0 ? llama_mtl_get_buffer(ctx, src0, &offs_src0) : nil; + id id_src1 = src1 ? llama_mtl_get_buffer(ctx, src1, &offs_src1) : nil; + id id_dst = dst ? llama_mtl_get_buffer(ctx, dst, &offs_dst) : nil; + switch (gf->nodes[i]->op) { case GGML_OP_RESHAPE: case GGML_OP_VIEW: @@ -326,10 +368,6 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); - id id_src1 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); - id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); - [encoder setComputePipelineState:ctx->pipeline_add]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; @@ -345,14 +383,6 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); - id id_src1 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); - id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); - - const int64_t ne00 = gf->nodes[i]->src0->ne[0]; - - const int64_t ne10 = gf->nodes[i]->src1->ne[0]; - if (ggml_nelements(gf->nodes[i]->src1) == ne10) { // src1 is a row [encoder setComputePipelineState:ctx->pipeline_mul_row]; @@ -374,9 +404,6 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); - id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); - const float scale = *(const float *) gf->nodes[i]->src1->data; [encoder setComputePipelineState:ctx->pipeline_scale]; @@ -394,9 +421,6 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); - id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); - [encoder setComputePipelineState:ctx->pipeline_silu]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -411,12 +435,9 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - id id_src = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); - id id_dst = 
llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); - [encoder setComputePipelineState:ctx->pipeline_relu]; - [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; const int64_t n = ggml_nelements(gf->nodes[i]); @@ -428,19 +449,11 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - id id_src = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); - id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); - - const int64_t ne00 = gf->nodes[i]->src0->ne[0]; - const int64_t ne01 = gf->nodes[i]->src0->ne[1]; - const int64_t ne02 = gf->nodes[i]->src0->ne[2]; - const int64_t ne03 = gf->nodes[i]->src0->ne[3]; - const int nth = 32; [encoder setComputePipelineState:ctx->pipeline_soft_max]; - [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; @@ -454,16 +467,9 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - id id_src = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); - id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); - - const int64_t ne00 = gf->nodes[i]->src0->ne[0]; - const int64_t ne01 = gf->nodes[i]->src0->ne[1]; - const int64_t ne02 = gf->nodes[i]->src0->ne[2]; - [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; - [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; [encoder setBytes:&n_past length:sizeof(int) atIndex:4]; @@ -472,38 +478,6 @@ int llama_mtl_eval( } break; case GGML_OP_MUL_MAT: { - id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); - id id_src1 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); - id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); - - const int64_t ne00 = gf->nodes[i]->src0->ne[0]; - const int64_t ne01 = gf->nodes[i]->src0->ne[1]; - const int64_t ne02 = gf->nodes[i]->src0->ne[2]; - - const uint64_t nb00 = gf->nodes[i]->src0->nb[0]; - const uint64_t nb01 = gf->nodes[i]->src0->nb[1]; - const uint64_t nb02 = gf->nodes[i]->src0->nb[2]; - - const int64_t ne10 = gf->nodes[i]->src1->ne[0]; - const int64_t ne11 = gf->nodes[i]->src1->ne[1]; - const int64_t ne12 = gf->nodes[i]->src1->ne[2]; - - const uint64_t nb10 = gf->nodes[i]->src1->nb[0]; - const uint64_t nb11 = gf->nodes[i]->src1->nb[1]; - const uint64_t nb12 = gf->nodes[i]->src1->nb[2]; - - const int64_t ne0 = gf->nodes[i]->ne[0]; - const int64_t ne1 = gf->nodes[i]->ne[1]; - const int64_t ne2 = gf->nodes[i]->ne[2]; - - const uint64_t nb0 = gf->nodes[i]->nb[0]; - const uint64_t nb1 = gf->nodes[i]->nb[1]; - const uint64_t nb2 = gf->nodes[i]->nb[2]; - - const enum ggml_type src0t = gf->nodes[i]->src0->type; - const enum ggml_type src1t = gf->nodes[i]->src1->type; - const enum ggml_type dstt = gf->nodes[i]->type; - //fprintf(stderr, "mul_mat: src0 - %s[%lld, %lld, %lld], %d\n", 
ggml_type_name(src0t), ne00, ne01, ne02, ggml_is_contiguous(gf->nodes[i]->src0)); //fprintf(stderr, "mul_mat: src1 - %s[%lld, %lld, %lld], %d\n", ggml_type_name(src1t), ne10, ne11, ne12, ggml_is_contiguous(gf->nodes[i]->src1)); //fprintf(stderr, "mul_mat: dst - %s[%lld, %lld, %lld]\n", ggml_type_name(dstt), ne0, ne1, ne2); @@ -613,10 +587,6 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); - id id_src1 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); - id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); - switch (gf->nodes[i]->src0->type) { case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; default: { @@ -642,12 +612,7 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); - id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); - - const int64_t ne00 = gf->nodes[i]->src0->ne[0]; - const uint64_t nb01 = gf->nodes[i]->src0->nb[1]; - const float eps = 1e-6f; + const float eps = 1e-6f; const int nth = 256; @@ -669,30 +634,6 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); - id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); - - const int64_t ne00 = gf->nodes[i]->src0->ne[0]; - const int64_t ne01 = gf->nodes[i]->src0->ne[1]; - const int64_t ne02 = gf->nodes[i]->src0->ne[2]; - const int64_t ne03 = gf->nodes[i]->src0->ne[3]; - - const uint64_t nb00 = gf->nodes[i]->src0->nb[0]; - const uint64_t nb01 = gf->nodes[i]->src0->nb[1]; - const uint64_t nb02 = gf->nodes[i]->src0->nb[2]; - const uint64_t nb03 = gf->nodes[i]->src0->nb[3]; - - const int64_t ne0 = gf->nodes[i]->ne[0]; - const int64_t ne1 = gf->nodes[i]->ne[1]; - const int64_t ne2 = gf->nodes[i]->ne[2]; - const int64_t ne3 = gf->nodes[i]->ne[3]; - - const uint64_t nb0 = gf->nodes[i]->nb[0]; - const uint64_t nb1 = gf->nodes[i]->nb[1]; - const uint64_t nb2 = gf->nodes[i]->nb[2]; - const uint64_t nb3 = gf->nodes[i]->nb[3]; - - //const int n_past = ((int32_t *) gf->nodes[i]->src1->data)[0]; // TODO: TMP !!!!! 
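            // note: only indices 1 and 2 of the RoPE node's int32 src1 data are read below
            // (n_dims and mode); n_past comes in as the llama_mtl_eval() argument instead of
            // being re-read from src1->data[0] (the commented-out TODO line removed above)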
const int n_dims = ((int32_t *) gf->nodes[i]->src1->data)[1]; const int mode = ((int32_t *) gf->nodes[i]->src1->data)[2]; @@ -731,32 +672,6 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - id id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); - id id_dst = llama_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); - - const int64_t ne00 = gf->nodes[i]->src0->ne[0]; - const int64_t ne01 = gf->nodes[i]->src0->ne[1]; - const int64_t ne02 = gf->nodes[i]->src0->ne[2]; - const int64_t ne03 = gf->nodes[i]->src0->ne[3]; - - const uint64_t nb00 = gf->nodes[i]->src0->nb[0]; - const uint64_t nb01 = gf->nodes[i]->src0->nb[1]; - const uint64_t nb02 = gf->nodes[i]->src0->nb[2]; - const uint64_t nb03 = gf->nodes[i]->src0->nb[3]; - - const int64_t ne0 = gf->nodes[i]->ne[0]; - const int64_t ne1 = gf->nodes[i]->ne[1]; - const int64_t ne2 = gf->nodes[i]->ne[2]; - const int64_t ne3 = gf->nodes[i]->ne[3]; - - const uint64_t nb0 = gf->nodes[i]->nb[0]; - const uint64_t nb1 = gf->nodes[i]->nb[1]; - const uint64_t nb2 = gf->nodes[i]->nb[2]; - const uint64_t nb3 = gf->nodes[i]->nb[3]; - - const enum ggml_type src0t = gf->nodes[i]->src0->type; - const enum ggml_type dstt = gf->nodes[i]->type; - const int nth = 32; //fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); @@ -835,6 +750,7 @@ int llama_mtl_eval( // TODO const float * logits = ctx->out.contents; +#if 1 printf("logits: "); for (int i = 0; i < 100; i++) { printf("%8.4f ", logits[i]); @@ -851,6 +767,7 @@ int llama_mtl_eval( } } printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax); +#endif //{ // struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check"); From 640a8896329f73af36e7726c8ac0f6f2fca6a721 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 2 Jun 2023 21:00:30 +0300 Subject: [PATCH 38/49] mtl : add save/load vocab to ggml file --- examples/mtl/mtl.cpp | 25 +++++++++++++++++++++++++ examples/mtl/mtl.m | 14 -------------- llama.cpp | 44 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 68 insertions(+), 15 deletions(-) diff --git a/examples/mtl/mtl.cpp b/examples/mtl/mtl.cpp index b7b84cecf4402..ff1c1f6858a0c 100644 --- a/examples/mtl/mtl.cpp +++ b/examples/mtl/mtl.cpp @@ -24,6 +24,31 @@ int main(int argc, char ** argv) { struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); gf.n_threads = 1; + { + struct ggml_tensor * t_vocab = ggml_graph_get_tensor(&gf, "vocab"); + if (t_vocab == NULL) { + fprintf(stderr, "%s: vocab tensor not found\n", __func__); + return -1; + } + + const char * ptr = (const char *) t_vocab->data; + + int32_t n_vocab = 0; + memcpy(&n_vocab, ptr, sizeof(n_vocab)); ptr += sizeof(n_vocab); + + printf("%s: n_vocab = %d\n", __func__, n_vocab); + + for (int i = 0; i < 512; ++i) { + char text[32]; + float score; + + memcpy(text, ptr, sizeof(text)); ptr += sizeof(text); + memcpy(&score, ptr, sizeof(score)); ptr += sizeof(score); + + printf("%s: token[%4d] = %16.*s, score = %6.2f\n", __func__, i, (int) sizeof(text), text, score); + } + } + // allocate work context static size_t buf_size = gf.work_size; // TODO static void * buf = malloc(buf_size); diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 89ed45c012520..e4839626e1ecf 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -108,20 +108,6 @@ exit(1); } } -#elif 0 - // this does not work !?!?! 
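// a rough sketch of the "vocab" leaf that this patch adds to the exported graph and that
// mtl.cpp parses above -- the record name below is illustrative, but the byte layout follows
// the read/write code in this patch:
//
//     int32_t n_vocab;                                  // entry count
//     struct { char text[32]; float score; } entry[];   // n_vocab records, text null-terminated
//                                                       // within a fixed 32-byte field
//
// stored back-to-back in a 1-D GGML_TYPE_I8 tensor named "vocab",
// i.e. sizeof(int32_t) + n_vocab*(32 + sizeof(float)) bytes in total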
- - // load library from "mtl.metallib" - { - NSError * error = nil; - - NSString * path = [[NSBundle mainBundle] pathForResource:@"./mtl" ofType:@"metallib"]; - ctx->library = [ctx->device newLibraryWithFile:path error:&error]; - if (error) { - fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); - exit(1); - } - } #else // read the source from "../examples/mtl/mtl.metal" into a string and use newLibraryWithSource { diff --git a/llama.cpp b/llama.cpp index c998a77fb7dba..9a8bf9df7b732 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1505,7 +1505,49 @@ static bool llama_eval_internal( //} if (cgraph_fname) { - ggml_graph_export(&gf, cgraph_fname); + // TODO: tmp add the vocabulary as a leaf to the computation graph, until better approach is found + { + char tmp[32]; // max token length + + // store null-terminated string for simplicity + std::vector buf_vocab(sizeof(int32_t) + n_vocab*(32 + sizeof(float))); + + uint64_t offs = 0; + + { + const int32_t n = n_vocab; + memcpy(&buf_vocab[offs], &n, sizeof(n)); offs += sizeof(n); + } + + for (int i = 0; i < n_vocab; i++) { + const int32_t id = i; + + const float score = lctx.vocab.id_to_token[id].score; + const std::string text = lctx.vocab.id_to_token[id].tok; + + snprintf(tmp, sizeof(tmp), "%s", text.c_str()); + + memcpy(&buf_vocab[offs], tmp, 32); offs += 32; + memcpy(&buf_vocab[offs], &score, sizeof(score)); offs += sizeof(score); + } + + struct ggml_init_params params; + params.mem_size = ggml_tensor_overhead(); + params.mem_buffer = NULL; + params.no_alloc = true; + + ggml_context * ctx_vocab = ggml_init(params); + + struct ggml_tensor * t_vocab = ggml_new_tensor_1d(ctx_vocab, GGML_TYPE_I8, buf_vocab.size()); + t_vocab->data = buf_vocab.data(); + ggml_set_name(t_vocab, "vocab"); + + gf.leafs[gf.n_leafs++] = t_vocab; + + ggml_graph_export(&gf, cgraph_fname); + + ggml_free(ctx_vocab); + } float * logits = (float *) ggml_get_data(inpL); From 2f4e9d19cce4ca9dc0a37d9734df17fe8d03dd49 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 2 Jun 2023 21:52:11 +0300 Subject: [PATCH 39/49] mtl : plug Metal inference into llama.cpp (very quick-n-dirty) --- CMakeLists.txt | 15 ++++- examples/common.cpp | 4 ++ examples/common.h | 1 + examples/mtl/mtl.h | 2 + examples/mtl/mtl.m | 151 ++++++++++++++++++++++++++++++-------------- llama.cpp | 73 +++++++++++++++++---- llama.h | 1 + 7 files changed, 186 insertions(+), 61 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 21f4ec9ddd267..bc23c2c5b5d80 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -384,11 +384,22 @@ endif() add_library(llama llama.cpp llama.h - llama-util.h) + llama-util.h + examples/mtl/mtl.h # TODO: METAL TMP + examples/mtl/mtl.m # TODO: METAL TMP + ) target_include_directories(llama PUBLIC .) 
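# note: the ${FOUNDATION_LIBRARY}/${METAL_FRAMEWORK}/${METALKIT_FRAMEWORK}/${METALPERFORMANCE_FRAMEWORK}
# variables referenced in the target_link_libraries() change below are assumed to be populated
# elsewhere in this CMakeLists, e.g. via find_library (a sketch, not shown in this hunk):
#
#   find_library(FOUNDATION_LIBRARY         Foundation)
#   find_library(METAL_FRAMEWORK            Metal)
#   find_library(METALKIT_FRAMEWORK         MetalKit)
#   find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders)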
target_compile_features(llama PUBLIC cxx_std_11) # don't bump -target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS}) +target_link_libraries(llama PRIVATE + ggml + ${LLAMA_EXTRA_LIBS} + ${FOUNDATION_LIBRARY} # TODO: METAL TMP + ${METAL_FRAMEWORK} # TODO: METAL TMP + ${METALKIT_FRAMEWORK} # TODO: METAL TMP + ${METALPERFORMANCE_FRAMEWORK} # TODO: METAL TMP + ) +target_compile_definitions(llama PRIVATE LLAMA_MTL_NDEBUG) # TODO: METAL TMP if (BUILD_SHARED_LIBS) set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/examples/common.cpp b/examples/common.cpp index b5810f28f4901..53e9200fa90c4 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -301,6 +301,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.mem_test = true; } else if (arg == "--export") { params.export_cgraph = true; + } else if (arg == "--import") { + params.import_cgraph = true; } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { @@ -441,6 +443,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #endif fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); + fprintf(stderr, " --import import a computation graph from 'llama.ggml'\n"); fprintf(stderr, " --verbose-prompt print prompt before generation\n"); fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); @@ -490,6 +493,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) { lparams.use_mlock = params.use_mlock; lparams.logits_all = params.perplexity; lparams.embedding = params.embedding; + lparams.cgraph = params.import_cgraph; llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams); diff --git a/examples/common.h b/examples/common.h index 66bdeb5e9287d..c7d4d6e0e7bdf 100644 --- a/examples/common.h +++ b/examples/common.h @@ -72,6 +72,7 @@ struct gpt_params { bool use_mlock = false; // use mlock to keep model in memory bool mem_test = false; // compute maximum memory usage bool export_cgraph = false; // export the computation graph + bool import_cgraph = false; // import a computation graph bool verbose_prompt = false; // print prompt tokens before generation }; diff --git a/examples/mtl/mtl.h b/examples/mtl/mtl.h index a6a336eaac5d6..f381756d408c7 100644 --- a/examples/mtl/mtl.h +++ b/examples/mtl/mtl.h @@ -25,6 +25,8 @@ int llama_mtl_eval( int n_tokens, int n_past); +float * llama_mtl_get_logits(struct ggml_mtl_context * ctx); + #ifdef __cplusplus } #endif diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index e4839626e1ecf..4ac5dac206a8b 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -6,11 +6,19 @@ #import #import +#ifdef LLAMA_MTL_NDEBUG +#define mtl_printf(...) +#else +#define mtl_printf(...) 
fprintf(stderr, __VA_ARGS__) +#endif + struct ggml_mtl_context { struct ggml_context * ctx_data; struct ggml_context * ctx_eval; struct ggml_context * ctx_work; + float * logits; + id device; id queue; id library; @@ -274,7 +282,44 @@ int llama_mtl_eval( const int * tokens, int n_tokens, int n_past) { - fprintf(stderr, "%s: evaluating, n_tokens = %d, n_past = %d\n", __func__, n_tokens, n_past); + mtl_printf("%s: evaluating, n_tokens = %d, n_past = %d\n", __func__, n_tokens, n_past); + + // adjust dynamic shapes + // TODO: wrong ... + //{ + // struct ggml_tensor * t = ggml_graph_get_tensor(gf, "embd"); + // t->ne[0] = n_tokens; + //} + //{ + // struct ggml_tensor * t = ggml_graph_get_tensor(gf, "Qpre"); + // t->src0->ne[2] = n_tokens; + //} + //{ + // struct ggml_tensor * t = ggml_graph_get_tensor(gf, "Kpre"); + // t->src0->ne[2] = n_tokens; + //} + //{ + // struct ggml_tensor * t = ggml_graph_get_tensor(gf, "Vcur"); + // t->ne[0] = n_tokens; + //} + //{ + // struct ggml_tensor * k = ggml_graph_get_tensor(gf, "k"); + // struct ggml_tensor * v = ggml_graph_get_tensor(gf, "v"); + // k->ne[0] = n_tokens*v->ne[1]; + // v->ne[0] = n_tokens; + //} + //{ + // struct ggml_tensor * t = ggml_graph_get_tensor(gf, "Q"); + // t->ne[1] = n_tokens; + //} + //{ + // struct ggml_tensor * t = ggml_graph_get_tensor(gf, "K"); + // t->ne[1] = n_past + n_tokens; + //} + //{ + // struct ggml_tensor * t = ggml_graph_get_tensor(gf, "KQV_merged_contiguous"); + // t->src1->ne[1] = n_tokens; + //} struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd"); memcpy(input->data, tokens, n_tokens * sizeof(int)); @@ -296,7 +341,7 @@ int llama_mtl_eval( } for (int i = 0; i < gf->n_nodes; ++i) { - //fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + //mtl_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); struct ggml_tensor * src0 = gf->nodes[i]->src0; struct ggml_tensor * src1 = gf->nodes[i]->src1; @@ -340,7 +385,21 @@ int llama_mtl_eval( id id_src1 = src1 ? llama_mtl_get_buffer(ctx, src1, &offs_src1) : nil; id id_dst = dst ? 
llama_mtl_get_buffer(ctx, dst, &offs_dst) : nil; - switch (gf->nodes[i]->op) { + //mtl_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op)); + //if (src0) { + // mtl_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + // ggml_is_contiguous(src0), src0->name); + //} + //if (src1) { + // mtl_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + // ggml_is_contiguous(src1), src1->name); + //} + //if (dst) { + // mtl_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + // dst->name); + //} + + switch (dst->op) { case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_TRANSPOSE: @@ -359,7 +418,7 @@ int llama_mtl_eval( [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - const int64_t n = ggml_nelements(gf->nodes[i]); + const int64_t n = ggml_nelements(dst); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; @@ -369,7 +428,7 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - if (ggml_nelements(gf->nodes[i]->src1) == ne10) { + if (ggml_nelements(src1) == ne10) { // src1 is a row [encoder setComputePipelineState:ctx->pipeline_mul_row]; } else { @@ -380,7 +439,7 @@ int llama_mtl_eval( [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - const int64_t n = ggml_nelements(gf->nodes[i]); + const int64_t n = ggml_nelements(dst); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; @@ -390,14 +449,14 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - const float scale = *(const float *) gf->nodes[i]->src1->data; + const float scale = *(const float *) src1->data; [encoder setComputePipelineState:ctx->pipeline_scale]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; - const int64_t n = ggml_nelements(gf->nodes[i]); + const int64_t n = ggml_nelements(dst); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; @@ -411,7 +470,7 @@ int llama_mtl_eval( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - const int64_t n = ggml_nelements(gf->nodes[i]); + const int64_t n = ggml_nelements(dst); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; @@ -425,7 +484,7 @@ int llama_mtl_eval( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - const int64_t n = ggml_nelements(gf->nodes[i]); + const int64_t n = ggml_nelements(dst); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; @@ -464,16 +523,11 @@ int llama_mtl_eval( } break; case GGML_OP_MUL_MAT: { - //fprintf(stderr, "mul_mat: src0 - %s[%lld, %lld, %lld], %d\n", ggml_type_name(src0t), ne00, ne01, ne02, ggml_is_contiguous(gf->nodes[i]->src0)); - //fprintf(stderr, "mul_mat: src1 - %s[%lld, %lld, %lld], %d\n", ggml_type_name(src1t), ne10, ne11, ne12, ggml_is_contiguous(gf->nodes[i]->src1)); - //fprintf(stderr, "mul_mat: dst - %s[%lld, %lld, %lld]\n", ggml_type_name(dstt), ne0, ne1, ne2); - //fprintf(stderr, "mul_mat: %s * %s -> %s\n", 
ggml_type_name(src0t), ggml_type_name(src1t), ggml_type_name(dstt)); - GGML_ASSERT(ne00 == ne10); GGML_ASSERT(ne02 == ne12); - if (ggml_is_contiguous(gf->nodes[i]->src0) && - ggml_is_contiguous(gf->nodes[i]->src1) && + if (ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) { if (encoder != nil) { @@ -486,13 +540,13 @@ int llama_mtl_eval( // for F32 x F32 we use MPS MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor - matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:gf->nodes[i]->src0->nb[1] dataType:src0dt]; + matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt]; MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor - matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:gf->nodes[i]->src1->nb[1] dataType:src1dt]; + matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt]; MPSMatrixDescriptor * desc = [MPSMatrixDescriptor - matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32]; + matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32]; MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc] initWithDevice:ctx->device transposeLeft:false transposeRight:true @@ -573,22 +627,22 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - switch (gf->nodes[i]->src0->type) { + switch (src0->type) { case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; default: { // not implemented - fprintf(stderr, "%s: node %3d, op = %8s, type = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op), ggml_type_name(gf->nodes[i]->src0->type)); + fprintf(stderr, "%s: node %3d, op = %8s, type = %8s not implemented\n", __func__, i, ggml_op_name(dst->op), ggml_type_name(src0->type)); } } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&(gf->nodes[i]->src0->ne[0]) length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&(gf->nodes[i]->src0->nb[1]) length:sizeof(uint64_t) atIndex:4]; - [encoder setBytes:&(gf->nodes[i]->nb[1]) length:sizeof(uint64_t) atIndex:5]; + [encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4]; + [encoder setBytes:&(dst->nb[1]) length:sizeof(uint64_t) atIndex:5]; - const int64_t n = ggml_nelements(gf->nodes[i]->src1); + const int64_t n = ggml_nelements(src1); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; @@ -610,7 +664,7 @@ int llama_mtl_eval( [encoder setBytes:&eps length:sizeof( float) atIndex:4]; [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; - const int64_t nrows = ggml_nrows(gf->nodes[i]->src0); + const int64_t nrows = ggml_nrows(src0); [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; @@ -620,12 +674,12 @@ int llama_mtl_eval( encoder = [command_buffer computeCommandEncoder]; } - const int n_dims = ((int32_t *) gf->nodes[i]->src1->data)[1]; - const int mode = ((int32_t *) gf->nodes[i]->src1->data)[2]; + const int n_dims = ((int32_t *) src1->data)[1]; + const int mode = ((int32_t *) src1->data)[2]; - //fprintf(stderr, "rope: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); - //fprintf(stderr, "rope: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); - 
//fprintf(stderr, "rope: n_past = %d, n_dims = %d, mode = %d\n", n_past, n_dims, mode); + //mtl_printf("rope: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); + //mtl_printf("rope: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); + //mtl_printf("rope: n_past = %d, n_dims = %d, mode = %d\n", n_past, n_dims, mode); [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -660,11 +714,11 @@ int llama_mtl_eval( const int nth = 32; - //fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); - //fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", nb00, nb01, nb02, nb03); - //fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); - //fprintf(stderr, "cpy: %lld x %lld x %lld x %lld\n", nb0, nb1, nb2, nb3); - //fprintf(stderr, "cpy: %s -> %s\n", ggml_type_name(src0t), ggml_type_name(dstt)); + //mtl_printf("cpy: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); + //mtl_printf("cpy: %lld x %lld x %lld x %lld\n", nb00, nb01, nb02, nb03); + //mtl_printf("cpy: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); + //mtl_printf("cpy: %lld x %lld x %lld x %lld\n", nb0, nb1, nb2, nb3); + //mtl_printf("cpy: %s -> %s\n", ggml_type_name(src0t), ggml_type_name(dstt)); switch (src0t) { case GGML_TYPE_F32: @@ -700,7 +754,7 @@ int llama_mtl_eval( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; default: - fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); return -1; } @@ -708,7 +762,7 @@ int llama_mtl_eval( // extract results from the GPU { - fprintf(stderr, "%s: extract results from the GPU\n", __func__); + mtl_printf("%s: extract results from the GPU\n", __func__); if (encoder != nil) { [encoder endEncoding]; @@ -730,18 +784,19 @@ int llama_mtl_eval( { const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime]; - printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0); + mtl_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0); } - // TODO - const float * logits = ctx->out.contents; + ctx->logits = ctx->out.contents; + + const float * logits = ctx->logits; #if 1 - printf("logits: "); + mtl_printf("logits: "); for (int i = 0; i < 100; i++) { - printf("%8.4f ", logits[i]); + mtl_printf("%8.4f ", logits[i]); } - printf("\n"); + mtl_printf("\n"); double sum = 0.0; int imax = 0; double vmax = -INFINITY; @@ -752,7 +807,7 @@ int llama_mtl_eval( imax = i; } } - printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax); + mtl_printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax); #endif //{ @@ -801,3 +856,7 @@ int llama_mtl_eval( return 0; } + +float * llama_mtl_get_logits(struct ggml_mtl_context * ctx) { + return ctx->logits; +} diff --git a/llama.cpp b/llama.cpp index 9a8bf9df7b732..93ca233a9fe19 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9,6 +9,9 @@ #include "llama-util.h" #include "llama.h" +// METAL +#include "examples/mtl/mtl.h" + #include "ggml.h" #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" @@ -238,6 +241,10 @@ struct llama_context { llama_ctx_buffer buf_compute; llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + // METAL + ggml_mtl_context * mtl_ctx = NULL; + ggml_cgraph mtl_gf; + int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; @@ -836,6 +843,7 @@ struct 
llama_context_params llama_context_default_params() { /*.use_mmap =*/ true, /*.use_mlock =*/ false, /*.embedding =*/ false, + /*.cgraph =*/ false, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, }; @@ -1270,8 +1278,14 @@ static bool llama_eval_internal( //struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, x, n_embd/n_head, n_head, N), n_past, n_rot, 0); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + + struct ggml_tensor * Qpre = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N); + struct ggml_tensor * Kpre = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N); + ggml_set_name(Qpre, "Qpre"); + ggml_set_name(Kpre, "Kpre"); + + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, Qpre, n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, Kpre, n_past, n_rot, 0); ggml_set_name(Qcur, "Qcur"); ggml_set_name(Kcur, "Kcur"); @@ -1279,22 +1293,19 @@ static bool llama_eval_internal( { // compute the transposed [N, n_embd] V matrix struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N)); + ggml_set_name(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - //struct ggml_tensor * t = ggml_cpy(ctx0, Vcur, v); - //// TODO: TMP !!!! - //if (il == 0) { - // ggml_set_name(t, "mtl-check"); - //} + ggml_set_name(k, "k"); + ggml_set_name(v, "v"); // important: storing RoPE-ed version of K in the KV cache! ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); - //ggml_build_forward_expand(&gf, t); } struct ggml_tensor * Q = @@ -2391,9 +2402,25 @@ struct llama_context * llama_init_from_file( ggml_type memory_type = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; + // METAL + if (params.cgraph) { + params.vocab_only = true; + + // load the compute graph + struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_eval = NULL; + + struct ggml_cgraph gf = ggml_graph_import("llama.ggml", &ctx_data, &ctx_eval); + gf.n_threads = 1; + + // this allocates all Metal resources and memory buffers + ctx->mtl_ctx = llama_mtl_init(ctx_data, ctx_eval, NULL, &gf); + ctx->mtl_gf = gf; + } + if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type, - params.use_mmap, params.use_mlock, params.vocab_only, - params.progress_callback, params.progress_callback_user_data)) { + params.use_mmap, params.use_mlock, params.vocab_only, + params.progress_callback, params.progress_callback_user_data)) { fprintf(stderr, "%s: failed to load model\n", __func__); llama_free(ctx); return nullptr; @@ -2411,7 +2438,11 @@ struct llama_context * llama_init_from_file( const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v); fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } + } + // METAL + // TODO: changed the behavior here for vocab_only -- reconsider implications later + { const auto & hparams = ctx->model.hparams; // resized during inference @@ -3046,9 +3077,25 @@ int llama_eval( int n_tokens, int n_past, int n_threads) { - if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) { - fprintf(stderr, "%s: failed to eval\n", __func__); - return 1; + // METAL + if (ctx->mtl_ctx) { + llama_mtl_eval(ctx->mtl_ctx, &ctx->mtl_gf, tokens, n_tokens, n_past); + + const float * logits = llama_mtl_get_logits(ctx->mtl_ctx); + + // extract logits + { + const int n_vocab = ctx->model.hparams.n_vocab; + auto & logits_out = ctx->logits; + + logits_out.resize(n_vocab); + memcpy(logits_out.data(), logits, sizeof(float)*n_vocab); + } + } else { + if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) { + fprintf(stderr, "%s: failed to eval\n", __func__); + return 1; + } } // get a more accurate load time, upon first eval diff --git a/llama.h b/llama.h index 3ba0775bd8a38..faaca2637726d 100644 --- a/llama.h +++ b/llama.h @@ -75,6 +75,7 @@ extern "C" { bool use_mmap; // use mmap if possible bool use_mlock; // force system to keep model in RAM bool embedding; // embedding mode only + bool cgraph; // try to load computation graph from "llama.ggml" (METAL) // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; From 4df2ef316195ef96c495993115658b0beb0af411 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 3 Jun 2023 09:11:15 +0300 Subject: [PATCH 40/49] mtl : make it work with main example Lots of hacks but at least now it generates text --- examples/mtl/mtl.cpp | 24 +++----- examples/mtl/mtl.h | 13 ++-- examples/mtl/mtl.m | 139 ++++++++++++++++++++++-------------------- llama.cpp | 142 ++++++++----------------------------------- 4 files changed, 118 insertions(+), 200 deletions(-) diff --git a/examples/mtl/mtl.cpp b/examples/mtl/mtl.cpp index ff1c1f6858a0c..7411ea9325d34 100644 --- a/examples/mtl/mtl.cpp +++ b/examples/mtl/mtl.cpp @@ -24,6 +24,8 @@ int main(int argc, char ** argv) { struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); gf.n_threads = 1; + int32_t n_vocab = 0; + { struct ggml_tensor * t_vocab = ggml_graph_get_tensor(&gf, "vocab"); if (t_vocab == NULL) { @@ -33,7 +35,6 @@ int main(int argc, char ** argv) { const 
char * ptr = (const char *) t_vocab->data; - int32_t n_vocab = 0; memcpy(&n_vocab, ptr, sizeof(n_vocab)); ptr += sizeof(n_vocab); printf("%s: n_vocab = %d\n", __func__, n_vocab); @@ -49,20 +50,14 @@ int main(int argc, char ** argv) { } } - // allocate work context - static size_t buf_size = gf.work_size; // TODO - static void * buf = malloc(buf_size); - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx_work = ggml_init(params); - // this allocates all Metal resources and memory buffers - auto * ctx_mtl = llama_mtl_init(ctx_data, ctx_eval, ctx_work, &gf); + auto * ctx_mtl = llama_mtl_init( + ggml_get_mem_buffer(ctx_data), + ggml_get_mem_size (ctx_data), + ggml_get_mem_buffer(ctx_eval), + ggml_get_mem_size (ctx_eval), + NULL, 0, // cache + 32*n_vocab*sizeof(float)); // TODO: tmp to match the input used when creating the cgraph { @@ -90,7 +85,6 @@ int main(int argc, char ** argv) { llama_mtl_free(ctx_mtl); - ggml_free(ctx_work); ggml_free(ctx_data); ggml_free(ctx_eval); diff --git a/examples/mtl/mtl.h b/examples/mtl/mtl.h index f381756d408c7..ff92a6a7baf76 100644 --- a/examples/mtl/mtl.h +++ b/examples/mtl/mtl.h @@ -1,5 +1,7 @@ #pragma once +#include + struct ggml_context; struct ggml_cgraph; @@ -10,10 +12,13 @@ extern "C" { struct ggml_mtl_context; struct ggml_mtl_context * llama_mtl_init( - struct ggml_context * ctx_data, - struct ggml_context * ctx_eval, - struct ggml_context * ctx_work, - struct ggml_cgraph * gf); + void * data_buf, + size_t data_size, + void * eval_buf, + size_t eval_size, + void * cach_buf, + size_t cach_size, + size_t outp_size); void llama_mtl_free(struct ggml_mtl_context * ctx); diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m index 4ac5dac206a8b..b8fd1c144a8ac 100644 --- a/examples/mtl/mtl.m +++ b/examples/mtl/mtl.m @@ -11,11 +11,16 @@ #else #define mtl_printf(...) fprintf(stderr, __VA_ARGS__) #endif +//#define mtl_printf(...) struct ggml_mtl_context { - struct ggml_context * ctx_data; - struct ggml_context * ctx_eval; - struct ggml_context * ctx_work; + void * data_buf; + size_t data_size; + void * eval_buf; + size_t eval_size; + void * cach_buf; + size_t cach_size; + size_t outp_size; float * logits; @@ -25,6 +30,7 @@ id buffer_data; id buffer_eval; + id buffer_cach; id out; @@ -82,17 +88,23 @@ NSString * const msl_library_llama = @"see mtl.metal"; struct ggml_mtl_context * llama_mtl_init( - struct ggml_context * ctx_data, - struct ggml_context * ctx_eval, - struct ggml_context * ctx_work, - struct ggml_cgraph * gf) { + void * data_buf, + size_t data_size, + void * eval_buf, + size_t eval_size, + void * cach_buf, + size_t cach_size, + size_t outp_size) { fprintf(stderr, "%s: allocating\n", __func__); struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context)); - ctx->ctx_data = ctx_data; - ctx->ctx_eval = ctx_eval; - ctx->ctx_work = ctx_work; + ctx->data_buf = data_buf; + ctx->data_size = data_size; + ctx->eval_buf = eval_buf; + ctx->eval_size = eval_size; + ctx->cach_buf = cach_buf; + ctx->cach_size = cach_size; ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; @@ -208,9 +220,10 @@ // TODO: how to use MTLStorageModeManaged? 
// TODO: see if we can avoid this copy somehow { - const void * mem_buffer = ggml_get_mem_buffer(ctx_data); - const size_t mem_size = ggml_get_mem_size(ctx_data); + void * mem_buffer = data_buf; + const size_t mem_size = data_size; + //ctx->buffer_data = [ctx->device newBufferWithBytesNoCopy:mem_buffer length:mem_size options:MTLResourceStorageModeShared deallocator:nil]; ctx->buffer_data = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared]; fprintf(stderr, "%s: allocated data buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0); @@ -219,16 +232,26 @@ // pin ctx_eval memory to GPU // this buffer will be used for the intermediate results of the evaluation { - const size_t mem_size = ggml_get_mem_size(ctx_eval); + const void * mem_buffer = eval_buf; + const size_t mem_size = eval_size; - ctx->buffer_eval = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModePrivate]; + ctx->buffer_eval = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared]; fprintf(stderr, "%s: allocated eval buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0); } + if (cach_buf) { + const void * mem_buffer = cach_buf; + const size_t mem_size = cach_size; + + ctx->buffer_cach = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared]; + + fprintf(stderr, "%s: allocated cach buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0); + } + // allocate buffer for result extraction { - const size_t mem_size = ggml_nbytes(gf->nodes[gf->n_nodes - 1]); + const size_t mem_size = outp_size; ctx->out = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; @@ -246,30 +269,48 @@ void llama_mtl_free(struct ggml_mtl_context * ctx) { // get data / eval buffer + offset id llama_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) { - const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data); - const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval); - - const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval); + const int64_t offs_data = (int64_t) t->data - (int64_t) ctx->data_buf; + const int64_t offs_eval = (int64_t) t->data - (int64_t) ctx->eval_buf; + const int64_t offs_cach = (int64_t) t->data - (int64_t) ctx->cach_buf; //const size_t t_size = ggml_nbytes(t); - const size_t t_offs = is_data ? 
offs_data : offs_eval; id result; + size_t t_offs = 0; - if (is_data) { - //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + if ( offs_data > 0 && + (offs_eval < 0 || (offs_data < offs_eval)) && + (offs_cach < 0 || (offs_data < offs_cach)) + ) { result = ctx->buffer_data; - } else { - //fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + t_offs = offs_data; + //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + } + + if ( offs_eval > 0 && + (offs_data < 0 || (offs_eval < offs_data)) && + (offs_cach < 0 || (offs_eval < offs_cach)) + ) { result = ctx->buffer_eval; + t_offs = offs_eval; + //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); } - if (result == nil) { + if ( offs_cach > 0 && + (offs_data < 0 || (offs_cach < offs_data)) && + (offs_eval < 0 || (offs_cach < offs_eval)) + ) { + result = ctx->buffer_cach; + t_offs = offs_cach; + //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + } + + if (result == nil || (t_offs > ctx->data_size && t_offs > ctx->eval_size && t_offs > ctx->cach_size)) { fprintf(stderr, "%s: error: buffer is nil\n", __func__); GGML_ASSERT(false); } - if (offs != nil) { + if (offs != 0) { *offs = t_offs; } @@ -284,49 +325,9 @@ int llama_mtl_eval( int n_past) { mtl_printf("%s: evaluating, n_tokens = %d, n_past = %d\n", __func__, n_tokens, n_past); - // adjust dynamic shapes - // TODO: wrong ... - //{ - // struct ggml_tensor * t = ggml_graph_get_tensor(gf, "embd"); - // t->ne[0] = n_tokens; - //} - //{ - // struct ggml_tensor * t = ggml_graph_get_tensor(gf, "Qpre"); - // t->src0->ne[2] = n_tokens; - //} - //{ - // struct ggml_tensor * t = ggml_graph_get_tensor(gf, "Kpre"); - // t->src0->ne[2] = n_tokens; - //} - //{ - // struct ggml_tensor * t = ggml_graph_get_tensor(gf, "Vcur"); - // t->ne[0] = n_tokens; - //} - //{ - // struct ggml_tensor * k = ggml_graph_get_tensor(gf, "k"); - // struct ggml_tensor * v = ggml_graph_get_tensor(gf, "v"); - // k->ne[0] = n_tokens*v->ne[1]; - // v->ne[0] = n_tokens; - //} - //{ - // struct ggml_tensor * t = ggml_graph_get_tensor(gf, "Q"); - // t->ne[1] = n_tokens; - //} - //{ - // struct ggml_tensor * t = ggml_graph_get_tensor(gf, "K"); - // t->ne[1] = n_past + n_tokens; - //} - //{ - // struct ggml_tensor * t = ggml_graph_get_tensor(gf, "KQV_merged_contiguous"); - // t->src1->ne[1] = n_tokens; - //} - struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd"); memcpy(input->data, tokens, n_tokens * sizeof(int)); - id command_buffer = [ctx->queue commandBuffer]; - id encoder = nil; - size_t offs_src0 = 0; size_t offs_src1 = 0; size_t offs_dst = 0; @@ -340,6 +341,9 @@ int llama_mtl_eval( memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd)); } + id command_buffer = [ctx->queue commandBuffer]; + id encoder = nil; + for (int i = 0; i < gf->n_nodes; ++i) { //mtl_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); @@ -791,6 +795,9 @@ int llama_mtl_eval( const float * logits = ctx->logits; + struct ggml_tensor * t = gf->nodes[gf->n_nodes - 1]; + memcpy(t->data, logits, ggml_nbytes(t)); + #if 1 mtl_printf("logits: "); for (int i = 0; i < 100; i++) { diff --git a/llama.cpp b/llama.cpp index 93ca233a9fe19..24b9d633b1303 100644 --- a/llama.cpp +++ b/llama.cpp @@ -243,7 +243,6 @@ struct llama_context { // 
METAL ggml_mtl_context * mtl_ctx = NULL; - ggml_cgraph mtl_gf; int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; @@ -1262,7 +1261,7 @@ static bool llama_eval_internal( struct ggml_tensor * cur; - lctx.use_buf(ctx0, 0); + //lctx.use_buf(ctx0, 0); // norm { @@ -1378,7 +1377,7 @@ static bool llama_eval_internal( cur); } - lctx.use_buf(ctx0, 1); + //lctx.use_buf(ctx0, 1); struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); @@ -1416,7 +1415,7 @@ static bool llama_eval_internal( inpL = cur; } - lctx.use_buf(ctx0, 0); + //lctx.use_buf(ctx0, 0); // used at the end to optionally extract the embeddings struct ggml_tensor * embeddings = NULL; @@ -1435,85 +1434,20 @@ static bool llama_eval_internal( // lm_head inpL = ggml_mul_mat(ctx0, model.output, inpL); - lctx.use_buf(ctx0, -1); + //lctx.use_buf(ctx0, -1); // logits -> probs //inpL = ggml_soft_max_inplace(ctx0, inpL); // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); - - // TODO: not needed anymore, keeping for a bit - //// lets export a smaller graph to get things rolling -- baby steps first - //{ - // struct ggml_tensor * t = ggml_get_tensor(ctx0, "mtl-check"); - // if (!t) { - // fprintf(stderr, "%s: failed to find tensor 'mtl-check'\n", __func__); - // exit(1); - // } - // ggml_build_forward_expand(&gf, t); - //} - // print - //{ - // auto print_t_f32 = [&](struct ggml_tensor * t) { - // float * data = (float *)t->data; - // printf("data: "); - // for (int i = 0; i < (int) t->ne[0]; i++) { - // printf("%f ", data[i]); - // } - // printf("\n"); - // double sum = 0.0; - // for (int i = 0; i < ggml_nelements(t); i++) { - // double cur = data[i]; - // if (isinf(cur)) continue; - // sum += data[i]; - // } - // printf("sum: %f\n", sum); - // }; - // auto print_t_f16 = [&](struct ggml_tensor * t) { - // ggml_fp16_t * data = (ggml_fp16_t *)t->data; - // printf("data: "); - // for (int i = 0; i < (int) t->ne[0]; i++) { - // printf("%f ", ggml_fp16_to_fp32(data[i])); - // } - // printf("\n"); - // double sum = 0.0; - // printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]); - // for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) { - // for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) { - // for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) { - // for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) { - // const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0]; - // const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs)); - // const float curf = ggml_fp16_to_fp32(cur); - // if (isinf(curf)) continue; - // sum += curf; - // } - // } - // } - // } - // printf("sum: %f\n", sum); - // }; - - // ggml_graph_compute(ctx0, &gf); - - // { - // auto * t = ggml_get_tensor(ctx0, "mtl-check"); - // switch (t->type) { - // case GGML_TYPE_F32: - // print_t_f32(t); - // break; - // case GGML_TYPE_F16: - // print_t_f16(t); - // break; - // default: - // fprintf(stderr, "%s: unsupported type\n", __func__); - // exit(1); - // } - // } - //} + // METAL + if (lctx.mtl_ctx) { + llama_mtl_eval(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past); + } else { + ggml_graph_compute (ctx0, &gf); + } if (cgraph_fname) { // TODO: tmp add the vocabulary as a leaf to the computation graph, until better approach is found @@ -2402,22 +2336,6 @@ struct llama_context * llama_init_from_file( ggml_type memory_type = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; - // METAL - if (params.cgraph) { - params.vocab_only = true; - - // load the compute graph - struct ggml_context * ctx_data = NULL; - struct ggml_context * ctx_eval = NULL; - - struct ggml_cgraph gf = ggml_graph_import("llama.ggml", &ctx_data, &ctx_eval); - gf.n_threads = 1; - - // this allocates all Metal resources and memory buffers - ctx->mtl_ctx = llama_mtl_init(ctx_data, ctx_eval, NULL, &gf); - ctx->mtl_gf = gf; - } - if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { @@ -2438,11 +2356,7 @@ struct llama_context * llama_init_from_file( const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v); fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } - } - // METAL - // TODO: changed the behavior here for vocab_only -- reconsider implications later - { const auto & hparams = ctx->model.hparams; // resized during inference @@ -2462,6 +2376,20 @@ struct llama_context * llama_init_from_file( ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); } + // METAL + if (params.cgraph) { + // this allocates all Metal resources and memory buffers + //ctx->mtl_ctx = llama_mtl_init(ctx_data, ctx_eval, &gf); + ctx->mtl_ctx = llama_mtl_init( + ggml_get_mem_buffer(ctx->model.ctx), + ggml_get_mem_size (ctx->model.ctx), + ctx->buf_compute.addr, + ctx->buf_compute.size, + ctx->model.kv_self.buf.addr, + ctx->model.kv_self.buf.size, + 32*ctx->model.hparams.n_vocab*sizeof(float)); + } + return ctx; } @@ -3077,25 +3005,9 @@ int llama_eval( int n_tokens, int n_past, int n_threads) { - // METAL - if (ctx->mtl_ctx) { - llama_mtl_eval(ctx->mtl_ctx, &ctx->mtl_gf, tokens, n_tokens, n_past); - - const float * logits = llama_mtl_get_logits(ctx->mtl_ctx); - - // extract logits - { - const int n_vocab = ctx->model.hparams.n_vocab; - auto & logits_out = ctx->logits; - - logits_out.resize(n_vocab); - memcpy(logits_out.data(), logits, sizeof(float)*n_vocab); - } - } else { - if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) { - fprintf(stderr, "%s: failed to eval\n", __func__); - return 1; - } + if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) { + fprintf(stderr, "%s: failed to eval\n", __func__); + return 1; } // get a more accurate load time, upon first eval From 18e482a89c3bd5513235b2a98998111e5fe177a2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 4 Jun 2023 09:27:27 +0300 Subject: [PATCH 41/49] mtl : preparing for merge --- .gitignore | 1 + CMakeLists.txt | 61 +++++++++++------ examples/CMakeLists.txt | 6 +- examples/common.cpp | 4 -- examples/common.h | 1 - examples/mtl/CMakeLists.txt | 31 +-------- examples/mtl/mtl.cpp | 10 +-- examples/mtl/mtl.h => ggml-mtl.h | 8 +-- examples/mtl/mtl.m => ggml-mtl.m | 83 ++++++------------------ examples/mtl/mtl.metal => ggml-mtl.metal | 0 ggml.h | 2 +- llama.cpp | 28 ++++---- llama.h | 3 +- 13 files changed, 94 insertions(+), 144 deletions(-) rename examples/mtl/mtl.h => ggml-mtl.h (77%) rename examples/mtl/mtl.m => ggml-mtl.m (92%) rename examples/mtl/mtl.metal => ggml-mtl.metal (100%) diff --git a/.gitignore b/.gitignore index d231f3ff8ed36..edcb6b1443319 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ build-release/ build-static/ build-cublas/ build-opencl/ +build-mtl/ build-no-accel/ build-sanitize-addr/ build-sanitize-thread/ 
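For context, the ggml_mtl interface that this commit renames (llama_mtl_* becomes ggml_mtl_*) is driven roughly as follows. This is a minimal sketch assembled from the example code in this patch series, not an additional hunk of the patch; "llama.ggml", n_vocab, tokens, n_tokens and n_past stand in for the caller's own values.

    // import a previously exported computation graph
    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;

    struct ggml_cgraph gf = ggml_graph_import("llama.ggml", &ctx_data, &ctx_eval);

    // hand the backing memory to the Metal context
    struct ggml_mtl_context * ctx_mtl = ggml_mtl_init(
        ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data), // weights
        ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval), // intermediate results
        NULL, 0,                                                    // optional KV-cache buffer
        32*n_vocab*sizeof(float));                                  // output (logits) buffer

    // evaluate the graph on the GPU for a batch of tokens
    ggml_mtl_graph_compute(ctx_mtl, &gf, tokens, n_tokens, n_past);

    ggml_mtl_free(ctx_mtl);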
diff --git a/CMakeLists.txt b/CMakeLists.txt index bc23c2c5b5d80..62f1467aa03e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,13 +64,14 @@ if (NOT MSVC) endif() # 3rd party libs -option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) -option(LLAMA_BLAS "llama: use BLAS" OFF) +option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) +option(LLAMA_BLAS "llama: use BLAS" OFF) set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") -option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) -set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") -set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") -option(LLAMA_CLBLAST "llama: use CLBlast" OFF) +option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) +set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") +set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") +option(LLAMA_CLBLAST "llama: use CLBlast" OFF) +option(LLAMA_METAL "llama: use Metal" OFF) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) @@ -183,7 +184,7 @@ if (LLAMA_CUBLAS) enable_language(CUDA) - set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h) + set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h) add_compile_definitions(GGML_USE_CUBLAS) add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) @@ -200,12 +201,37 @@ if (LLAMA_CUBLAS) endif() endif() +if (LLAMA_METAL) + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED) + + set(GGML_SOURCES_METAL ggml-mtl.m ggml-mtl.h) + + add_compile_definitions(GGML_USE_METAL) + add_compile_definitions(GGML_METAL_NDEBUG) + + # get full path to the file + #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") + + # copy ggml-mtl.metal to bin directory + configure_file(ggml-mtl.metal bin/ggml-mtl.metal COPYONLY) + + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} + ${FOUNDATION_LIBRARY} + ${METAL_FRAMEWORK} + ${METALKIT_FRAMEWORK} + ${METALPERFORMANCE_FRAMEWORK} + ) +endif() + if (LLAMA_CLBLAST) find_package(CLBlast) if (CLBlast_FOUND) message(STATUS "CLBlast found") - set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h) + set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h) add_compile_definitions(GGML_USE_CLBLAST) @@ -370,8 +396,10 @@ endif() add_library(ggml OBJECT ggml.c ggml.h - ${GGML_CUDA_SOURCES} - ${GGML_OPENCL_SOURCES}) + ${GGML_SOURCES_CUDA} + ${GGML_SOURCES_OPENCL} + ${GGML_SOURCES_METAL} + ) target_include_directories(ggml PUBLIC .) target_compile_features(ggml PUBLIC c_std_11) # don't bump @@ -385,8 +413,6 @@ add_library(llama llama.cpp llama.h llama-util.h - examples/mtl/mtl.h # TODO: METAL TMP - examples/mtl/mtl.m # TODO: METAL TMP ) target_include_directories(llama PUBLIC .) 
@@ -394,22 +420,17 @@ target_compile_features(llama PUBLIC cxx_std_11) # don't bump target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS} - ${FOUNDATION_LIBRARY} # TODO: METAL TMP - ${METAL_FRAMEWORK} # TODO: METAL TMP - ${METALKIT_FRAMEWORK} # TODO: METAL TMP - ${METALPERFORMANCE_FRAMEWORK} # TODO: METAL TMP ) -target_compile_definitions(llama PRIVATE LLAMA_MTL_NDEBUG) # TODO: METAL TMP if (BUILD_SHARED_LIBS) set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) endif() -if (GGML_CUDA_SOURCES) +if (GGML_SOURCES_CUDA) message(STATUS "GGML CUDA sources found, configuring CUDA architecture") - set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF) - set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") + set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF) + set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF) endif() diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 97a3ffd1b6db7..e23bf1cb3d9f2 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -37,8 +37,10 @@ else() add_subdirectory(save-load-state) add_subdirectory(benchmark) add_subdirectory(baby-llama) - add_subdirectory(mtl) - if(LLAMA_BUILD_SERVER) + if (LLAMA_METAL) + add_subdirectory(mtl) + endif() + if (LLAMA_BUILD_SERVER) add_subdirectory(server) endif() endif() diff --git a/examples/common.cpp b/examples/common.cpp index 53e9200fa90c4..b5810f28f4901 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -301,8 +301,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.mem_test = true; } else if (arg == "--export") { params.export_cgraph = true; - } else if (arg == "--import") { - params.import_cgraph = true; } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { @@ -443,7 +441,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #endif fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); - fprintf(stderr, " --import import a computation graph from 'llama.ggml'\n"); fprintf(stderr, " --verbose-prompt print prompt before generation\n"); fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); @@ -493,7 +490,6 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) { lparams.use_mlock = params.use_mlock; lparams.logits_all = params.perplexity; lparams.embedding = params.embedding; - lparams.cgraph = params.import_cgraph; llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams); diff --git a/examples/common.h b/examples/common.h index c7d4d6e0e7bdf..66bdeb5e9287d 100644 --- a/examples/common.h +++ b/examples/common.h @@ -72,7 +72,6 @@ struct gpt_params { bool use_mlock = false; // use mlock to keep model in memory bool mem_test = false; // compute maximum memory usage bool export_cgraph = false; // export the computation graph - bool import_cgraph = false; // import a computation graph bool verbose_prompt = false; // print prompt tokens before generation }; diff --git a/examples/mtl/CMakeLists.txt b/examples/mtl/CMakeLists.txt index 1de83a1b62fd4..0fe3a7197faf8 100644 --- a/examples/mtl/CMakeLists.txt +++ 
b/examples/mtl/CMakeLists.txt @@ -1,33 +1,6 @@ if (APPLE) - # - # mtl - - find_library(FOUNDATION_LIBRARY Foundation REQUIRED) - find_library(METAL_FRAMEWORK Metal REQUIRED) - find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) - find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED) - set(TEST_TARGET mtl) - add_executable(${TEST_TARGET} mtl.cpp mtl.h mtl.m) - target_link_libraries(${TEST_TARGET} PRIVATE - ggml - ${FOUNDATION_LIBRARY} - ${METAL_FRAMEWORK} - ${METALKIT_FRAMEWORK} - ${METALPERFORMANCE_FRAMEWORK} - ) - - # TODO: temporary until the kernels are ready - # custom command to build mtl.metal into a library - # depends on the mtl.metal file - add_custom_target(mtl.metallib-tmp ALL DEPENDS ${CMAKE_BINARY_DIR}/mtl.metallib) - - add_custom_command( - OUTPUT ${CMAKE_BINARY_DIR}/mtl.metallib - COMMAND xcrun -sdk macosx metal -c ${CMAKE_CURRENT_SOURCE_DIR}/mtl.metal -o ${CMAKE_BINARY_DIR}/mtl.air - COMMAND xcrun -sdk macosx metallib ${CMAKE_BINARY_DIR}/mtl.air -o ${CMAKE_BINARY_DIR}/mtl.metallib - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/mtl.metal - COMMENT "Building mtl.metallib" - ) + add_executable(${TEST_TARGET} mtl.cpp) + target_link_libraries(${TEST_TARGET} PRIVATE ggml) endif() diff --git a/examples/mtl/mtl.cpp b/examples/mtl/mtl.cpp index 7411ea9325d34..38297666792a2 100644 --- a/examples/mtl/mtl.cpp +++ b/examples/mtl/mtl.cpp @@ -1,5 +1,5 @@ #include "ggml.h" -#include "mtl.h" +#include "ggml-mtl.h" #include #include @@ -51,7 +51,7 @@ int main(int argc, char ** argv) { } // this allocates all Metal resources and memory buffers - auto * ctx_mtl = llama_mtl_init( + auto * ctx_mtl = ggml_mtl_init( ggml_get_mem_buffer(ctx_data), ggml_get_mem_size (ctx_data), ggml_get_mem_buffer(ctx_eval), @@ -67,7 +67,7 @@ int main(int argc, char ** argv) { const std::vector tmp(n_batch, 1); // BOS // warmup - llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past); + ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past); const int n_iter = 16; @@ -75,7 +75,7 @@ int main(int argc, char ** argv) { // the actual inference happens here for (int i = 0; i < n_iter; ++i) { - llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past); + ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past); } const int64_t t1 = ggml_time_us(); @@ -83,7 +83,7 @@ int main(int argc, char ** argv) { printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter); } - llama_mtl_free(ctx_mtl); + ggml_mtl_free(ctx_mtl); ggml_free(ctx_data); ggml_free(ctx_eval); diff --git a/examples/mtl/mtl.h b/ggml-mtl.h similarity index 77% rename from examples/mtl/mtl.h rename to ggml-mtl.h index ff92a6a7baf76..15256b27db15a 100644 --- a/examples/mtl/mtl.h +++ b/ggml-mtl.h @@ -11,7 +11,7 @@ extern "C" { struct ggml_mtl_context; -struct ggml_mtl_context * llama_mtl_init( +struct ggml_mtl_context * ggml_mtl_init( void * data_buf, size_t data_size, void * eval_buf, @@ -20,18 +20,16 @@ struct ggml_mtl_context * llama_mtl_init( size_t cach_size, size_t outp_size); -void llama_mtl_free(struct ggml_mtl_context * ctx); +void ggml_mtl_free(struct ggml_mtl_context * ctx); // return 0 on success -int llama_mtl_eval( +int ggml_mtl_graph_compute( struct ggml_mtl_context * ctx, struct ggml_cgraph * gf, const int * tokens, int n_tokens, int n_past); -float * llama_mtl_get_logits(struct ggml_mtl_context * ctx); - #ifdef __cplusplus } #endif diff --git a/examples/mtl/mtl.m b/ggml-mtl.m similarity index 92% rename from examples/mtl/mtl.m rename to ggml-mtl.m index 
b8fd1c144a8ac..ecbb1a18853ab 100644 --- a/examples/mtl/mtl.m +++ b/ggml-mtl.m @@ -1,4 +1,4 @@ -#import "mtl.h" +#import "ggml-mtl.h" #import "ggml.h" @@ -6,7 +6,7 @@ #import #import -#ifdef LLAMA_MTL_NDEBUG +#ifdef GGML_METAL_NDEBUG #define mtl_printf(...) #else #define mtl_printf(...) fprintf(stderr, __VA_ARGS__) @@ -85,9 +85,9 @@ // MSL code // TODO: move the contents here when ready // for now it is easier to work in a separate file -NSString * const msl_library_llama = @"see mtl.metal"; +NSString * const msl_library_source = @"see mtl.metal"; -struct ggml_mtl_context * llama_mtl_init( +struct ggml_mtl_context * ggml_mtl_init( void * data_buf, size_t data_size, void * eval_buf, @@ -122,7 +122,7 @@ { NSError * error = nil; - ctx->library = [ctx->device newLibraryWithSource:msl_library_llama options:nil error:&error]; + ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error]; if (error) { fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); exit(1); @@ -133,7 +133,10 @@ { NSError * error = nil; - NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/mtl/mtl" ofType:@"metal"]; + //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/mtl/mtl" ofType:@"metal"]; + NSString * path = [[NSBundle mainBundle] pathForResource:@"ggml-mtl" ofType:@"metal"]; + fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]); + NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; if (error) { fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); @@ -220,7 +223,7 @@ // TODO: how to use MTLStorageModeManaged? // TODO: see if we can avoid this copy somehow { - void * mem_buffer = data_buf; + const void * mem_buffer = data_buf; const size_t mem_size = data_size; //ctx->buffer_data = [ctx->device newBufferWithBytesNoCopy:mem_buffer length:mem_size options:MTLResourceStorageModeShared deallocator:nil]; @@ -261,18 +264,20 @@ return ctx; } -void llama_mtl_free(struct ggml_mtl_context * ctx) { +void ggml_mtl_free(struct ggml_mtl_context * ctx) { fprintf(stderr, "%s: deallocating\n", __func__); free(ctx); } // get data / eval buffer + offset -id llama_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) { +id ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) { const int64_t offs_data = (int64_t) t->data - (int64_t) ctx->data_buf; const int64_t offs_eval = (int64_t) t->data - (int64_t) ctx->eval_buf; const int64_t offs_cach = (int64_t) t->data - (int64_t) ctx->cach_buf; + //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + //const size_t t_size = ggml_nbytes(t); id result; @@ -317,7 +322,7 @@ void llama_mtl_free(struct ggml_mtl_context * ctx) { return result; } -int llama_mtl_eval( +int ggml_mtl_graph_compute( struct ggml_mtl_context * ctx, struct ggml_cgraph * gf, const int * tokens, @@ -336,7 +341,7 @@ int llama_mtl_eval( { struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd"); - id id_dst = llama_mtl_get_buffer(ctx, embd, &offs_src0); + id id_dst = ggml_mtl_get_buffer(ctx, embd, &offs_src0); memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd)); } @@ -385,9 +390,9 @@ int llama_mtl_eval( const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; const enum ggml_type dstt = dst ? 
dst->type : GGML_TYPE_COUNT; - id id_src0 = src0 ? llama_mtl_get_buffer(ctx, src0, &offs_src0) : nil; - id id_src1 = src1 ? llama_mtl_get_buffer(ctx, src1, &offs_src1) : nil; - id id_dst = dst ? llama_mtl_get_buffer(ctx, dst, &offs_dst) : nil; + id id_src0 = src0 ? ggml_mtl_get_buffer(ctx, src0, &offs_src0) : nil; + id id_src1 = src1 ? ggml_mtl_get_buffer(ctx, src1, &offs_src1) : nil; + id id_dst = dst ? ggml_mtl_get_buffer(ctx, dst, &offs_dst) : nil; //mtl_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op)); //if (src0) { @@ -775,7 +780,7 @@ int llama_mtl_eval( struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1]; - id id_src = llama_mtl_get_buffer(ctx, out, &offs_src0); + id id_src = ggml_mtl_get_buffer(ctx, out, &offs_src0); id id_dst = ctx->out; id encoder_blit = [command_buffer blitCommandEncoder]; @@ -817,53 +822,5 @@ int llama_mtl_eval( mtl_printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax); #endif - //{ - // struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check"); - // if (t->type == GGML_TYPE_F32) { - // const const float * data = (float *) ctx->out.contents; - // printf("data: "); - // for (int i = 0; i < (int) t->ne[0]; i++) { - // printf("%f ", data[i]); - // } - // printf("\n"); - // double sum = 0.0; - // for (int i = 0; i < ggml_nelements(t); i++) { - // double cur = data[i]; - // if (isinf(cur)) continue; - // sum += cur; - // } - // printf("sum: %f\n", sum); - // } else if (t->type == GGML_TYPE_F16) { - // ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents; - // printf("data: "); - // for (int i = 0; i < (int) t->ne[0]; i++) { - // printf("%f ", ggml_fp16_to_fp32(data[i])); - // } - // printf("\n"); - // double sum = 0.0; - // printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]); - // for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) { - // for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) { - // for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) { - // for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) { - // const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0]; - // const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs)); - // const float curf = ggml_fp16_to_fp32(cur); - // if (isinf(curf)) continue; - // sum += curf; - // } - // } - // } - // } - // printf("sum: %f\n", sum); - // } else { - // GGML_ASSERT(false && "not implemented"); - // } - //} - return 0; } - -float * llama_mtl_get_logits(struct ggml_mtl_context * ctx) { - return ctx->logits; -} diff --git a/examples/mtl/mtl.metal b/ggml-mtl.metal similarity index 100% rename from examples/mtl/mtl.metal rename to ggml-mtl.metal diff --git a/ggml.h b/ggml.h index 7f821cf32e966..2ea87ce9a9749 100644 --- a/ggml.h +++ b/ggml.h @@ -451,7 +451,7 @@ extern "C" { // main GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); - GGML_API void ggml_free(struct ggml_context * ctx); + GGML_API void ggml_free(struct ggml_context * ctx); GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); diff --git a/llama.cpp b/llama.cpp index 24b9d633b1303..5cd39b612ff17 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9,9 +9,6 @@ #include "llama-util.h" #include "llama.h" -// METAL -#include "examples/mtl/mtl.h" - #include "ggml.h" #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" @@ -19,6 +16,10 @@ #include "ggml-opencl.h" #endif +#ifdef GGML_USE_METAL +#include "ggml-mtl.h" +#endif + #include #include #include @@ -241,8 +242,9 @@ struct llama_context { llama_ctx_buffer buf_compute; llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; - // METAL +#ifdef 
GGML_USE_METAL ggml_mtl_context * mtl_ctx = NULL; +#endif int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; @@ -842,7 +844,6 @@ struct llama_context_params llama_context_default_params() { /*.use_mmap =*/ true, /*.use_mlock =*/ false, /*.embedding =*/ false, - /*.cgraph =*/ false, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, }; @@ -1442,12 +1443,15 @@ static bool llama_eval_internal( // run the computation ggml_build_forward_expand(&gf, inpL); - // METAL +#ifdef GGML_USE_METAL if (lctx.mtl_ctx) { - llama_mtl_eval(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past); + ggml_mtl_graph_compute(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past); } else { - ggml_graph_compute (ctx0, &gf); + ggml_graph_compute(ctx0, &gf); } +#else + ggml_graph_compute(ctx0, &gf); +#endif if (cgraph_fname) { // TODO: tmp add the vocabulary as a leaf to the computation graph, until better approach is found @@ -2376,11 +2380,10 @@ struct llama_context * llama_init_from_file( ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); } - // METAL - if (params.cgraph) { +#ifdef GGML_USE_METAL + if (params.n_gpu_layers > 0) { // this allocates all Metal resources and memory buffers - //ctx->mtl_ctx = llama_mtl_init(ctx_data, ctx_eval, &gf); - ctx->mtl_ctx = llama_mtl_init( + ctx->mtl_ctx = ggml_mtl_init( ggml_get_mem_buffer(ctx->model.ctx), ggml_get_mem_size (ctx->model.ctx), ctx->buf_compute.addr, @@ -2389,6 +2392,7 @@ struct llama_context * llama_init_from_file( ctx->model.kv_self.buf.size, 32*ctx->model.hparams.n_vocab*sizeof(float)); } +#endif return ctx; } diff --git a/llama.h b/llama.h index faaca2637726d..a650ddf4501e8 100644 --- a/llama.h +++ b/llama.h @@ -31,7 +31,7 @@ #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_VERSION 1 -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. 
#define LLAMA_SUPPORTS_GPU_OFFLOAD #endif @@ -75,7 +75,6 @@ extern "C" { bool use_mmap; // use mmap if possible bool use_mlock; // force system to keep model in RAM bool embedding; // embedding mode only - bool cgraph; // try to load computation graph from "llama.ggml" (METAL) // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; From e4b522232c0381913733e4ec7c914d6df894294e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 4 Jun 2023 10:38:21 +0300 Subject: [PATCH 42/49] mtl : clean-up ggml mtl interface + suport scratch / inplace --- examples/mtl/mtl.cpp | 48 ++++++--- ggml-mtl.h | 34 +++--- ggml-mtl.m | 242 +++++++++++++------------------------------ llama.cpp | 73 ++++++++----- 4 files changed, 174 insertions(+), 223 deletions(-) diff --git a/examples/mtl/mtl.cpp b/examples/mtl/mtl.cpp index 38297666792a2..e527f285632bd 100644 --- a/examples/mtl/mtl.cpp +++ b/examples/mtl/mtl.cpp @@ -51,23 +51,22 @@ int main(int argc, char ** argv) { } // this allocates all Metal resources and memory buffers - auto * ctx_mtl = ggml_mtl_init( - ggml_get_mem_buffer(ctx_data), - ggml_get_mem_size (ctx_data), - ggml_get_mem_buffer(ctx_eval), - ggml_get_mem_size (ctx_eval), - NULL, 0, // cache - 32*n_vocab*sizeof(float)); + auto * ctx_mtl = ggml_mtl_init(); + + ggml_mtl_add_buffer(ctx_mtl, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data)); + ggml_mtl_add_buffer(ctx_mtl, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval)); // TODO: tmp to match the input used when creating the cgraph { - const int n_batch = 1; - const int n_past = 512 - n_batch; + const std::vector tmp(1, 1); // BOS + + struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd"); + memcpy(input->data, tmp.data(), tmp.size() * sizeof(int)); - const std::vector tmp(n_batch, 1); // BOS + ggml_mtl_set_tensor(ctx_mtl, input); // warmup - ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past); + ggml_mtl_graph_compute(ctx_mtl, &gf); const int n_iter = 16; @@ -75,7 +74,7 @@ int main(int argc, char ** argv) { // the actual inference happens here for (int i = 0; i < n_iter; ++i) { - ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past); + ggml_mtl_graph_compute(ctx_mtl, &gf); } const int64_t t1 = ggml_time_us(); @@ -83,6 +82,31 @@ int main(int argc, char ** argv) { printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter); } + // debug output + { + struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1]; + ggml_mtl_get_tensor(ctx_mtl, logits); + + float * ptr = (float *) ggml_get_data(logits); + + printf("logits: "); + for (int i = 0; i < 10; i++) { + printf("%8.4f ", ptr[i]); + } + printf("\n"); + int imax = 0; + double sum = 0.0; + double vmax = -1e9; + for (int i = 0; i < 32000; i++) { + sum += (double) ptr[i]; + if (ptr[i] > vmax) { + vmax = ptr[i]; + imax = i; + } + } + printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax); + } + ggml_mtl_free(ctx_mtl); ggml_free(ctx_data); diff --git a/ggml-mtl.h b/ggml-mtl.h index 15256b27db15a..cab71e3862290 100644 --- a/ggml-mtl.h +++ b/ggml-mtl.h @@ -2,7 +2,9 @@ #include -struct ggml_context; +#define GGML_METAL_MAX_BUFFERS 16 + +struct ggml_tensor; struct ggml_cgraph; #ifdef __cplusplus @@ -11,24 +13,30 @@ extern "C" { struct ggml_mtl_context; -struct ggml_mtl_context * ggml_mtl_init( - void * data_buf, - size_t data_size, - void * eval_buf, - size_t eval_size, - void * cach_buf, - size_t cach_size, - size_t outp_size); +struct 
ggml_mtl_context * ggml_mtl_init(void); void ggml_mtl_free(struct ggml_mtl_context * ctx); +void ggml_mtl_add_buffer( + struct ggml_mtl_context * ctx, + const char * name, + void * data, + size_t size); + +// set data from host memory into the device +void ggml_mtl_set_tensor( + struct ggml_mtl_context * ctx, + struct ggml_tensor * t); + +// get data from the device into host memory +void ggml_mtl_get_tensor( + struct ggml_mtl_context * ctx, + struct ggml_tensor * t); + // return 0 on success int ggml_mtl_graph_compute( struct ggml_mtl_context * ctx, - struct ggml_cgraph * gf, - const int * tokens, - int n_tokens, - int n_past); + struct ggml_cgraph * gf); #ifdef __cplusplus } diff --git a/ggml-mtl.m b/ggml-mtl.m index ecbb1a18853ab..8f831afe7f13a 100644 --- a/ggml-mtl.m +++ b/ggml-mtl.m @@ -13,26 +13,24 @@ #endif //#define mtl_printf(...) -struct ggml_mtl_context { - void * data_buf; - size_t data_size; - void * eval_buf; - size_t eval_size; - void * cach_buf; - size_t cach_size; - size_t outp_size; +struct ggml_mtl_buffer { + const char * name; + + void * data; + size_t size; + id mtl; +}; + +struct ggml_mtl_context { float * logits; id device; id queue; id library; - id buffer_data; - id buffer_eval; - id buffer_cach; - - id out; + int n_buffers; + struct ggml_mtl_buffer buffers[GGML_METAL_MAX_BUFFERS]; // custom kernels id function_add; @@ -87,25 +85,11 @@ // for now it is easier to work in a separate file NSString * const msl_library_source = @"see mtl.metal"; -struct ggml_mtl_context * ggml_mtl_init( - void * data_buf, - size_t data_size, - void * eval_buf, - size_t eval_size, - void * cach_buf, - size_t cach_size, - size_t outp_size) { +struct ggml_mtl_context * ggml_mtl_init(void) { fprintf(stderr, "%s: allocating\n", __func__); struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context)); - ctx->data_buf = data_buf; - ctx->data_size = data_size; - ctx->eval_buf = eval_buf; - ctx->eval_size = eval_size; - ctx->cach_buf = cach_buf; - ctx->cach_size = cach_size; - ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; @@ -216,51 +200,6 @@ fprintf(stderr, "%s: loaded kernel_cpy_f32_f32: %p\n", __func__, (void *) ctx->pipeline_cpy_f32_f32); } - // MTLBuffer approach - - // pin ctx_data memory to GPU - // use MTLStorageModeShared to allow us to initialize the weights from the CPU - // TODO: how to use MTLStorageModeManaged? 
- // TODO: see if we can avoid this copy somehow - { - const void * mem_buffer = data_buf; - const size_t mem_size = data_size; - - //ctx->buffer_data = [ctx->device newBufferWithBytesNoCopy:mem_buffer length:mem_size options:MTLResourceStorageModeShared deallocator:nil]; - ctx->buffer_data = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared]; - - fprintf(stderr, "%s: allocated data buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0); - } - - // pin ctx_eval memory to GPU - // this buffer will be used for the intermediate results of the evaluation - { - const void * mem_buffer = eval_buf; - const size_t mem_size = eval_size; - - ctx->buffer_eval = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared]; - - fprintf(stderr, "%s: allocated eval buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0); - } - - if (cach_buf) { - const void * mem_buffer = cach_buf; - const size_t mem_size = cach_size; - - ctx->buffer_cach = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared]; - - fprintf(stderr, "%s: allocated cach buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0); - } - - // allocate buffer for result extraction - { - const size_t mem_size = outp_size; - - ctx->out = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; - - fprintf(stderr, "%s: allocated out buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0); - } - return ctx; } @@ -271,81 +210,80 @@ void ggml_mtl_free(struct ggml_mtl_context * ctx) { } // get data / eval buffer + offset -id ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) { - const int64_t offs_data = (int64_t) t->data - (int64_t) ctx->data_buf; - const int64_t offs_eval = (int64_t) t->data - (int64_t) ctx->eval_buf; - const int64_t offs_cach = (int64_t) t->data - (int64_t) ctx->cach_buf; - +static id ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) { //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); - //const size_t t_size = ggml_nbytes(t); + for (int i = 0; i < ctx->n_buffers; ++i) { + const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data; - id result; - size_t t_offs = 0; + if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { + *offs = (size_t) ioffs; - if ( offs_data > 0 && - (offs_eval < 0 || (offs_data < offs_eval)) && - (offs_cach < 0 || (offs_data < offs_cach)) - ) { - result = ctx->buffer_data; - t_offs = offs_data; - //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); - } + //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); - if ( offs_eval > 0 && - (offs_data < 0 || (offs_eval < offs_data)) && - (offs_cach < 0 || (offs_eval < offs_cach)) - ) { - result = ctx->buffer_eval; - t_offs = offs_eval; - //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + return ctx->buffers[i].mtl; + } } - if ( offs_cach > 0 && - (offs_data < 0 || (offs_cach < offs_data)) && - (offs_eval < 0 || (offs_cach < offs_eval)) - ) { - result = ctx->buffer_cach; - t_offs = offs_cach; - //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); - } + fprintf(stderr, 
"%s: error: buffer is nil\n", __func__); + GGML_ASSERT(false); + + return nil; +} - if (result == nil || (t_offs > ctx->data_size && t_offs > ctx->eval_size && t_offs > ctx->cach_size)) { - fprintf(stderr, "%s: error: buffer is nil\n", __func__); - GGML_ASSERT(false); +void ggml_mtl_add_buffer( + struct ggml_mtl_context * ctx, + const char * name, + void * data, + size_t size) { + if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { + fprintf(stderr, "%s: too many buffers\n", __func__); + return; } - if (offs != 0) { - *offs = t_offs; + if (data) { + ctx->buffers[ctx->n_buffers].name = name; + ctx->buffers[ctx->n_buffers].data = data; + ctx->buffers[ctx->n_buffers].size = size; + ctx->buffers[ctx->n_buffers].mtl = [ctx->device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared]; + + ++ctx->n_buffers; + + fprintf(stderr, "%s: allocated '%16s' buffer, size = %8.2f MB\n", __func__, name, size / 1024.0 / 1024.0); } +} + +void ggml_mtl_set_tensor( + struct ggml_mtl_context * ctx, + struct ggml_tensor * t) { + mtl_printf("%s: set input for tensor '%s'\n", __func__, t->name); - return result; + size_t offs; + id id_dst = ggml_mtl_get_buffer(ctx, t, &offs); + + memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t)); } -int ggml_mtl_graph_compute( +void ggml_mtl_get_tensor( struct ggml_mtl_context * ctx, - struct ggml_cgraph * gf, - const int * tokens, - int n_tokens, - int n_past) { - mtl_printf("%s: evaluating, n_tokens = %d, n_past = %d\n", __func__, n_tokens, n_past); + struct ggml_tensor * t) { + mtl_printf("%s: extract results for tensor '%s'\n", __func__, t->name); - struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd"); - memcpy(input->data, tokens, n_tokens * sizeof(int)); + size_t offs; + id id_src = ggml_mtl_get_buffer(ctx, t, &offs); + + memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t)); +} + +int ggml_mtl_graph_compute( + struct ggml_mtl_context * ctx, + struct ggml_cgraph * gf) { + mtl_printf("%s: evaluating graph\n", __func__); size_t offs_src0 = 0; size_t offs_src1 = 0; size_t offs_dst = 0; - // copy the input data to the GPU - { - struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd"); - - id id_dst = ggml_mtl_get_buffer(ctx, embd, &offs_src0); - - memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd)); - } - id command_buffer = [ctx->queue commandBuffer]; id encoder = nil; @@ -521,6 +459,8 @@ int ggml_mtl_graph_compute( encoder = [command_buffer computeCommandEncoder]; } + const int n_past = ((int32_t *)(src1->data))[0]; + [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -690,6 +630,8 @@ int ggml_mtl_graph_compute( //mtl_printf("rope: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); //mtl_printf("rope: n_past = %d, n_dims = %d, mode = %d\n", n_past, n_dims, mode); + const int n_past = ((int32_t *)(src1->data))[0]; + [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -769,23 +711,9 @@ int ggml_mtl_graph_compute( } } - // extract results from the GPU - { - mtl_printf("%s: extract results from the GPU\n", __func__); - - if (encoder != nil) { - [encoder endEncoding]; - encoder = nil; - } - - struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1]; - - id id_src = ggml_mtl_get_buffer(ctx, out, &offs_src0); - id id_dst = ctx->out; - - id 
encoder_blit = [command_buffer blitCommandEncoder]; - [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(out)]; - [encoder_blit endEncoding]; + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; } [command_buffer commit]; @@ -796,31 +724,5 @@ int ggml_mtl_graph_compute( mtl_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0); } - ctx->logits = ctx->out.contents; - - const float * logits = ctx->logits; - - struct ggml_tensor * t = gf->nodes[gf->n_nodes - 1]; - memcpy(t->data, logits, ggml_nbytes(t)); - -#if 1 - mtl_printf("logits: "); - for (int i = 0; i < 100; i++) { - mtl_printf("%8.4f ", logits[i]); - } - mtl_printf("\n"); - double sum = 0.0; - int imax = 0; - double vmax = -INFINITY; - for (int i = 0; i < 32000; i++) { - sum += (double) logits[i]; - if (logits[i] > vmax) { - vmax = logits[i]; - imax = i; - } - } - mtl_printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax); -#endif - return 0; } diff --git a/llama.cpp b/llama.cpp index 5cd39b612ff17..26722e0918f5a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1085,7 +1085,7 @@ static void llama_model_load_internal( mmapped_size - vram_total + // weights in VRAM not in memory MEM_REQ_SCRATCH0().at(model.type) + MEM_REQ_SCRATCH1().at(model.type) + - MEM_REQ_EVAL().at(model.type); + MEM_REQ_EVAL().at (model.type); // this is the memory required by one llama_state const size_t mem_required_state = @@ -1255,14 +1255,19 @@ static bool llama_eval_internal( ggml_set_name(embd, "embd"); memcpy(embd->data, tokens, N*ggml_element_size(embd)); +#ifdef GGML_USE_METAL + if (lctx.mtl_ctx) { + ggml_mtl_set_tensor(lctx.mtl_ctx, embd); + } +#endif + + struct ggml_tensor * cur; struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * cur; - - //lctx.use_buf(ctx0, 0); + lctx.use_buf(ctx0, 0); // norm { @@ -1378,7 +1383,7 @@ static bool llama_eval_internal( cur); } - //lctx.use_buf(ctx0, 1); + lctx.use_buf(ctx0, 1); struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); @@ -1416,36 +1421,36 @@ static bool llama_eval_internal( inpL = cur; } - //lctx.use_buf(ctx0, 0); + lctx.use_buf(ctx0, 0); // used at the end to optionally extract the embeddings struct ggml_tensor * embeddings = NULL; // norm { + cur = ggml_rms_norm(ctx0, inpL); - inpL = ggml_rms_norm(ctx0, inpL); + // cur = cur*norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.norm); - // inpL = inpL*norm(broadcasted) - inpL = ggml_mul(ctx0, inpL, model.norm); - - embeddings = inpL; + embeddings = cur; } // lm_head - inpL = ggml_mul_mat(ctx0, model.output, inpL); + cur = ggml_mul_mat(ctx0, model.output, cur); - //lctx.use_buf(ctx0, -1); + lctx.use_buf(ctx0, -1); // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); + //cur = ggml_soft_max_inplace(ctx0, cur); // run the computation - ggml_build_forward_expand(&gf, inpL); + ggml_build_forward_expand(&gf, cur); #ifdef GGML_USE_METAL if (lctx.mtl_ctx) { - ggml_mtl_graph_compute(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past); + ggml_mtl_graph_compute(lctx.mtl_ctx, &gf); + ggml_mtl_get_tensor(lctx.mtl_ctx, cur); } else { ggml_graph_compute(ctx0, &gf); } @@ -1498,7 +1503,7 @@ static bool llama_eval_internal( ggml_free(ctx_vocab); } - float * logits = (float *) ggml_get_data(inpL); + float * logits = (float *) ggml_get_data(cur); printf("logits: "); for (int i = 0; i < 10; i++) { @@ -1530,7 +1535,7 @@ static bool llama_eval_internal( //} 
//embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N); // update kv token count lctx.model.kv_self.n = n_past + N; @@ -1541,11 +1546,11 @@ static bool llama_eval_internal( if (lctx.logits_all) { logits_out.resize(n_vocab * N); - memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N); + memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N); } else { // return result for just the last token logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab); } } @@ -2374,7 +2379,12 @@ struct llama_context * llama_init_from_file( ctx->embedding.resize(hparams.n_embd); } +#ifdef GGML_USE_METAL + // when using Metal, we don't need the extra buffer for intermediate dequantization + ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)/100); +#else ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); +#endif ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)); ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); @@ -2383,14 +2393,21 @@ struct llama_context * llama_init_from_file( #ifdef GGML_USE_METAL if (params.n_gpu_layers > 0) { // this allocates all Metal resources and memory buffers - ctx->mtl_ctx = ggml_mtl_init( - ggml_get_mem_buffer(ctx->model.ctx), - ggml_get_mem_size (ctx->model.ctx), - ctx->buf_compute.addr, - ctx->buf_compute.size, - ctx->model.kv_self.buf.addr, - ctx->model.kv_self.buf.size, - 32*ctx->model.hparams.n_vocab*sizeof(float)); + if (params.use_mmap) { + ctx->mtl_ctx = ggml_mtl_init(); + ggml_mtl_add_buffer(ctx->mtl_ctx, "data", ctx->model.mapping->addr, ctx->model.mapping->size); + ggml_mtl_add_buffer(ctx->mtl_ctx, "eval", ctx->buf_compute.addr, ctx->buf_compute.size); + ggml_mtl_add_buffer(ctx->mtl_ctx, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size); + ggml_mtl_add_buffer(ctx->mtl_ctx, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size); + ggml_mtl_add_buffer(ctx->mtl_ctx, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size); + } else { + ctx->mtl_ctx = ggml_mtl_init(); + ggml_mtl_add_buffer(ctx->mtl_ctx, "data", ggml_get_mem_buffer(ctx->model.ctx), ggml_get_mem_size(ctx->model.ctx)); + ggml_mtl_add_buffer(ctx->mtl_ctx, "eval", ctx->buf_compute.addr, ctx->buf_compute.size); + ggml_mtl_add_buffer(ctx->mtl_ctx, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size); + ggml_mtl_add_buffer(ctx->mtl_ctx, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size); + ggml_mtl_add_buffer(ctx->mtl_ctx, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size); + } } #endif From e26cd6b483c195a45c64e2f25c315269ada827a2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 4 Jun 2023 11:23:36 +0300 Subject: [PATCH 43/49] mtl : remove temp / debug code --- examples/mtl/mtl.cpp | 34 ++---------------------- llama.cpp | 63 +------------------------------------------- 2 files changed, 3 insertions(+), 94 deletions(-) diff --git a/examples/mtl/mtl.cpp b/examples/mtl/mtl.cpp index e527f285632bd..56510904c32df 100644 --- a/examples/mtl/mtl.cpp +++ b/examples/mtl/mtl.cpp @@ -5,8 +5,6 @@ #include #include -#include // tmp - int main(int argc, char ** argv) { ggml_time_init(); @@ -24,44 +22,16 @@ int main(int argc, char ** argv) { struct ggml_cgraph gf = 
ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); gf.n_threads = 1; - int32_t n_vocab = 0; - - { - struct ggml_tensor * t_vocab = ggml_graph_get_tensor(&gf, "vocab"); - if (t_vocab == NULL) { - fprintf(stderr, "%s: vocab tensor not found\n", __func__); - return -1; - } - - const char * ptr = (const char *) t_vocab->data; - - memcpy(&n_vocab, ptr, sizeof(n_vocab)); ptr += sizeof(n_vocab); - - printf("%s: n_vocab = %d\n", __func__, n_vocab); - - for (int i = 0; i < 512; ++i) { - char text[32]; - float score; - - memcpy(text, ptr, sizeof(text)); ptr += sizeof(text); - memcpy(&score, ptr, sizeof(score)); ptr += sizeof(score); - - printf("%s: token[%4d] = %16.*s, score = %6.2f\n", __func__, i, (int) sizeof(text), text, score); - } - } - // this allocates all Metal resources and memory buffers auto * ctx_mtl = ggml_mtl_init(); ggml_mtl_add_buffer(ctx_mtl, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data)); ggml_mtl_add_buffer(ctx_mtl, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval)); - // TODO: tmp to match the input used when creating the cgraph + // main { - const std::vector tmp(1, 1); // BOS - struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd"); - memcpy(input->data, tmp.data(), tmp.size() * sizeof(int)); + *(int32_t *) input->data = 1; // BOS ggml_mtl_set_tensor(ctx_mtl, input); diff --git a/llama.cpp b/llama.cpp index 26722e0918f5a..455402a4e0b7a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1459,68 +1459,7 @@ static bool llama_eval_internal( #endif if (cgraph_fname) { - // TODO: tmp add the vocabulary as a leaf to the computation graph, until better approach is found - { - char tmp[32]; // max token length - - // store null-terminated string for simplicity - std::vector buf_vocab(sizeof(int32_t) + n_vocab*(32 + sizeof(float))); - - uint64_t offs = 0; - - { - const int32_t n = n_vocab; - memcpy(&buf_vocab[offs], &n, sizeof(n)); offs += sizeof(n); - } - - for (int i = 0; i < n_vocab; i++) { - const int32_t id = i; - - const float score = lctx.vocab.id_to_token[id].score; - const std::string text = lctx.vocab.id_to_token[id].tok; - - snprintf(tmp, sizeof(tmp), "%s", text.c_str()); - - memcpy(&buf_vocab[offs], tmp, 32); offs += 32; - memcpy(&buf_vocab[offs], &score, sizeof(score)); offs += sizeof(score); - } - - struct ggml_init_params params; - params.mem_size = ggml_tensor_overhead(); - params.mem_buffer = NULL; - params.no_alloc = true; - - ggml_context * ctx_vocab = ggml_init(params); - - struct ggml_tensor * t_vocab = ggml_new_tensor_1d(ctx_vocab, GGML_TYPE_I8, buf_vocab.size()); - t_vocab->data = buf_vocab.data(); - ggml_set_name(t_vocab, "vocab"); - - gf.leafs[gf.n_leafs++] = t_vocab; - - ggml_graph_export(&gf, cgraph_fname); - - ggml_free(ctx_vocab); - } - - float * logits = (float *) ggml_get_data(cur); - - printf("logits: "); - for (int i = 0; i < 10; i++) { - printf("%8.4f ", logits[i]); - } - printf("\n"); - double sum = 0.0; - int imax = 0; - double vmax = -INFINITY; - for (int i = 0; i < 32000; i++) { - sum += (double) logits[i]; - if (logits[i] > vmax) { - vmax = logits[i]; - imax = i; - } - } - printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax); + ggml_graph_export(&gf, cgraph_fname); } #ifdef GGML_PERF From a7fb899c53013f6a9b776a073f93272f6954805b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 4 Jun 2023 17:57:02 +0300 Subject: [PATCH 44/49] metal : final refactoring and simplification --- .gitignore | 2 +- CMakeLists.txt | 6 +- Makefile | 33 ++- examples/CMakeLists.txt | 2 +- examples/metal/CMakeLists.txt | 
3 + examples/{mtl/mtl.cpp => metal/metal.cpp} | 18 +- examples/mtl/CMakeLists.txt | 6 - ggml-metal.h | 63 +++++ ggml-mtl.m => ggml-metal.m | 282 +++++++++------------- ggml-mtl.metal => ggml-metal.metal | 13 - ggml-mtl.h | 44 ---- llama.cpp | 57 +++-- 12 files changed, 253 insertions(+), 276 deletions(-) create mode 100644 examples/metal/CMakeLists.txt rename examples/{mtl/mtl.cpp => metal/metal.cpp} (77%) delete mode 100644 examples/mtl/CMakeLists.txt create mode 100644 ggml-metal.h rename ggml-mtl.m => ggml-metal.m (70%) rename ggml-mtl.metal => ggml-metal.metal (97%) delete mode 100644 ggml-mtl.h diff --git a/.gitignore b/.gitignore index edcb6b1443319..e4561ad7344c2 100644 --- a/.gitignore +++ b/.gitignore @@ -17,7 +17,7 @@ build-release/ build-static/ build-cublas/ build-opencl/ -build-mtl/ +build-metal/ build-no-accel/ build-sanitize-addr/ build-sanitize-thread/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 62f1467aa03e1..1f2e78c0ffba4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -207,7 +207,7 @@ if (LLAMA_METAL) find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED) - set(GGML_SOURCES_METAL ggml-mtl.m ggml-mtl.h) + set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h) add_compile_definitions(GGML_USE_METAL) add_compile_definitions(GGML_METAL_NDEBUG) @@ -215,8 +215,8 @@ if (LLAMA_METAL) # get full path to the file #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") - # copy ggml-mtl.metal to bin directory - configure_file(ggml-mtl.metal bin/ggml-mtl.metal COPYONLY) + # copy ggml-metal.metal to bin directory + configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${FOUNDATION_LIBRARY} diff --git a/Makefile b/Makefile index 8e8d426c5d6bf..1f910c3ec8629 100644 --- a/Makefile +++ b/Makefile @@ -105,6 +105,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686)) #CFLAGS += -mfma -mf16c -mavx #CXXFLAGS += -mfma -mf16c -mavx endif + ifneq ($(filter ppc64%,$(UNAME_M)),) POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) ifneq (,$(findstring POWER9,$(POWER9_M))) @@ -116,6 +117,7 @@ ifneq ($(filter ppc64%,$(UNAME_M)),) CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN endif endif + ifndef LLAMA_NO_ACCELERATE # Mac M1 - include Accelerate framework. # `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time). 
@@ -123,7 +125,8 @@ ifndef LLAMA_NO_ACCELERATE CFLAGS += -DGGML_USE_ACCELERATE LDFLAGS += -framework Accelerate endif -endif +endif # LLAMA_NO_ACCELERATE + ifdef LLAMA_OPENBLAS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),) @@ -131,11 +134,13 @@ ifdef LLAMA_OPENBLAS else LDFLAGS += -lopenblas endif -endif +endif # LLAMA_OPENBLAS + ifdef LLAMA_BLIS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis LDFLAGS += -lblis -L/usr/local/lib -endif +endif # LLAMA_BLIS + ifdef LLAMA_CUBLAS CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include @@ -156,9 +161,10 @@ endif # LLAMA_CUDA_DMMV_Y ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@ endif # LLAMA_CUBLAS + ifdef LLAMA_CLBLAST - CFLAGS += -DGGML_USE_CLBLAST - CXXFLAGS += -DGGML_USE_CLBLAST + CFLAGS += -DGGML_USE_CLBLAST + CXXFLAGS += -DGGML_USE_CLBLAST # Mac provides OpenCL as a framework ifeq ($(UNAME_S),Darwin) LDFLAGS += -lclblast -framework OpenCL @@ -166,23 +172,38 @@ ifdef LLAMA_CLBLAST LDFLAGS += -lclblast -lOpenCL endif OBJS += ggml-opencl.o + ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h $(CXX) $(CXXFLAGS) -c $< -o $@ -endif +endif # LLAMA_CLBLAST + +ifdef LLAMA_METAL + CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG + CXXFLAGS += -DGGML_USE_METAL + LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders + OBJS += ggml-metal.o + +ggml-metal.o: ggml-metal.m ggml-metal.h + $(CC) $(CFLAGS) -c $< -o $@ +endif # LLAMA_METAL + ifneq ($(filter aarch64%,$(UNAME_M)),) # Apple M1, M2, etc. 
# Raspberry Pi 3, 4, Zero 2 (64-bit) CFLAGS += -mcpu=native CXXFLAGS += -mcpu=native endif + ifneq ($(filter armv6%,$(UNAME_M)),) # Raspberry Pi 1, Zero CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access endif + ifneq ($(filter armv7%,$(UNAME_M)),) # Raspberry Pi 2 CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations endif + ifneq ($(filter armv8%,$(UNAME_M)),) # Raspberry Pi 3, 4, Zero 2 (32-bit) CFLAGS += -mfp16-format=ieee -mno-unaligned-access diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e23bf1cb3d9f2..3deff4077f80e 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -38,7 +38,7 @@ else() add_subdirectory(benchmark) add_subdirectory(baby-llama) if (LLAMA_METAL) - add_subdirectory(mtl) + add_subdirectory(metal) endif() if (LLAMA_BUILD_SERVER) add_subdirectory(server) diff --git a/examples/metal/CMakeLists.txt b/examples/metal/CMakeLists.txt new file mode 100644 index 0000000000000..a8c4284a53642 --- /dev/null +++ b/examples/metal/CMakeLists.txt @@ -0,0 +1,3 @@ +set(TEST_TARGET metal) +add_executable(${TEST_TARGET} metal.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) diff --git a/examples/mtl/mtl.cpp b/examples/metal/metal.cpp similarity index 77% rename from examples/mtl/mtl.cpp rename to examples/metal/metal.cpp index 56510904c32df..fc1db90a12d80 100644 --- a/examples/mtl/mtl.cpp +++ b/examples/metal/metal.cpp @@ -1,5 +1,5 @@ #include "ggml.h" -#include "ggml-mtl.h" +#include "ggml-metal.h" #include #include @@ -23,20 +23,20 @@ int main(int argc, char ** argv) { gf.n_threads = 1; // this allocates all Metal resources and memory buffers - auto * ctx_mtl = ggml_mtl_init(); + auto * ctx_metal = ggml_metal_init(); - ggml_mtl_add_buffer(ctx_mtl, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data)); - ggml_mtl_add_buffer(ctx_mtl, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval)); + ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data)); + ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval)); // main { struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd"); *(int32_t *) input->data = 1; // BOS - ggml_mtl_set_tensor(ctx_mtl, input); + ggml_metal_set_tensor(ctx_metal, input); // warmup - ggml_mtl_graph_compute(ctx_mtl, &gf); + ggml_metal_graph_compute(ctx_metal, &gf); const int n_iter = 16; @@ -44,7 +44,7 @@ int main(int argc, char ** argv) { // the actual inference happens here for (int i = 0; i < n_iter; ++i) { - ggml_mtl_graph_compute(ctx_mtl, &gf); + ggml_metal_graph_compute(ctx_metal, &gf); } const int64_t t1 = ggml_time_us(); @@ -55,7 +55,7 @@ int main(int argc, char ** argv) { // debug output { struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1]; - ggml_mtl_get_tensor(ctx_mtl, logits); + ggml_metal_get_tensor(ctx_metal, logits); float * ptr = (float *) ggml_get_data(logits); @@ -77,7 +77,7 @@ int main(int argc, char ** argv) { printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax); } - ggml_mtl_free(ctx_mtl); + ggml_metal_free(ctx_metal); ggml_free(ctx_data); ggml_free(ctx_eval); diff --git a/examples/mtl/CMakeLists.txt b/examples/mtl/CMakeLists.txt deleted file mode 100644 index 0fe3a7197faf8..0000000000000 --- a/examples/mtl/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -if (APPLE) - set(TEST_TARGET mtl) - add_executable(${TEST_TARGET} mtl.cpp) - target_link_libraries(${TEST_TARGET} PRIVATE ggml) -endif() - diff --git 
a/ggml-metal.h b/ggml-metal.h new file mode 100644 index 0000000000000..a9441a9d46eac --- /dev/null +++ b/ggml-metal.h @@ -0,0 +1,63 @@ +// An interface allowing to compute ggml_cgraph with Metal +// +// This is a fully functional interface that extends ggml with GPU support for Apple devices. +// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.) +// +// How it works? +// +// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this +// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you +// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.) +// +// You only need to make sure that all memory buffers that you used during the graph creation +// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is +// used during the graph evaluation to determine the arguments of the compute kernels. +// +// Synchronization between device and host memory (for example for input and output tensors) +// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions. +// + +#pragma once + +#include +#include + +// max memory buffers that can be mapped to the device +#define GGML_METAL_MAX_BUFFERS 16 + +struct ggml_tensor; +struct ggml_cgraph; + +#ifdef __cplusplus +extern "C" { +#endif + +struct ggml_metal_context; + +struct ggml_metal_context * ggml_metal_init(void); +void ggml_metal_free(struct ggml_metal_context * ctx); + +// creates a mapping between a host memory buffer and a device memory buffer +// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute +// - the mapping is used during computation to determine the arguments of the compute kernels +// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal +// +bool ggml_metal_add_buffer( + struct ggml_metal_context * ctx, + const char * name, + void * data, + size_t size); + +// set data from host memory into the device +void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); + +// get data from the device into host memory +void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); + +// same as ggml_graph_compute but uses Metal +void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); + +#ifdef __cplusplus +} +#endif + diff --git a/ggml-mtl.m b/ggml-metal.m similarity index 70% rename from ggml-mtl.m rename to ggml-metal.m index 8f831afe7f13a..3cb423a01f550 100644 --- a/ggml-mtl.m +++ b/ggml-metal.m @@ -1,28 +1,30 @@ -#import "ggml-mtl.h" +#import "ggml-metal.h" #import "ggml.h" #import + #import #import #ifdef GGML_METAL_NDEBUG -#define mtl_printf(...) +#define metal_printf(...) #else -#define mtl_printf(...) fprintf(stderr, __VA_ARGS__) +#define metal_printf(...) fprintf(stderr, __VA_ARGS__) #endif -//#define mtl_printf(...) 
-struct ggml_mtl_buffer { +#define UNUSED(x) (void)(x) + +struct ggml_metal_buffer { const char * name; void * data; size_t size; - id mtl; + id metal; }; -struct ggml_mtl_context { +struct ggml_metal_context { float * logits; id device; @@ -30,65 +32,41 @@ id library; int n_buffers; - struct ggml_mtl_buffer buffers[GGML_METAL_MAX_BUFFERS]; + struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; // custom kernels - id function_add; - id pipeline_add; - - id function_mul; - id pipeline_mul; - - // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast - id function_mul_row; - id pipeline_mul_row; - - id function_scale; - id pipeline_scale; - - id function_silu; - id pipeline_silu; - - id function_relu; - id pipeline_relu; - - id function_soft_max; - id pipeline_soft_max; - - id function_diag_mask_inf; - id pipeline_diag_mask_inf; - - id function_get_rows_q4_0; - id pipeline_get_rows_q4_0; - - id function_rms_norm; - id pipeline_rms_norm; - - id function_mul_mat_q4_0_f32; - id pipeline_mul_mat_q4_0_f32; - - id function_mul_mat_f16_f32; - id pipeline_mul_mat_f16_f32; - - id function_rope; - id pipeline_rope; - - id function_cpy_f32_f16; - id pipeline_cpy_f32_f16; - - id function_cpy_f32_f32; - id pipeline_cpy_f32_f32; +#define GGML_METAL_DECL_KERNEL(name) \ + id function_##name; \ + id pipeline_##name + + GGML_METAL_DECL_KERNEL(add); + GGML_METAL_DECL_KERNEL(mul); + GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast + GGML_METAL_DECL_KERNEL(scale); + GGML_METAL_DECL_KERNEL(silu); + GGML_METAL_DECL_KERNEL(relu); + GGML_METAL_DECL_KERNEL(soft_max); + GGML_METAL_DECL_KERNEL(diag_mask_inf); + GGML_METAL_DECL_KERNEL(get_rows_q4_0); + GGML_METAL_DECL_KERNEL(rms_norm); + GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); + GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); + GGML_METAL_DECL_KERNEL(rope); + GGML_METAL_DECL_KERNEL(cpy_f32_f16); + GGML_METAL_DECL_KERNEL(cpy_f32_f32); + +#undef GGML_METAL_DECL_KERNEL }; // MSL code // TODO: move the contents here when ready // for now it is easier to work in a separate file -NSString * const msl_library_source = @"see mtl.metal"; +static NSString * const msl_library_source = @"see metal.metal"; -struct ggml_mtl_context * ggml_mtl_init(void) { +struct ggml_metal_context * ggml_metal_init(void) { fprintf(stderr, "%s: allocating\n", __func__); - struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context)); + struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; @@ -113,12 +91,14 @@ } } #else - // read the source from "../examples/mtl/mtl.metal" into a string and use newLibraryWithSource + UNUSED(msl_library_source); + + // read the source from "ggml-metal.metal" into a string and use newLibraryWithSource { NSError * error = nil; - //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/mtl/mtl" ofType:@"metal"]; - NSString * path = [[NSBundle mainBundle] pathForResource:@"ggml-mtl" ofType:@"metal"]; + //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; + NSString * path = [[NSBundle mainBundle] pathForResource:@"ggml-metal" ofType:@"metal"]; fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]); NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; @@ -137,80 +117,44 @@ // load kernels { - MTLFunctionConstantValues * 
constants = [MTLFunctionConstantValues new]; - - ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"]; - ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil]; - fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, (void *) ctx->pipeline_add); - - ctx->function_mul = [ctx->library newFunctionWithName:@"kernel_mul"]; - ctx->pipeline_mul = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul error:nil]; - fprintf(stderr, "%s: loaded kernel_mul: %p\n", __func__, (void *) ctx->pipeline_mul); - - ctx->function_mul_row = [ctx->library newFunctionWithName:@"kernel_mul_row"]; - ctx->pipeline_mul_row = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul_row error:nil]; - fprintf(stderr, "%s: loaded kernel_mul_row: %p\n", __func__, (void *) ctx->pipeline_mul_row); - - ctx->function_scale = [ctx->library newFunctionWithName:@"kernel_scale"]; - ctx->pipeline_scale = [ctx->device newComputePipelineStateWithFunction:ctx->function_scale error:nil]; - fprintf(stderr, "%s: loaded kernel_scale: %p\n", __func__, (void *) ctx->pipeline_scale); - - ctx->function_silu = [ctx->library newFunctionWithName:@"kernel_silu"]; - ctx->pipeline_silu = [ctx->device newComputePipelineStateWithFunction:ctx->function_silu error:nil]; - fprintf(stderr, "%s: loaded kernel_silu: %p\n", __func__, (void *) ctx->pipeline_silu); - - ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; - ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; - fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, (void *) ctx->pipeline_relu); - - ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil]; - ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil]; - fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, (void *) ctx->pipeline_soft_max); - - ctx->function_diag_mask_inf = [ctx->library newFunctionWithName:@"kernel_diag_mask_inf" constantValues:constants error:nil]; - ctx->pipeline_diag_mask_inf = [ctx->device newComputePipelineStateWithFunction:ctx->function_diag_mask_inf error:nil]; - fprintf(stderr, "%s: loaded kernel_diag_mask_inf: %p\n", __func__, (void *) ctx->pipeline_diag_mask_inf); - - ctx->function_get_rows_q4_0 = [ctx->library newFunctionWithName:@"kernel_get_rows_q4_0"]; - ctx->pipeline_get_rows_q4_0 = [ctx->device newComputePipelineStateWithFunction:ctx->function_get_rows_q4_0 error:nil]; - fprintf(stderr, "%s: loaded kernel_get_rows_q4_0: %p\n", __func__, (void *) ctx->pipeline_get_rows_q4_0); - - ctx->function_rms_norm = [ctx->library newFunctionWithName:@"kernel_rms_norm"]; - ctx->pipeline_rms_norm = [ctx->device newComputePipelineStateWithFunction:ctx->function_rms_norm error:nil]; - fprintf(stderr, "%s: loaded kernel_rms_norm: %p\n", __func__, (void *) ctx->pipeline_rms_norm); - - ctx->function_mul_mat_q4_0_f32 = [ctx->library newFunctionWithName:@"kernel_mul_mat_q4_0_f32"]; - ctx->pipeline_mul_mat_q4_0_f32 = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul_mat_q4_0_f32 error:nil]; - fprintf(stderr, "%s: loaded kernel_mul_mat_q4_0_f32: %p\n", __func__, (void *) ctx->pipeline_mul_mat_q4_0_f32); - - ctx->function_mul_mat_f16_f32 = [ctx->library newFunctionWithName:@"kernel_mul_mat_f16_f32"]; - ctx->pipeline_mul_mat_f16_f32 = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul_mat_f16_f32 error:nil]; - fprintf(stderr, 
"%s: loaded kernel_mul_mat_f16_f32: %p\n", __func__, (void *) ctx->pipeline_mul_mat_f16_f32); - - ctx->function_rope = [ctx->library newFunctionWithName:@"kernel_rope"]; - ctx->pipeline_rope = [ctx->device newComputePipelineStateWithFunction:ctx->function_rope error:nil]; - fprintf(stderr, "%s: loaded kernel_rope: %p\n", __func__, (void *) ctx->pipeline_rope); - - ctx->function_cpy_f32_f16 = [ctx->library newFunctionWithName:@"kernel_cpy_f32_f16"]; - ctx->pipeline_cpy_f32_f16 = [ctx->device newComputePipelineStateWithFunction:ctx->function_cpy_f32_f16 error:nil]; - fprintf(stderr, "%s: loaded kernel_cpy_f32_f16: %p\n", __func__, (void *) ctx->pipeline_cpy_f32_f16); - - ctx->function_cpy_f32_f32 = [ctx->library newFunctionWithName:@"kernel_cpy_f32_f32"]; - ctx->pipeline_cpy_f32_f32 = [ctx->device newComputePipelineStateWithFunction:ctx->function_cpy_f32_f32 error:nil]; - fprintf(stderr, "%s: loaded kernel_cpy_f32_f32: %p\n", __func__, (void *) ctx->pipeline_cpy_f32_f32); +#define GGML_METAL_ADD_KERNEL(name) \ + ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ + ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \ + fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); + + GGML_METAL_ADD_KERNEL(add); + GGML_METAL_ADD_KERNEL(mul); + GGML_METAL_ADD_KERNEL(mul_row); + GGML_METAL_ADD_KERNEL(scale); + GGML_METAL_ADD_KERNEL(silu); + GGML_METAL_ADD_KERNEL(relu); + GGML_METAL_ADD_KERNEL(soft_max); + GGML_METAL_ADD_KERNEL(diag_mask_inf); + GGML_METAL_ADD_KERNEL(get_rows_q4_0); + GGML_METAL_ADD_KERNEL(rms_norm); + GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); + GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); + GGML_METAL_ADD_KERNEL(rope); + GGML_METAL_ADD_KERNEL(cpy_f32_f16); + GGML_METAL_ADD_KERNEL(cpy_f32_f32); + +#undef GGML_METAL_ADD_KERNEL } return ctx; } -void ggml_mtl_free(struct ggml_mtl_context * ctx) { +void ggml_metal_free(struct ggml_metal_context * ctx) { fprintf(stderr, "%s: deallocating\n", __func__); free(ctx); } -// get data / eval buffer + offset -static id ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) { +// finds the Metal buffer that contains the tensor data on the GPU device +// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the +// Metal buffer based on the host memory pointer +// +static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); for (int i = 0; i < ctx->n_buffers; ++i) { @@ -221,64 +165,75 @@ void ggml_mtl_free(struct ggml_mtl_context * ctx) { //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); - return ctx->buffers[i].mtl; + return ctx->buffers[i].metal; } } fprintf(stderr, "%s: error: buffer is nil\n", __func__); - GGML_ASSERT(false); return nil; } -void ggml_mtl_add_buffer( - struct ggml_mtl_context * ctx, +bool ggml_metal_add_buffer( + struct ggml_metal_context * ctx, const char * name, void * data, size_t size) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { fprintf(stderr, "%s: too many buffers\n", __func__); - return; + return false; } if (data) { + // verify that the buffer does not overlap with any of the existing buffers + for (int i = 0; i < ctx->n_buffers; ++i) { + const int64_t ioffs = 
(int64_t) data - (int64_t) ctx->buffers[i].data; + + if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { + fprintf(stderr, "%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); + return false; + } + } + ctx->buffers[ctx->n_buffers].name = name; ctx->buffers[ctx->n_buffers].data = data; ctx->buffers[ctx->n_buffers].size = size; - ctx->buffers[ctx->n_buffers].mtl = [ctx->device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared]; + ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared]; ++ctx->n_buffers; - fprintf(stderr, "%s: allocated '%16s' buffer, size = %8.2f MB\n", __func__, name, size / 1024.0 / 1024.0); + fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, size / 1024.0 / 1024.0); } + + return true; } -void ggml_mtl_set_tensor( - struct ggml_mtl_context * ctx, +void ggml_metal_set_tensor( + struct ggml_metal_context * ctx, struct ggml_tensor * t) { - mtl_printf("%s: set input for tensor '%s'\n", __func__, t->name); + metal_printf("%s: set input for tensor '%s'\n", __func__, t->name); size_t offs; - id id_dst = ggml_mtl_get_buffer(ctx, t, &offs); + id id_dst = ggml_metal_get_buffer(ctx, t, &offs); memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t)); } -void ggml_mtl_get_tensor( - struct ggml_mtl_context * ctx, +void ggml_metal_get_tensor( + struct ggml_metal_context * ctx, struct ggml_tensor * t) { - mtl_printf("%s: extract results for tensor '%s'\n", __func__, t->name); + metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name); size_t offs; - id id_src = ggml_mtl_get_buffer(ctx, t, &offs); + id id_src = ggml_metal_get_buffer(ctx, t, &offs); memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t)); } -int ggml_mtl_graph_compute( - struct ggml_mtl_context * ctx, +void ggml_metal_graph_compute( + struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { - mtl_printf("%s: evaluating graph\n", __func__); + metal_printf("%s: evaluating graph\n", __func__); size_t offs_src0 = 0; size_t offs_src1 = 0; @@ -288,7 +243,7 @@ int ggml_mtl_graph_compute( id encoder = nil; for (int i = 0; i < gf->n_nodes; ++i) { - //mtl_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); struct ggml_tensor * src0 = gf->nodes[i]->src0; struct ggml_tensor * src1 = gf->nodes[i]->src1; @@ -307,12 +262,12 @@ int ggml_mtl_graph_compute( const int64_t ne10 = src1 ? src1->ne[0] : 0; const int64_t ne11 = src1 ? src1->ne[1] : 0; const int64_t ne12 = src1 ? src1->ne[2] : 0; - //const int64_t ne13 = src1 ? src1->ne[3] : 0; + const int64_t ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); const uint64_t nb10 = src1 ? src1->nb[0] : 0; const uint64_t nb11 = src1 ? src1->nb[1] : 0; const uint64_t nb12 = src1 ? src1->nb[2] : 0; - //const uint64_t nb13 = src1 ? src1->nb[3] : 0; + const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13); const int64_t ne0 = dst ? dst->ne[0] : 0; const int64_t ne1 = dst ? dst->ne[1] : 0; @@ -328,21 +283,21 @@ int ggml_mtl_graph_compute( const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; - id id_src0 = src0 ? ggml_mtl_get_buffer(ctx, src0, &offs_src0) : nil; - id id_src1 = src1 ? ggml_mtl_get_buffer(ctx, src1, &offs_src1) : nil; - id id_dst = dst ? 
ggml_mtl_get_buffer(ctx, dst, &offs_dst) : nil; + id id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil; + id id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; + id id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; - //mtl_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op)); + //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op)); //if (src0) { - // mtl_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + // metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, // ggml_is_contiguous(src0), src0->name); //} //if (src1) { - // mtl_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + // metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, // ggml_is_contiguous(src1), src1->name); //} //if (dst) { - // mtl_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + // metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, // dst->name); //} @@ -472,6 +427,8 @@ int ggml_mtl_graph_compute( } break; case GGML_OP_MUL_MAT: { + // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224 + GGML_ASSERT(ne00 == ne10); GGML_ASSERT(ne02 == ne12); @@ -503,6 +460,7 @@ int ggml_mtl_graph_compute( // we need to do ne02 multiplications // TODO: is there a way to do this in parallel - currently very slow .. + // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS for (int64_t i02 = 0; i02 < ne02; ++i02) { size_t offs_src0_cur = offs_src0 + i02*nb02; size_t offs_src1_cur = offs_src1 + i02*nb12; @@ -578,10 +536,7 @@ int ggml_mtl_graph_compute( switch (src0->type) { case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; - default: { - // not implemented - fprintf(stderr, "%s: node %3d, op = %8s, type = %8s not implemented\n", __func__, i, ggml_op_name(dst->op), ggml_type_name(src0->type)); - } + default: GGML_ASSERT(false && "not implemented"); } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -626,10 +581,6 @@ int ggml_mtl_graph_compute( const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; - //mtl_printf("rope: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); - //mtl_printf("rope: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); - //mtl_printf("rope: n_past = %d, n_dims = %d, mode = %d\n", n_past, n_dims, mode); - const int n_past = ((int32_t *)(src1->data))[0]; [encoder setComputePipelineState:ctx->pipeline_rope]; @@ -665,12 +616,6 @@ int ggml_mtl_graph_compute( const int nth = 32; - //mtl_printf("cpy: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03); - //mtl_printf("cpy: %lld x %lld x %lld x %lld\n", nb00, nb01, nb02, nb03); - //mtl_printf("cpy: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3); - //mtl_printf("cpy: %lld x %lld x %lld x %lld\n", nb0, nb1, nb2, nb3); - //mtl_printf("cpy: %s -> %s\n", ggml_type_name(src0t), ggml_type_name(dstt)); - switch (src0t) { case GGML_TYPE_F32: { @@ -707,7 +652,6 @@ int ggml_mtl_graph_compute( default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); - return -1; } } @@ -721,8 +665,8 @@ int ggml_mtl_graph_compute( { const double time_elapsed = 
[command_buffer GPUEndTime] - [command_buffer GPUStartTime]; - mtl_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0); - } + UNUSED(time_elapsed); - return 0; + metal_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0); + } } diff --git a/ggml-mtl.metal b/ggml-metal.metal similarity index 97% rename from ggml-mtl.metal rename to ggml-metal.metal index 53f7f7448b14f..4bedc8ea45e57 100644 --- a/ggml-mtl.metal +++ b/ggml-metal.metal @@ -98,19 +98,6 @@ kernel void kernel_soft_max( device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; - //float max = 0.0f; - //for (int i = 0; i < ne00; i++) { - // max = MAX(max, psrc0[i]); - //} - //float sum = 0.0f; - //for (int i = 0; i < ne00; i++) { - // pdst[i] = exp(psrc0[i] - max); - // sum += pdst[i]; - //} - //for (int i = 0; i < ne00; i++) { - // pdst[i] /= sum; - //} - // parallel max buf[tpitg[0]] = -INFINITY; for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { diff --git a/ggml-mtl.h b/ggml-mtl.h deleted file mode 100644 index cab71e3862290..0000000000000 --- a/ggml-mtl.h +++ /dev/null @@ -1,44 +0,0 @@ -#pragma once - -#include - -#define GGML_METAL_MAX_BUFFERS 16 - -struct ggml_tensor; -struct ggml_cgraph; - -#ifdef __cplusplus -extern "C" { -#endif - -struct ggml_mtl_context; - -struct ggml_mtl_context * ggml_mtl_init(void); - -void ggml_mtl_free(struct ggml_mtl_context * ctx); - -void ggml_mtl_add_buffer( - struct ggml_mtl_context * ctx, - const char * name, - void * data, - size_t size); - -// set data from host memory into the device -void ggml_mtl_set_tensor( - struct ggml_mtl_context * ctx, - struct ggml_tensor * t); - -// get data from the device into host memory -void ggml_mtl_get_tensor( - struct ggml_mtl_context * ctx, - struct ggml_tensor * t); - -// return 0 on success -int ggml_mtl_graph_compute( - struct ggml_mtl_context * ctx, - struct ggml_cgraph * gf); - -#ifdef __cplusplus -} -#endif - diff --git a/llama.cpp b/llama.cpp index 455402a4e0b7a..4b22b215a5dc8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -17,7 +17,7 @@ #endif #ifdef GGML_USE_METAL -#include "ggml-mtl.h" +#include "ggml-metal.h" #endif #include @@ -243,7 +243,7 @@ struct llama_context { llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; #ifdef GGML_USE_METAL - ggml_mtl_context * mtl_ctx = NULL; + ggml_metal_context * ctx_metal = NULL; #endif int buf_last = 0; @@ -1256,8 +1256,8 @@ static bool llama_eval_internal( memcpy(embd->data, tokens, N*ggml_element_size(embd)); #ifdef GGML_USE_METAL - if (lctx.mtl_ctx) { - ggml_mtl_set_tensor(lctx.mtl_ctx, embd); + if (lctx.ctx_metal) { + ggml_metal_set_tensor(lctx.ctx_metal, embd); } #endif @@ -1448,11 +1448,25 @@ static bool llama_eval_internal( ggml_build_forward_expand(&gf, cur); #ifdef GGML_USE_METAL - if (lctx.mtl_ctx) { - ggml_mtl_graph_compute(lctx.mtl_ctx, &gf); - ggml_mtl_get_tensor(lctx.mtl_ctx, cur); + if (lctx.ctx_metal && N == 1) { + ggml_metal_graph_compute(lctx.ctx_metal, &gf); + ggml_metal_get_tensor (lctx.ctx_metal, cur); } else { + // IMPORTANT: + // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla + // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX + // coprocessor. + // + // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch. + // But for now, we have focused only on Matrix x Vector Metal multiplication. 
+ // ggml_graph_compute(ctx0, &gf); + + if (lctx.ctx_metal) { + // We need to sync the CPU KV cache with the GPU KV cache + ggml_metal_set_tensor(lctx.ctx_metal, kv_self.k); + ggml_metal_set_tensor(lctx.ctx_metal, kv_self.v); + } } #else ggml_graph_compute(ctx0, &gf); @@ -2318,12 +2332,7 @@ struct llama_context * llama_init_from_file( ctx->embedding.resize(hparams.n_embd); } -#ifdef GGML_USE_METAL - // when using Metal, we don't need the extra buffer for intermediate dequantization - ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)/100); -#else ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); -#endif ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)); ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); @@ -2333,19 +2342,19 @@ struct llama_context * llama_init_from_file( if (params.n_gpu_layers > 0) { // this allocates all Metal resources and memory buffers if (params.use_mmap) { - ctx->mtl_ctx = ggml_mtl_init(); - ggml_mtl_add_buffer(ctx->mtl_ctx, "data", ctx->model.mapping->addr, ctx->model.mapping->size); - ggml_mtl_add_buffer(ctx->mtl_ctx, "eval", ctx->buf_compute.addr, ctx->buf_compute.size); - ggml_mtl_add_buffer(ctx->mtl_ctx, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size); - ggml_mtl_add_buffer(ctx->mtl_ctx, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size); - ggml_mtl_add_buffer(ctx->mtl_ctx, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size); + ctx->ctx_metal = ggml_metal_init(); + ggml_metal_add_buffer(ctx->ctx_metal, "data", ctx->model.mapping->addr, ctx->model.mapping->size); + ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size); + ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size); + ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size); + ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size); } else { - ctx->mtl_ctx = ggml_mtl_init(); - ggml_mtl_add_buffer(ctx->mtl_ctx, "data", ggml_get_mem_buffer(ctx->model.ctx), ggml_get_mem_size(ctx->model.ctx)); - ggml_mtl_add_buffer(ctx->mtl_ctx, "eval", ctx->buf_compute.addr, ctx->buf_compute.size); - ggml_mtl_add_buffer(ctx->mtl_ctx, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size); - ggml_mtl_add_buffer(ctx->mtl_ctx, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size); - ggml_mtl_add_buffer(ctx->mtl_ctx, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size); + ctx->ctx_metal = ggml_metal_init(); + ggml_metal_add_buffer(ctx->ctx_metal, "data", ggml_get_mem_buffer(ctx->model.ctx), ggml_get_mem_size(ctx->model.ctx)); + ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size); + ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size); + ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size); + ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size); } } #endif From d8a7486d179a4b673a4f3074dd5ea47395e3f2b1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 4 Jun 2023 17:58:23 +0300 Subject: [PATCH 45/49] Revert "ci : disable temporary" This reverts commit 98c267fc77fe811082f672538fc91bcfc9072d63. 
--- .github/workflows/editorconfig.yml | 17 +++++++++++++++++ .github/workflows/tidy-post.yml | 20 ++++++++++++++++++++ .github/workflows/tidy-review.yml | 23 +++++++++++++++++++++++ 3 files changed, 60 insertions(+) create mode 100644 .github/workflows/editorconfig.yml create mode 100644 .github/workflows/tidy-post.yml create mode 100644 .github/workflows/tidy-review.yml diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml new file mode 100644 index 0000000000000..b4e535acf1f64 --- /dev/null +++ b/.github/workflows/editorconfig.yml @@ -0,0 +1,17 @@ +name: EditorConfig Checker + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + editorconfig: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: editorconfig-checker/action-editorconfig-checker@main + - run: editorconfig-checker diff --git a/.github/workflows/tidy-post.yml b/.github/workflows/tidy-post.yml new file mode 100644 index 0000000000000..a58da0cd6493d --- /dev/null +++ b/.github/workflows/tidy-post.yml @@ -0,0 +1,20 @@ +name: clang-tidy review post comments + +on: + workflow_run: + workflows: ["clang-tidy-review"] + types: + - completed + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: ZedThree/clang-tidy-review/post@v0.13.0 + # lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup + with: + # adjust options as necessary + lgtm_comment_body: '' + annotations: false + max_comments: 25 diff --git a/.github/workflows/tidy-review.yml b/.github/workflows/tidy-review.yml new file mode 100644 index 0000000000000..a4bc8d976560e --- /dev/null +++ b/.github/workflows/tidy-review.yml @@ -0,0 +1,23 @@ +name: clang-tidy-review + +on: + pull_request: + branches: + - master + +jobs: + clang-tidy-review: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - uses: ZedThree/clang-tidy-review@v0.13.0 + id: review + with: + lgtm_comment_body: '' + build_dir: build + cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on + split_workflow: true + + - uses: ZedThree/clang-tidy-review/upload@v0.13.0 From b252acbcb6697c9321351c6ac86f991dc80d0747 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 4 Jun 2023 18:10:28 +0300 Subject: [PATCH 46/49] metal : add comments --- examples/metal/metal.cpp | 15 +++++++++++++++ ggml.c | 1 - llama.cpp | 4 ---- llama.h | 6 ++++-- 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp index fc1db90a12d80..10b35faf8ca50 100644 --- a/examples/metal/metal.cpp +++ b/examples/metal/metal.cpp @@ -1,3 +1,18 @@ +// Evaluate a statically export ggml computation graph with Metal +// +// - First, export a LLaMA graph: +// +// $ ./bin/main -m ../models/7B/ggml-model-q4_0.bin --export +// +// - Run this tool to evaluate the exported graph: +// +// $ ./bin/metal llama.ggml +// +// The purpose of this tool is mostly for debugging and demonstration purposes. +// The main limitation of exporting computation graphs is that their sizes are static which often +// can be a problem for real-world applications. 
+// + #include "ggml.h" #include "ggml-metal.h" diff --git a/ggml.c b/ggml.c index b5e6997dd2d71..27a9de2beefe5 100644 --- a/ggml.c +++ b/ggml.c @@ -14869,7 +14869,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** // read file into data { FILE * fin = fopen(fname, "rb"); - if (!fin) { fprintf(stderr, "%s: failed to open %s\n", __func__, fname); return result; diff --git a/llama.cpp b/llama.cpp index 4b22b215a5dc8..471b996aa090b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2990,10 +2990,6 @@ int llama_eval( } int llama_eval_export(struct llama_context * ctx, const char * fname) { - // these values determine the maximum inference sizes of the exported computation graph - // TODO: need to increase buffers to support the full context - //const int n_ctx = ctx->model.hparams.n_ctx; - //const int n_batch = 512; const int n_batch = 1; const int n_ctx = 512 - n_batch; diff --git a/llama.h b/llama.h index a650ddf4501e8..87fa9736784c8 100644 --- a/llama.h +++ b/llama.h @@ -173,8 +173,10 @@ extern "C" { int n_past, int n_threads); - // Export a computation graph for model inference - // TODO: very likely to change + // Export a static computation graph for context of 511 and batch size of 1 + // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these + // parameters here to keep things simple + // IMPORTANT: do not use for anything else other than debugging and testing! LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname); // Convert the provided text into tokens. From db3db9e7749c4b7681c96272c87fdbf6b1e235e7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 4 Jun 2023 18:19:08 +0300 Subject: [PATCH 47/49] metal : clean-up stuff, fix typos --- examples/metal/metal.cpp | 2 +- ggml.c | 4 +--- llama.cpp | 37 ++++++++++++------------------------- 3 files changed, 14 insertions(+), 29 deletions(-) diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp index 10b35faf8ca50..77aca94a3ec97 100644 --- a/examples/metal/metal.cpp +++ b/examples/metal/metal.cpp @@ -1,4 +1,4 @@ -// Evaluate a statically export ggml computation graph with Metal +// Evaluate a statically exported ggml computation graph with Metal // // - First, export a LLaMA graph: // diff --git a/ggml.c b/ggml.c index 27a9de2beefe5..42e8626dfefa0 100644 --- a/ggml.c +++ b/ggml.c @@ -15049,7 +15049,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** // create the tensor // "view" operations are handled differently - // TODO: handle inplac ops - currentl a copy is always made + // TODO: handle inplace ops - currently a copy is always made struct ggml_tensor * tensor = NULL; @@ -15084,10 +15084,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** } break; } - memcpy(tensor->name, ptr_name, GGML_MAX_NAME); - // TODO: double-check this is needed for (int j = 0; j < GGML_MAX_DIMS; ++j) { tensor->nb[j] = nb[j]; } diff --git a/llama.cpp b/llama.cpp index 471b996aa090b..9efc60cdbd309 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1201,7 +1201,7 @@ static bool llama_model_load( // - tokens: new batch of tokens to process // - n_past: the context size so far // - n_threads: number of threads to use -// - cgraph_fname: filename of the exported computation graph (TODO: TMP!!!) 
+// - cgraph_fname: filename of the exported computation graph // static bool llama_eval_internal( llama_context & lctx, @@ -1256,7 +1256,7 @@ static bool llama_eval_internal( memcpy(embd->data, tokens, N*ggml_element_size(embd)); #ifdef GGML_USE_METAL - if (lctx.ctx_metal) { + if (lctx.ctx_metal && N == 1) { ggml_metal_set_tensor(lctx.ctx_metal, embd); } #endif @@ -1279,18 +1279,10 @@ static bool llama_eval_internal( // self-attention { - //auto * x = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - //struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, x, n_embd/n_head, n_head, N), n_past, n_rot, 0); - // compute Q and K and RoPE them - struct ggml_tensor * Qpre = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N); - struct ggml_tensor * Kpre = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N); - ggml_set_name(Qpre, "Qpre"); - ggml_set_name(Kpre, "Kpre"); - - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, Qpre, n_past, n_rot, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, Kpre, n_past, n_rot, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); ggml_set_name(Qcur, "Qcur"); ggml_set_name(Kcur, "Kcur"); @@ -1305,9 +1297,6 @@ static bool llama_eval_internal( ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - ggml_set_name(k, "k"); - ggml_set_name(v, "v"); - // important: storing RoPE-ed version of K in the KV cache! 
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); @@ -2341,21 +2330,19 @@ struct llama_context * llama_init_from_file( #ifdef GGML_USE_METAL if (params.n_gpu_layers > 0) { // this allocates all Metal resources and memory buffers + ctx->ctx_metal = ggml_metal_init(); + if (params.use_mmap) { - ctx->ctx_metal = ggml_metal_init(); - ggml_metal_add_buffer(ctx->ctx_metal, "data", ctx->model.mapping->addr, ctx->model.mapping->size); - ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size); - ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size); - ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size); - ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size); + ggml_metal_add_buffer(ctx->ctx_metal, "data", ctx->model.mapping->addr, ctx->model.mapping->size); + ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size); } else { - ctx->ctx_metal = ggml_metal_init(); ggml_metal_add_buffer(ctx->ctx_metal, "data", ggml_get_mem_buffer(ctx->model.ctx), ggml_get_mem_size(ctx->model.ctx)); ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size); - ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size); - ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size); - ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size); } + + ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size); + ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size); + ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size); } #endif From e33002d42ef97f09d70296a504f122a5ee629e6c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 4 Jun 2023 18:48:35 +0300 Subject: [PATCH 48/49] readme : add Metal instructions --- README.md | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 00571d8e168d9..4628e0cb980fa 100644 --- a/README.md +++ b/README.md @@ -51,11 +51,10 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quantization on a MacBook - Plain C/C++ implementation without dependencies -- Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework +- Apple silicon first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks - AVX, AVX2 and AVX512 support for x86 architectures - Mixed F16 / F32 precision - 4-bit, 5-bit and 8-bit integer quantization support -- Runs on the CPU - Supports OpenBLAS/Apple BLAS/ARM Performance Lib/ATLAS/BLIS/Intel MKL/NVHPC/ACML/SCSL/SGIMATH and [more](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) in BLAS - cuBLAS and CLBlast support @@ -236,6 +235,28 @@ In order to build llama.cpp you have three different options. zig build -Drelease-fast ``` +### Metal Build + +Using Metal allows the computation to be executed on the GPU for Apple devices: + +- Using `make`: + + ```bash + LLAMA_METAL=1 make + ``` + +- Using `CMake`: + + ```bash + mkdir build-metal + cd build-metal + cmake -DLLAMA_METAL=ON .. 
+ cmake --build . --config Release + ``` + +When built with Metal support, you can enable GPU inference with the `--gpu-layers|-ngl` command-line argument. +Any value larger than 0 will offload the computation to the GPU. + ### BLAS Build Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it: @@ -367,7 +388,7 @@ Building the program with BLAS support may lead to some performance improvements Running: - The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does. + The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does. To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`. The selection can be a number (starting from 0) or a text string to search: From 324e823afda77da0caf03203f793d115b3927189 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 4 Jun 2023 18:50:09 +0300 Subject: [PATCH 49/49] readme : add example for main --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4628e0cb980fa..5b71968fafac7 100644 --- a/README.md +++ b/README.md @@ -255,7 +255,11 @@ Using Metal allows the computation to be executed on the GPU for Apple devices: ``` When built with Metal support, you can enable GPU inference with the `--gpu-layers|-ngl` command-line argument. -Any value larger than 0 will offload the computation to the GPU. +Any value larger than 0 will offload the computation to the GPU. For example: + +```bash +./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1 +``` ### BLAS Build
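
The series above leaves the tree with two complementary entry points: `llama_eval_export()` writes a static computation graph to disk, and the `ggml-metal` interface evaluates such a graph on the GPU. The sketch below ties the two together in one small host program. It is a condensed variant of `examples/metal/metal.cpp` above and uses only functions declared in `ggml.h` and `ggml-metal.h`; the argument handling, the graph file name, and the hard-coded BOS token id (1) are illustrative assumptions rather than fixed parts of the API.

```cpp
// Minimal sketch: evaluate a statically exported ggml graph on the GPU with the
// ggml-metal interface from this series (see examples/metal/metal.cpp).
// Assumes the graph was produced beforehand, e.g. with: ./bin/main -m <model> --export
#include "ggml.h"
#include "ggml-metal.h"

#include <cstdio>
#include <cstdint>

int main(int argc, char ** argv) {
    ggml_time_init();

    if (argc != 2) {
        fprintf(stderr, "usage: %s llama.ggml\n", argv[0]);
        return 1;
    }

    const char * fname_cgraph = argv[1];

    // import the exported graph; ctx_data holds the weights, ctx_eval the intermediate tensors
    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;

    struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);

    // map the host buffers to Metal buffers - required before calling ggml_metal_graph_compute
    struct ggml_metal_context * ctx_metal = ggml_metal_init();

    ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
    ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));

    // set the input token and upload it to the device
    struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
    *(int32_t *) input->data = 1; // BOS - illustrative, mirrors the example in this series

    ggml_metal_set_tensor(ctx_metal, input);

    // evaluate the graph on the GPU
    ggml_metal_graph_compute(ctx_metal, &gf);

    // download the logits (the last node of the graph) back to host memory
    struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
    ggml_metal_get_tensor(ctx_metal, logits);

    printf("logits[0] = %8.4f\n", ((float *) ggml_get_data(logits))[0]);

    ggml_metal_free(ctx_metal);

    ggml_free(ctx_data);
    ggml_free(ctx_eval);

    return 0;
}
```

Built as the `metal` target added in `examples/metal/CMakeLists.txt`, such a program would be run as `./bin/metal llama.ggml` after exporting the graph, matching the workflow described in the header comment of `examples/metal/metal.cpp`.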