llama : Metal inference #1642

Merged 49 commits on Jun 4, 2023
The file changes shown below are from 7 of the 49 commits.
Commits (49 total, all by ggerganov):
f85020b  mtl : export the LLaMA computation graph (May 29, 2023)
98c267f  ci : disable temporary (May 29, 2023)
b23fe8c  mtl : adapt the MNIST example as starter (May 29, 2023)
a792cbd  mtl : no need for mtl-export tool, add cli arg for main instead (May 29, 2023)
897d6d8  mtl : export just a small part of the graph for now to make it easier (May 29, 2023)
248a8c3  mtl : move MSL code into separate file for easy editing (May 29, 2023)
a8fd9dc  mtl : initial get_rows_q4_0 kernel (May 29, 2023)
794704e  mtl : confirmed get_rows_q4_0 is working correctly (May 30, 2023)
72256eb  mtl : add rms_norm kernel + confirm working (May 30, 2023)
64afc0b  mtl : add mul kernel + confirm working (May 30, 2023)
2a24994  mtl : initial mul_mat Q4 kernel (wrong results) (May 30, 2023)
96d0052  mtl : mul_mat fixes (still wrong) (May 30, 2023)
29bec00  mtl : another mul_mat Q4 (still does not work) (May 30, 2023)
b2fd06c  mtl : working mul_mat q4 (May 30, 2023)
6af6a05  ggml : fix handling of "view" ops in ggml_graph_import() (May 31, 2023)
1213af7  mtl : add rope kernel (May 31, 2023)
7ca81e9  mtl : add reshape and transpose handling (May 31, 2023)
94ea9e7  ggml : store offset as opt arg for ggml_view_xd() operators (Jun 1, 2023)
948fcfd  mtl : add cpy kernel + handle view ops (Jun 1, 2023)
51efb59  mtl : confirm f16 x f32 attention mul mat (Jun 1, 2023)
0f1c580  mtl : add scale kernel (Jun 1, 2023)
17a7036  mtl : add diag_mask_inf kernel (Jun 1, 2023)
17930fb  mtl : fix soft_max kernel (Jun 1, 2023)
f67c2d8  ggml : update ggml_nbytes() to handle non-contiguous tensors (Jun 1, 2023)
a266c26  mtl : verify V tensor contents (Jun 1, 2023)
a0cc3de  mtl : add f32 -> f32 cpy kernel (Jun 1, 2023)
42dca40  mtl : add silu kernel (Jun 1, 2023)
fbd3f62  mtl : add non-broadcast mul kernel (Jun 1, 2023)
9665429  mtl : full GPU inference of the computation graph (Jun 1, 2023)
f0196a7  mtl : optimize rms_norm and soft_max kernels (Jun 1, 2023)
e55f7b0  mtl : add f16 mat x f32 vec multiplication kernel (Jun 1, 2023)
3367146  mtl : fix bug in f16 x f32 mul mat + speed-up computation (Jun 2, 2023)
847bbfe  mtl : faster mul_mat_q4_0_f32 kernel (Jun 2, 2023)
70c3387  mtl : fix kernel signature + roll inner loop (Jun 2, 2023)
b088e14  mtl : more threads for rms_norm + better timing (Jun 2, 2023)
6276057  mtl : remove printfs from inner loop (Jun 2, 2023)
03c2d72  mtl : simplify implementation (Jun 2, 2023)
640a889  mtl : add save/load vocab to ggml file (Jun 2, 2023)
2f4e9d1  mtl : plug Metal inference into llama.cpp (very quick-n-dirty) (Jun 2, 2023)
4df2ef3  mtl : make it work with main example (Jun 3, 2023)
18e482a  mtl : preparing for merge (Jun 4, 2023)
e4b5222  mtl : clean-up ggml mtl interface + suport scratch / inplace (Jun 4, 2023)
e26cd6b  mtl : remove temp / debug code (Jun 4, 2023)
a7fb899  metal : final refactoring and simplification (Jun 4, 2023)
d8a7486  Revert "ci : disable temporary" (Jun 4, 2023)
b252acb  metal : add comments (Jun 4, 2023)
db3db9e  metal : clean-up stuff, fix typos (Jun 4, 2023)
e33002d  readme : add Metal instructions (Jun 4, 2023)
324e823  readme : add example for main (Jun 4, 2023)
17 changes: 0 additions & 17 deletions .github/workflows/editorconfig.yml

This file was deleted.

20 changes: 0 additions & 20 deletions .github/workflows/tidy-post.yml

This file was deleted.

23 changes: 0 additions & 23 deletions .github/workflows/tidy-review.yml

This file was deleted.

1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -37,6 +37,7 @@ else()
    add_subdirectory(save-load-state)
    add_subdirectory(benchmark)
    add_subdirectory(baby-llama)
    add_subdirectory(mtl)
    if(LLAMA_BUILD_SERVER)
        add_subdirectory(server)
    endif()
3 changes: 3 additions & 0 deletions examples/common.cpp
@@ -299,6 +299,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        params.use_mmap = false;
    } else if (arg == "--mtest") {
        params.mem_test = true;
    } else if (arg == "--export") {
        params.export_cgraph = true;
    } else if (arg == "--verbose-prompt") {
        params.verbose_prompt = true;
    } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -438,6 +440,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, " number of layers to store in VRAM\n");
#endif
    fprintf(stderr, " --mtest compute maximum memory usage\n");
    fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
    fprintf(stderr, " --verbose-prompt print prompt before generation\n");
    fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
    fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
1 change: 1 addition & 0 deletions examples/common.h
@@ -71,6 +71,7 @@ struct gpt_params {
    bool use_mmap = true; // use mmap for faster loads
    bool use_mlock = false; // use mlock to keep model in memory
    bool mem_test = false; // compute maximum memory usage
    bool export_cgraph = false; // export the computation graph
    bool verbose_prompt = false; // print prompt tokens before generation
};

7 changes: 7 additions & 0 deletions examples/main/main.cpp
@@ -134,6 +134,13 @@ int main(int argc, char ** argv) {
        return 0;
    }

    // export the cgraph and exit
    if (params.export_cgraph) {
        llama_eval_export(ctx, "llama.ggml");
        llama_free(ctx);

        return 0;
    }

    std::string path_session = params.path_prompt_cache;
    std::vector<llama_token> session_tokens;
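
For reference, the new flag makes it possible to dump the graph without running any generation. A minimal standalone driver that does the same thing might look like the sketch below (the model path is a placeholder, and llama_init_from_file / llama_eval_export are assumed to be available as on this branch):

```cpp
#include "llama.h"

int main() {
    llama_context_params lparams = llama_context_default_params();

    // placeholder model path - any ggml model file should work here
    llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", lparams);
    if (ctx == NULL) {
        return 1;
    }

    // dump the computation graph to 'llama.ggml', same as `main --export`
    llama_eval_export(ctx, "llama.ggml");

    llama_free(ctx);

    return 0;
}
```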
33 changes: 33 additions & 0 deletions examples/mtl/CMakeLists.txt
@@ -0,0 +1,33 @@
if (APPLE)
    #
    # mtl

    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
    find_library(METAL_FRAMEWORK Metal REQUIRED)
    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)

    set(TEST_TARGET mtl)
    add_executable(${TEST_TARGET} mtl.cpp mtl.h mtl.m)
    target_link_libraries(${TEST_TARGET} PRIVATE
        ggml
        ${FOUNDATION_LIBRARY}
        ${METAL_FRAMEWORK}
        ${METALKIT_FRAMEWORK}
        ${METALPERFORMANCE_FRAMEWORK}
    )

    # TODO: temporary until the kernels are ready
    # custom command to build mtl.metal into a library
    # depends on the mtl.metal file
    add_custom_target(mtl.metallib-tmp ALL DEPENDS ${CMAKE_BINARY_DIR}/mtl.metallib)

    add_custom_command(
        OUTPUT ${CMAKE_BINARY_DIR}/mtl.metallib
        COMMAND xcrun -sdk macosx metal -c ${CMAKE_CURRENT_SOURCE_DIR}/mtl.metal -o ${CMAKE_BINARY_DIR}/mtl.air
        COMMAND xcrun -sdk macosx metallib ${CMAKE_BINARY_DIR}/mtl.air -o ${CMAKE_BINARY_DIR}/mtl.metallib
        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/mtl.metal
        COMMENT "Building mtl.metallib"
    )
endif()

51 changes: 51 additions & 0 deletions examples/mtl/mtl.cpp
@@ -0,0 +1,51 @@
#include "ggml.h"
#include "mtl.h"

#include <cstdio>
#include <cstring>
#include <cstdlib>

int main(int argc, char ** argv) {
ggml_time_init();

if (argc != 2) {
fprintf(stderr, "Usage: %s llama.ggml\n", argv[0]);
return -1;
}

const char * fname_cgraph = argv[1];

// load the compute graph
struct ggml_context * ctx_data = NULL;
struct ggml_context * ctx_eval = NULL;

struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
gf.n_threads = 1;

// allocate work context
static size_t buf_size = gf.work_size; // TODO
static void * buf = malloc(buf_size);

struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};

struct ggml_context * ctx_work = ggml_init(params);

// this allocates all Metal resources and memory buffers
auto * ctx_mtl = llama_mtl_init(ctx_data, ctx_eval, ctx_work, &gf);

// the actual inference happens here
llama_mtl_eval(ctx_mtl, &gf);

llama_mtl_free(ctx_mtl);

ggml_free(ctx_work);
ggml_free(ctx_data);
ggml_free(ctx_eval);

return 0;
}
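
The driver above only runs the graph; it does not read any results back. If one wanted to inspect the output after llama_mtl_eval(), a hypothetical addition at the end of main() could look like the snippet below (it assumes the last node of the imported graph is the f32 output tensor, which this PR does not guarantee):

```cpp
// hypothetical snippet for the end of main() in mtl.cpp above
// assumption: the final graph node holds the f32 output (e.g. the logits)
struct ggml_tensor * out = gf.nodes[gf.n_nodes - 1];

const float   * data = ggml_get_data_f32(out);
const int64_t   n    = ggml_nelements(out);

printf("output has %lld elements, output[0] = %8.4f\n", (long long) n, (double) data[0]);
```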

28 changes: 28 additions & 0 deletions examples/mtl/mtl.h
@@ -0,0 +1,28 @@
#pragma once

struct ggml_context;
struct ggml_cgraph;

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_mtl_context;

struct ggml_mtl_context * llama_mtl_init(
        struct ggml_context * ctx_data,
        struct ggml_context * ctx_eval,
        struct ggml_context * ctx_work,
        struct ggml_cgraph * gf);

void llama_mtl_free(struct ggml_mtl_context * ctx);

// return 0 on success
int llama_mtl_eval(
        struct ggml_mtl_context * ctx,
        struct ggml_cgraph * gf);

#ifdef __cplusplus
}
#endif
