Commit abfb6dc

feat: sync llama.cpp

jhen0409 committed Jan 19, 2024
1 parent 08d0a13 commit abfb6dc
Showing 7 changed files with 35 additions and 19 deletions.
cpp/common.cpp (10 additions, 0 deletions)
@@ -687,6 +687,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.hellaswag_tasks = std::stoi(argv[i]);
+        } else if (arg == "--winogrande") {
+            params.winogrande = true;
+        } else if (arg == "--winogrande-tasks") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.winogrande_tasks = std::stoi(argv[i]);
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--no-penalize-nl") {
@@ -932,6 +940,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --logits-all              return logits for all tokens in the batch (default: disabled)\n");
     printf("  --hellaswag               compute HellaSwag score over random tasks from datafile supplied with -f\n");
     printf("  --hellaswag-tasks N       number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
+    printf("  --winogrande              compute Winogrande score over random tasks from datafile supplied with -f\n");
+    printf("  --winogrande-tasks N      number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
     printf("  --keep N                  number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
     printf("  --draft N                 number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
     printf("  --chunks N                max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
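
The two new switches mirror the existing --hellaswag pair: a bare boolean flag, plus a flag that consumes the next argument behind the ++i >= argc guard. A minimal standalone sketch of the same pattern (hypothetical demo code, not part of common.cpp):

// Sketch of the flag-parsing pattern above; names are illustrative only.
#include <cstdio>
#include <string>

struct demo_params {
    bool   winogrande       = false;
    size_t winogrande_tasks = 0; // 0 = use all tasks
};

int main(int argc, char ** argv) {
    demo_params params;

    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "--winogrande") {
            params.winogrande = true;                 // bare switch, no value
        } else if (arg == "--winogrande-tasks") {
            if (++i >= argc) {                        // guard: switch given without a value
                fprintf(stderr, "error: missing value for --winogrande-tasks\n");
                return 1;
            }
            params.winogrande_tasks = std::stoi(argv[i]);
        }
    }

    printf("winogrande=%d, tasks=%zu\n", params.winogrande, params.winogrande_tasks);
    return 0;
}

Invoked as ./demo --winogrande --winogrande-tasks 200, the sketch prints winogrande=1, tasks=200.
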
cpp/common.h (3 additions, 0 deletions)
@@ -94,6 +94,9 @@ struct gpt_params {
     bool   hellaswag       = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
     size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score
 
+    bool   winogrande       = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+    size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+
     bool mul_mat_q     = true;  // if true, use mul_mat_q kernels instead of cuBLAS
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color     = false; // use color to distinguish generations and inputs
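
Note that winogrande_tasks defaults to 0, which the comment defines as "compute all tasks", whereas hellaswag_tasks defaults to 400. A consumer would presumably resolve the sentinel along these lines (hypothetical helper, not code from this commit):

// "0 means all" convention for winogrande_tasks, as a standalone helper.
#include <algorithm>
#include <cstddef>

size_t resolve_task_count(size_t requested, size_t available) {
    // requested == 0 is a sentinel for "use every task in the datafile";
    // otherwise clamp to the number of tasks actually loaded.
    return requested == 0 ? available : std::min(requested, available);
}
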
cpp/ggml-metal.m (6 additions, 10 deletions)
@@ -238,21 +238,19 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format,
 static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
     LM_GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
 
-    id<MTLDevice> device;
-    NSString * s;
 
-#if TARGET_OS_OSX
+#if TARGET_OS_OSX && !LM_GGML_METAL_NDEBUG
     // Show all the Metal device instances in the system
     NSArray * devices = MTLCopyAllDevices();
-    for (device in devices) {
-        s = [device name];
+    for (id<MTLDevice> device in devices) {
+        NSString * s = [device name];
         LM_GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
     }
     [devices release]; // since it was created by a *Copy* C method
 #endif
 
     // Pick and show default Metal device
-    device = MTLCreateSystemDefaultDevice();
-    s = [device name];
+    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+    NSString * s = [device name];
     LM_GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
 
     // Configure context
@@ -712,7 +710,6 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx,
 static bool lm_ggml_metal_graph_compute(
         struct lm_ggml_metal_context * ctx,
                struct lm_ggml_cgraph * gf) {
-    @autoreleasepool {
 
     MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
     edesc.dispatchType = MTLDispatchTypeSerial;
@@ -2255,7 +2252,6 @@ static bool lm_ggml_metal_graph_compute(
         }
 
     return true;
-    }
 }
 
 ////////////////////////////////////////////////////////////////////////////////
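
Two cleanups are visible in lm_ggml_metal_init: the device listing is now compiled only when LM_GGML_METAL_NDEBUG is not defined, and the temporaries device and s are declared at their point of use instead of at function scope. The same shape in a portable C++ sketch (hypothetical names throughout; MTLCopyAllDevices and the other Objective-C APIs are not reproduced here):

// Conditional-compilation plus narrow-scope pattern from the hunk above.
#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> enumerate_devices() {
    return {"device A", "device B"};   // stand-in for MTLCopyAllDevices()
}

static void init_backend() {
#if !defined(MY_BACKEND_NDEBUG)        // stand-in for !LM_GGML_METAL_NDEBUG
    // Debug builds only: show every device in the system
    for (const std::string & name : enumerate_devices()) { // loop-scoped, like 'device' above
        printf("found device: %s\n", name.c_str());
    }
#endif
    printf("picking default device\n");
}

int main() {
    init_backend();
    return 0;
}
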
cpp/llama.cpp (11 additions, 4 deletions)
@@ -1610,7 +1610,7 @@ struct llama_model {
     std::unique_ptr<llama_mmap> mapping;
 
     // objects representing data potentially being locked in memory
-    llama_mlock mlock_buf;
+    std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
     llama_mlock mlock_mmap;
 
     // for quantize-stats only
@@ -3449,7 +3449,12 @@ static bool llm_load_tensors(
             {
                 model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                if (lm_gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                } else {
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
+                    ml.n_created--; // artificial tensor
+                }
             }
 
             for (int i = 0; i < n_layer; ++i) {
@@ -3826,8 +3831,10 @@ static bool llm_load_tensors(
         else {
             buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf != nullptr && use_mlock && lm_ggml_backend_buffer_is_host(buf)) {
-                model.mlock_buf.init   (lm_ggml_backend_buffer_get_base(buf));
-                model.mlock_buf.grow_to(lm_ggml_backend_buffer_get_size(buf));
+                model.mlock_bufs.emplace_back(new llama_mlock);
+                auto & mlock_buf = model.mlock_bufs.back();
+                mlock_buf->init   (lm_ggml_backend_buffer_get_base(buf));
+                mlock_buf->grow_to(lm_ggml_backend_buffer_get_size(buf));
             }
         }
         if (buf == nullptr) {
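
Two behaviors change here. The output-tensor hunk appears to handle models whose output projection is tied to the token embeddings: when no separate output weight exists in the GGUF file, the loader reuses the token-embedding tensor and decrements n_created so the tensor accounting still balances. The mlock hunks replace the single mlock_buf with one llama_mlock per backend buffer, since weights may now live in several host buffers. A reduced sketch of that ownership pattern (lock_t is a hypothetical stand-in for llama_mlock; init/grow_to would wrap mlock(2) in the real implementation):

// One lock object per host buffer, owned through unique_ptr so each
// lock's address stays stable as the vector grows.
#include <cstddef>
#include <cstdio>
#include <memory>
#include <vector>

struct lock_t {
    void * addr = nullptr;
    size_t size = 0;
    void init(void * a)    { addr = a; } // remember the buffer base address
    void grow_to(size_t s) { size = s; } // lock the first s bytes
};

struct model_t {
    std::vector<std::unique_ptr<lock_t>> mlock_bufs;
};

static void lock_host_buffer(model_t & model, void * base, size_t size) {
    model.mlock_bufs.emplace_back(new lock_t);
    auto & mlock_buf = model.mlock_bufs.back();
    mlock_buf->init(base);
    mlock_buf->grow_to(size);
}

int main() {
    model_t model;
    static char buf[4096];               // pretend backend buffer
    lock_host_buffer(model, buf, sizeof(buf));
    printf("locked %zu buffer(s)\n", model.mlock_bufs.size());
    return 0;
}
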
example/src/App.tsx (1 addition, 1 deletion)
@@ -105,7 +105,7 @@ export default function App() {
     initLlama({
       model: file.uri,
       use_mlock: true,
-      n_gpu_layers: 0, // > 0: enable GPU
+      n_gpu_layers: Platform.OS === 'ios' ? 1 : 0, // > 0: enable GPU
       // embedding: true,
     })
       .then((ctx) => {
scripts/ggml-metal.m.patch (3 additions, 3 deletions)
@@ -1,6 +1,6 @@
---- ggml-metal.m.orig	2024-01-18 11:47:27
-+++ ggml-metal.m	2024-01-18 11:47:29
-@@ -290,7 +290,7 @@
+--- ggml-metal.m.orig	2024-01-19 10:06:53
++++ ggml-metal.m	2024-01-19 10:06:54
+@@ -288,7 +288,7 @@
      if (ggmlMetalPathResources) {
          sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"];
      } else {
