From 591b1c17b57545e2ac342d3761727fb7247a57bd Mon Sep 17 00:00:00 2001 From: TDM Date: Sat, 17 Jun 2023 15:12:27 +0530 Subject: [PATCH 1/5] WIP: Add ability to remove loras and save them in the context cache --- examples/addon.node/addon.cpp | 5 +- llama.cpp | 274 +++++++++++++++++++++++++++++++++- llama.h | 21 +++ 3 files changed, 295 insertions(+), 5 deletions(-) diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp index cae06cc9e29e0..00e19d677d16a 100644 --- a/examples/addon.node/addon.cpp +++ b/examples/addon.node/addon.cpp @@ -61,7 +61,10 @@ Napi::Number swapLora(const Napi::CallbackInfo &info) fprintf(stderr, "Acquiring lock\n"); worker_mutex.lock(); - fprintf(stderr, "Swapping lora from Path: %s\n", lora.c_str()); + fprintf(stderr, "Removing lora from Path: %s\n", lora.c_str()); + llama_remove_lora_from_file(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); + + fprintf(stderr, "Applying lora from Path: %s\n", lora.c_str()); llama_apply_lora_from_file(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); worker_mutex.unlock(); diff --git a/llama.cpp b/llama.cpp index 4a7d01b3297b2..71ae271f117dd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -52,6 +52,9 @@ #define LLAMA_USE_SCRATCH #define LLAMA_MAX_SCRATCH_BUFFERS 16 +#include +#include + // available llama models enum e_model { MODEL_UNKNOWN, @@ -278,6 +281,13 @@ struct llama_context { llama_ctx_buffer buf_compute; llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + // cache lora adapter weights, specifically s*BA matmul + // key -> base model layer name, value -> s*BA + // directly caching the matmul to avoid calculations at runtime + // for now only caches the last loaded adapter weights and assumes they'll be overwritten by next call + // TODO: Free these weights on deconstructor + std::unordered_map> adapter_weights; + #ifdef GGML_USE_METAL ggml_metal_context * ctx_metal = NULL; #endif @@ -873,6 +883,54 @@ struct llama_model_loader { }; +const double epsilon = 1e-4; + +bool isNotZeroFloat(float val) { + return std::fabs(val) > epsilon; +} + +void print_ggml_float_tensor(const struct ggml_tensor * tensor, std::string base_name, std::string metadata, int max_elements_to_print) { + fprintf(stderr, "Layer Name: %s, Metadata: %s\n", base_name.c_str(), metadata.c_str()); + + fprintf(stderr, "Tensor Type %d\n", tensor->type); + int64_t num_elements = ggml_nelements(tensor); + + fprintf(stderr, "Data Length Num Elements %ld\n", num_elements); + fprintf(stderr, "Data Length Bytes %ld\n", ggml_nbytes(tensor)); + + void* data; + float* dequantized_data; + dequantize_row_q_t dequantize_row_fn; + + int64_t num_elements_to_print = max_elements_to_print > 0 ? 
max_elements_to_print : num_elements; + + if (tensor->type == 0) { // F32 + data = reinterpret_cast(tensor->data); + for (int i = 0; i < num_elements_to_print; ++i) { + fprintf(stderr, "%d : %f\n", i, ((float*)data)[i]); + } + } else if (tensor->type == 1) { // F16 + data = reinterpret_cast(tensor->data); + for (int i = 0; i < num_elements_to_print; ++i) { + fprintf(stderr, "%d : %f\n", i, ggml_fp16_to_fp32(((ggml_fp16_t*)data)[i])); + } + } else { //Quantized data + // dequantised data + dequantized_data = (float*)malloc(num_elements_to_print * sizeof(float)); + + dequantize_row_fn = ggml_internal_get_quantize_fn(tensor->type).dequantize_row_q; + if (dequantize_row_fn == nullptr) { + fprintf(stderr, "Quantization type %d not supported for print tensors\n", tensor->type); + return; + } + dequantize_row_fn(tensor->data, dequantized_data, num_elements_to_print); + for (int i = 0; i < num_elements_to_print; ++i) { + fprintf(stderr, "%d : %f\n", i, dequantized_data[i]); + } + } + + fprintf(stderr, "\n"); +} // // kv cache @@ -2765,8 +2823,168 @@ int llama_model_quantize( } } -int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { - fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); +// Set `deactivate_adapter` to true if you want to remove the adapter from the model. +int llama_apply_lora_from_cache_internal(struct llama_context * ctx, std::unordered_map& cached_tensors, const char * path_base_model, int n_threads, const bool deactivate_adapter) { + if (deactivate_adapter) { + fprintf(stderr, "%s: deactivating lora adapter - please wait ...\n", __func__); + } else { + fprintf(stderr, "%s: applying lora adapter - please wait ...\n", __func__); + } + + auto & model = ctx->model; + + //std::unordered_map> lora_tensors = cached_tensors.size() == 0 ? 
ctx->adapter_weights : cached_tensors; + + std::unordered_map> lora_tensors = ctx->adapter_weights; + + if (lora_tensors.size() == 0) { + fprintf(stderr, "%s: no tensors to apply\n", __func__); + return 0; + } + + const int64_t t_start_lora_us = ggml_time_us(); + + // create a temporary ggml context to store the lora tensors + // todo: calculate size from biggest possible tensor + std::vector lora_buf(1024ull * 1024ull * 1024ull); + struct ggml_init_params params; + params.mem_size = lora_buf.size(); + params.mem_buffer = lora_buf.data(); + params.no_alloc = false; + + ggml_context * lora_ctx = ggml_init(params); + + // create a name -> tensor map of the model to accelerate lookups + std::unordered_map model_tensors; + for (auto & kv: model.tensors_by_name) { + model_tensors.insert(kv); + } + + + // load base model + std::unique_ptr model_loader; + ggml_context * base_ctx = NULL; + llama_buffer base_buf; + if (path_base_model) { + fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model); + model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false)); + + size_t ctx_size; + size_t mmapped_size; + model_loader->calc_sizes(&ctx_size, &mmapped_size); + base_buf.resize(ctx_size); + + ggml_init_params base_params; + base_params.mem_size = base_buf.size; + base_params.mem_buffer = base_buf.addr; + base_params.no_alloc = model_loader->use_mmap; + + base_ctx = ggml_init(base_params); + + model_loader->ggml_ctx = base_ctx; + + // maybe this should in llama_model_loader + if (model_loader->use_mmap) { + model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0)); + } + } + + // read tensors and apply + bool warned = false; + int n_tensors = 0; + + for (auto it = lora_tensors.begin(); it != lora_tensors.end(); ++it) { + const std::string& base_name = it->first; + struct std::vector BA_vector = it->second; + ggml_tensor* BA = ggml_new_tensor_2d(lora_ctx, GGML_TYPE_F32, BA_vector.size() / 4096, 4096); + BA->data = (float*) BA_vector.data(); + + // check if we have both A and B tensors and apply + if (model_tensors.find(base_name) == model_tensors.end()) { + fprintf(stderr, "%s: error: tensor '%s' not found in model\n", __func__, base_name.c_str()); + continue; + } + + ggml_tensor * dest_t = model_tensors[base_name]; + ggml_tensor * base_t; + if (model_loader) { + // load from base model + if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) { + fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); + return 1; + } + size_t idx = model_loader->tensors_map.name_to_idx[base_name]; + llama_load_tensor & lt = model_loader->tensors_map.tensors[idx]; + base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); + lt.data = (uint8_t *) lt.ggml_tensor->data; + model_loader->load_data_for(lt); + lt.ggml_tensor->data = lt.data; + } + else { + base_t = dest_t; + } + + if (ggml_is_quantized(base_t->type)) { + if (!warned) { + fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; + } + } + + // w = w - BAs to unload the model + // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model + // obvious improvement would be just caching the weights on load and using them on 
unload + if (deactivate_adapter) { + BA = ggml_neg(lora_ctx, BA); // woould like to use inplace implementation but unfortunately its not exposed in ggml.h + } + + ggml_tensor * r; + if (base_t == dest_t) { + r = ggml_add_inplace(lora_ctx, dest_t, BA); + } + else { + r = ggml_add(lora_ctx, base_t, BA); + r = ggml_cpy(lora_ctx, r, dest_t); + } + + struct ggml_cgraph gf = ggml_build_forward(r); + gf.n_threads = n_threads; + ggml_graph_compute(lora_ctx, &gf); + + + // we won't need these tensors again, reset the context to save memory + ggml_free(lora_ctx); + lora_ctx = ggml_init(params); + + n_tensors++; + if (n_tensors % 4 == 0) { + fprintf(stderr, "."); + } + } + + // TODO: this should be in a destructor, it will leak on failure + ggml_free(lora_ctx); + if (base_ctx) { + ggml_free(base_ctx); + } + + //TODO: Decide if I want to clear up the lora cache or not + + const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; + fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); + + return 0; +} + + +// Set `deactivate_adapter` to true if you want to remove the adapter from the model. +int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads, const bool deactivate_adapter) { + if (deactivate_adapter) { + fprintf(stderr, "%s: deactivating lora adapter - please wait ...\n", __func__); + } else { + fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); + } auto & model = ctx->model; @@ -2853,6 +3071,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * // read tensors and apply bool warned = false; int n_tensors = 0; + while (true) { int32_t n_dims; int32_t length; @@ -2965,7 +3184,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * return 1; } - // w = w + BA*s + // BA*s ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); if (scaling != 1.0f) { @@ -2973,6 +3192,13 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); } + // w = w - BAs to unload the model + // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model + // obvious improvement would be just caching the weights on load and using them on unload + if (deactivate_adapter) { + BA = ggml_neg(lora_ctx, BA); // woould like to use inplace implementation but unfortunately its not exposed in ggml.h + } + ggml_tensor * r; if (base_t == dest_t) { r = ggml_add_inplace(lora_ctx, dest_t, BA); @@ -2986,6 +3212,19 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); + if (!deactivate_adapter) { + const int64_t t_copy_lora_us = ggml_time_us(); + + // The copying poses a runtime cost though, might need to find a faster way to do this, some kind of cache warmup at the time of process restart + float* BA_data = (float *) BA->data; // can do this cause BA is guaranteed to be F32 for now + + std::vector BA_data_copy(BA_data, BA_data + ggml_nelements(BA)); + ctx->adapter_weights[base_name] = BA_data_copy; + + const int64_t t_copy_lora_us_end = ggml_time_us(); + + fprintf(stderr, "%s: copied lora tensor '%s' in %.2f ms\n", __func__, base_name.c_str(), (t_copy_lora_us_end - t_copy_lora_us) / 1000.0); + } // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); lora_ctx = ggml_init(params); @@ 
-3012,7 +3251,34 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { try { - return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads); + return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads, false); + } catch (const std::exception & err) { + fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); + return 1; + } +} + +int llama_remove_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { + try { + return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads, true); // deactivate adapter by setting deactivate_adapter to true + } catch (const std::exception & err) { + fprintf(stderr, "%s: failed to remove lora adapter: %s\n", __func__, err.what()); + return 1; + } +} + +int llama_apply_lora_from_cache(struct llama_context * ctx, std::unordered_map lora_cache, const char * path_base_model, int n_threads) { + try { + return llama_apply_lora_from_cache_internal(ctx, lora_cache, path_base_model, n_threads, false); + } catch (const std::exception & err) { + fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); + return 1; + } +} + +int llama_remove_lora_from_cache(struct llama_context * ctx, std::unordered_map lora_cache, const char * path_base_model, int n_threads) { + try { + return llama_apply_lora_from_cache_internal(ctx, lora_cache, path_base_model, n_threads, true); } catch (const std::exception & err) { fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); return 1; diff --git a/llama.h b/llama.h index 1241ba6c0ec44..e38cd0c732814 100644 --- a/llama.h +++ b/llama.h @@ -11,6 +11,9 @@ #include #include #include +#include +#include +#include #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) @@ -164,6 +167,24 @@ extern "C" { const char * path_lora, const char * path_base_model, int n_threads); + + LLAMA_API int llama_remove_lora_from_file( + struct llama_context * ctx, + const char * path_lora, + const char * path_base_model, + int n_threads); + + LLAMA_API int llama_apply_lora_from_cache( + struct llama_context * ctx, + std::unordered_map< std::string, ggml_tensor* > lora_cache, + const char * path_base_model, + int n_threads); + + LLAMA_API int llama_remove_lora_from_cache( + struct llama_context * ctx, + std::unordered_map< std::string, ggml_tensor* > lora_cache, + const char * path_base_model, + int n_threads); // Returns the number of tokens in the KV cache LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); From 9f7ec8f238ba1a0581f5b3f3a456bb0d75e23f11 Mon Sep 17 00:00:00 2001 From: TDM Date: Sat, 17 Jun 2023 17:57:44 +0530 Subject: [PATCH 2/5] Cache individual adapters instead of matmul result --- llama.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/llama.cpp b/llama.cpp index 71ae271f117dd..0eb39abbd9992 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2893,6 +2893,7 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, std::unorde bool warned = false; int n_tensors = 0; + //TODO: Fix this to process loraA and loraB seperately for (auto it = lora_tensors.begin(); it != lora_tensors.end(); ++it) { const std::string& base_name = it->first; struct std::vector BA_vector = it->second; @@ -3212,19 +3213,19 @@ int 
llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); + //TODO: Should use a lora_cache struct where we can also store matrix dimensions & other metadata that are required for calculations once load if (!deactivate_adapter) { - const int64_t t_copy_lora_us = ggml_time_us(); - // The copying poses a runtime cost though, might need to find a faster way to do this, some kind of cache warmup at the time of process restart - float* BA_data = (float *) BA->data; // can do this cause BA is guaranteed to be F32 for now + float* loraA_data = (float *) loraA->data; // can do this cause lora is guaranteed to be F32 for now - std::vector BA_data_copy(BA_data, BA_data + ggml_nelements(BA)); - ctx->adapter_weights[base_name] = BA_data_copy; - - const int64_t t_copy_lora_us_end = ggml_time_us(); + std::vector loraA_data_copy(loraA_data, loraA_data + ggml_nelements(loraA)); + ctx->adapter_weights[base_name + ".loraA"] = loraA_data_copy; - fprintf(stderr, "%s: copied lora tensor '%s' in %.2f ms\n", __func__, base_name.c_str(), (t_copy_lora_us_end - t_copy_lora_us) / 1000.0); + float* loraB_data = (float *) loraB->data; // can do this cause lora is guaranteed to be F32 for now + std::vector loraB_data_copy(loraB_data, loraB_data + ggml_nelements(loraB)); + ctx->adapter_weights[base_name + ".loraB"] = loraB_data_copy; } + // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); lora_ctx = ggml_init(params); From edf7d285b550881c171438efb87fecb18b4348e4 Mon Sep 17 00:00:00 2001 From: TDM Date: Sat, 17 Jun 2023 20:49:24 +0530 Subject: [PATCH 3/5] Cache metadata of individual adapaters and change the load method to use cache --- examples/addon.node/addon.cpp | 4 +- llama.cpp | 114 ++++++++++++++++++++++++++-------- llama.h | 4 +- 3 files changed, 92 insertions(+), 30 deletions(-) diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp index 00e19d677d16a..ac3f115f58e4b 100644 --- a/examples/addon.node/addon.cpp +++ b/examples/addon.node/addon.cpp @@ -62,10 +62,10 @@ Napi::Number swapLora(const Napi::CallbackInfo &info) worker_mutex.lock(); fprintf(stderr, "Removing lora from Path: %s\n", lora.c_str()); - llama_remove_lora_from_file(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); + llama_remove_lora_from_cache(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); fprintf(stderr, "Applying lora from Path: %s\n", lora.c_str()); - llama_apply_lora_from_file(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); + llama_apply_lora_from_cache(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); worker_mutex.unlock(); return Napi::Number::New(info.Env(), 0); diff --git a/llama.cpp b/llama.cpp index 0eb39abbd9992..c3e2aad895148 100644 --- a/llama.cpp +++ b/llama.cpp @@ -249,6 +249,26 @@ struct llama_vocab { std::vector id_to_token; }; +struct lora_metadata { + std::string name; + int64_t ne[2]; + ggml_type type; +}; + +// TODO: move outside llama to addon.cpp for easy merge in the future +// contains layer name to weight mapping +struct lora_adapter_weights_map { + int32_t lora_alpha; + int32_t lora_r; + float scaling; + + std::unordered_map lora_metadata_map; + std::unordered_map > loraA_weights; + std::unordered_map > loraB_weights; + // strings are being kept thrice, mem usage can be reduced further using a single map +}; + + struct llama_context { std::mt19937 rng; @@ -282,11 +302,10 @@ struct llama_context { llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; // cache 
lora adapter weights, specifically s*BA matmul - // key -> base model layer name, value -> s*BA - // directly caching the matmul to avoid calculations at runtime - // for now only caches the last loaded adapter weights and assumes they'll be overwritten by next call + // key -> lora model path // TODO: Free these weights on deconstructor - std::unordered_map> adapter_weights; + std::unordered_map lora_cache; + #ifdef GGML_USE_METAL ggml_metal_context * ctx_metal = NULL; @@ -2824,23 +2843,23 @@ int llama_model_quantize( } // Set `deactivate_adapter` to true if you want to remove the adapter from the model. -int llama_apply_lora_from_cache_internal(struct llama_context * ctx, std::unordered_map& cached_tensors, const char * path_base_model, int n_threads, const bool deactivate_adapter) { +int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char * lora_path, const char * path_base_model, int n_threads, const bool deactivate_adapter) { if (deactivate_adapter) { - fprintf(stderr, "%s: deactivating lora adapter - please wait ...\n", __func__); + fprintf(stderr, "%s: deactivating lora adapter from cache - please wait ...\n", __func__); } else { - fprintf(stderr, "%s: applying lora adapter - please wait ...\n", __func__); + fprintf(stderr, "%s: applying lora adapter from cache - please wait ...\n", __func__); } auto & model = ctx->model; //std::unordered_map> lora_tensors = cached_tensors.size() == 0 ? ctx->adapter_weights : cached_tensors; - std::unordered_map> lora_tensors = ctx->adapter_weights; - - if (lora_tensors.size() == 0) { - fprintf(stderr, "%s: no tensors to apply\n", __func__); - return 0; + if (ctx->lora_cache.find(lora_path) == ctx->lora_cache.end()) { + fprintf(stderr, "%s: error: cached lora '%s' not found\n", __func__, lora_path); + return 1; } + + lora_adapter_weights_map* cached_lora_adapter = ctx->lora_cache[lora_path]; const int64_t t_start_lora_us = ggml_time_us(); @@ -2893,12 +2912,22 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, std::unorde bool warned = false; int n_tensors = 0; + float scaling = cached_lora_adapter->scaling; + //TODO: Fix this to process loraA and loraB seperately - for (auto it = lora_tensors.begin(); it != lora_tensors.end(); ++it) { + for (auto it = cached_lora_adapter->lora_metadata_map.begin(); it != cached_lora_adapter->lora_metadata_map.end(); ++it) { const std::string& base_name = it->first; - struct std::vector BA_vector = it->second; - ggml_tensor* BA = ggml_new_tensor_2d(lora_ctx, GGML_TYPE_F32, BA_vector.size() / 4096, 4096); - BA->data = (float*) BA_vector.data(); + const lora_metadata* metadata = it->second; + std::vector loraA_vec = cached_lora_adapter->loraA_weights[base_name]; + std::vector loraB_vec = cached_lora_adapter->loraB_weights[base_name]; + int ne0 = metadata->ne[0]; + int ne1 = metadata->ne[1]; + + ggml_tensor* loraA = ggml_new_tensor_2d(lora_ctx, GGML_TYPE_F32, ne0, ne1); // for now it's fine since lora calculations are always in F32 + ggml_tensor* loraB = ggml_new_tensor_2d(lora_ctx, GGML_TYPE_F32, ne0, ne1); + + loraA->data = (float*) loraA_vec.data(); + loraB->data = (float*) loraB_vec.data(); // check if we have both A and B tensors and apply if (model_tensors.find(base_name) == model_tensors.end()) { @@ -2933,6 +2962,15 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, std::unorde } } + // BA*s + ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); + + if (scaling != 1.0f) { + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + BA = 
ggml_scale_inplace(lora_ctx, BA, scale_tensor); + } + + // w = w - BAs to unload the model // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model // obvious improvement would be just caching the weights on load and using them on unload @@ -2953,6 +2991,14 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, std::unorde gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); + if (base_name == "layers.0.attention.wk.weight") { + std::string print_metadata = "Applying lora from cache"; + if (deactivate_adapter) { + print_metadata = "Deactivating lora from cache"; + } + print_ggml_float_tensor(dest_t, "layers.0.attention.wk.weight", print_metadata, 10); + + } // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); @@ -3073,6 +3119,11 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * bool warned = false; int n_tensors = 0; + lora_adapter_weights_map* lora_adapter_weights = new lora_adapter_weights_map(); + lora_adapter_weights->lora_alpha = lora_alpha; + lora_adapter_weights->lora_r = lora_r; + lora_adapter_weights->scaling = scaling; + while (true) { int32_t n_dims; int32_t length; @@ -3213,17 +3264,23 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); - //TODO: Should use a lora_cache struct where we can also store matrix dimensions & other metadata that are required for calculations once load if (!deactivate_adapter) { // The copying poses a runtime cost though, might need to find a faster way to do this, some kind of cache warmup at the time of process restart - float* loraA_data = (float *) loraA->data; // can do this cause lora is guaranteed to be F32 for now + float* loraA_data = (float *) loraA->data; // can do this cause lora is guaranteed to be F32 for now std::vector loraA_data_copy(loraA_data, loraA_data + ggml_nelements(loraA)); - ctx->adapter_weights[base_name + ".loraA"] = loraA_data_copy; float* loraB_data = (float *) loraB->data; // can do this cause lora is guaranteed to be F32 for now std::vector loraB_data_copy(loraB_data, loraB_data + ggml_nelements(loraB)); - ctx->adapter_weights[base_name + ".loraB"] = loraB_data_copy; + + lora_metadata* lora_layer_metadata = new lora_metadata(); + lora_layer_metadata->ne[0] = loraA->ne[0]; + lora_layer_metadata->ne[1] = loraA->ne[1]; + lora_layer_metadata->type = loraA->type; + + lora_adapter_weights->loraA_weights[base_name] = loraA_data_copy; + lora_adapter_weights->loraB_weights[base_name] = loraB_data_copy; + lora_adapter_weights->lora_metadata_map[base_name] = lora_layer_metadata; } // we won't need these tensors again, reset the context to save memory @@ -3244,6 +3301,10 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * ggml_free(base_ctx); } + if (!deactivate_adapter) { + ctx->lora_cache[path_lora] = lora_adapter_weights; + } + const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); @@ -3268,20 +3329,21 @@ int llama_remove_lora_from_file(struct llama_context * ctx, const char * path_lo } } -int llama_apply_lora_from_cache(struct llama_context * ctx, std::unordered_map lora_cache, const char * path_base_model, int n_threads) { +int llama_apply_lora_from_cache(struct llama_context * ctx, const char * path_lora ,const char * path_base_model, int n_threads) { try { - return 
llama_apply_lora_from_cache_internal(ctx, lora_cache, path_base_model, n_threads, false); + return llama_apply_lora_from_cache_internal(ctx, path_lora, path_base_model, n_threads, false); } catch (const std::exception & err) { - fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); + fprintf(stderr, "%s: failed to apply cached lora adapter: %s\n", __func__, err.what()); return 1; } } -int llama_remove_lora_from_cache(struct llama_context * ctx, std::unordered_map lora_cache, const char * path_base_model, int n_threads) { + +int llama_remove_lora_from_cache(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { try { - return llama_apply_lora_from_cache_internal(ctx, lora_cache, path_base_model, n_threads, true); + return llama_apply_lora_from_cache_internal(ctx, path_lora, path_base_model, n_threads, true); } catch (const std::exception & err) { - fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); + fprintf(stderr, "%s: failed to remove cached lora adapter: %s\n", __func__, err.what()); return 1; } } diff --git a/llama.h b/llama.h index e38cd0c732814..d0d53d1a91cdc 100644 --- a/llama.h +++ b/llama.h @@ -176,13 +176,13 @@ extern "C" { LLAMA_API int llama_apply_lora_from_cache( struct llama_context * ctx, - std::unordered_map< std::string, ggml_tensor* > lora_cache, + const char * path_lora, const char * path_base_model, int n_threads); LLAMA_API int llama_remove_lora_from_cache( struct llama_context * ctx, - std::unordered_map< std::string, ggml_tensor* > lora_cache, + const char * path_lora, const char * path_base_model, int n_threads); From 1309f2504323e3f386a714d97adeba7a009bbff0 Mon Sep 17 00:00:00 2001 From: TDM Date: Mon, 19 Jun 2023 23:19:25 +0530 Subject: [PATCH 4/5] Add ability to swap lora in a single method by combining graph calculations --- examples/addon.node/addon.cpp | 8 +- llama.cpp | 307 +++++++++++++++++++++++++++++++--- llama.h | 3 + 3 files changed, 292 insertions(+), 26 deletions(-) diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp index ac3f115f58e4b..3e8580e4fc031 100644 --- a/examples/addon.node/addon.cpp +++ b/examples/addon.node/addon.cpp @@ -61,11 +61,11 @@ Napi::Number swapLora(const Napi::CallbackInfo &info) fprintf(stderr, "Acquiring lock\n"); worker_mutex.lock(); - fprintf(stderr, "Removing lora from Path: %s\n", lora.c_str()); - llama_remove_lora_from_cache(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); + // fprintf(stderr, "Removing lora from Path: %s\n", lora.c_str()); + // llama_remove_lora_from_cache(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); - fprintf(stderr, "Applying lora from Path: %s\n", lora.c_str()); - llama_apply_lora_from_cache(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); + fprintf(stderr, "Swapping lora from Path: %s\n", lora.c_str()); + llama_swap_lora_from_cache(g_ctx, lora.c_str(), NULL, get_num_physical_cores(), lora.c_str()); worker_mutex.unlock(); return Napi::Number::New(info.Env(), 0); diff --git a/llama.cpp b/llama.cpp index c3e2aad895148..08c081e27ce6e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -305,6 +305,7 @@ struct llama_context { // key -> lora model path // TODO: Free these weights on deconstructor std::unordered_map lora_cache; + std::unordered_map loaded_loras; #ifdef GGML_USE_METAL @@ -2842,9 +2843,237 @@ int llama_model_quantize( } } -// Set `deactivate_adapter` to true if you want to remove the adapter from the model. 
-int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char * lora_path, const char * path_base_model, int n_threads, const bool deactivate_adapter) { - if (deactivate_adapter) { +int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * path_lora_to_apply, const char * path_base_model, int n_threads, const char * path_lora_to_remove) { + int64_t t_lora_cache_us = ggml_time_us(); + + auto & model = ctx->model; + + //std::unordered_map> lora_tensors = cached_tensors.size() == 0 ? ctx->adapter_weights : cached_tensors; + + if (ctx->lora_cache.find(path_lora_to_apply) == ctx->lora_cache.end()) { + fprintf(stderr, "%s: error: cached lora '%s' not found\n", __func__, path_lora_to_apply); + return 1; + } + + lora_adapter_weights_map* cached_lora_adapter_apply = ctx->lora_cache[path_lora_to_apply]; + lora_adapter_weights_map* cached_lora_adapter_remove = ctx->lora_cache[path_lora_to_remove]; + + if (cached_lora_adapter_apply->loraA_weights.size() == 0) { + fprintf(stderr, "%s: error: cached lora '%s' is empty\n", __func__, path_lora_to_apply); + return 1; + } + + if (cached_lora_adapter_remove->loraA_weights.size() == 0) { + fprintf(stderr, "%s: error: cached lora '%s' is empty\n", __func__, path_lora_to_remove); + return 1; + } + + const int64_t t_start_lora_us = ggml_time_us(); + + // create a temporary ggml context to store the lora tensors + // todo: calculate size from biggest possible tensor + std::vector lora_buf(1024ull * 1024ull * 1024ull); + struct ggml_init_params params; + params.mem_size = lora_buf.size(); + params.mem_buffer = lora_buf.data(); + params.no_alloc = false; + + ggml_context * lora_ctx = ggml_init(params); + + // create a name -> tensor map of the model to accelerate lookups + std::unordered_map model_tensors; + for (auto & kv: model.tensors_by_name) { + model_tensors.insert(kv); + } + + // read tensors and apply + bool warned = false; + int n_tensors = 0; + + //TODO: Fix this to process loraA and loraB seperately + for (auto it = model_tensors.begin(); it != model_tensors.end(); ++it) { + t_lora_cache_us = ggml_time_us(); + const std::string& base_name = it->first; + ggml_tensor * dest_t = it->second; + + ggml_tensor * r; + ggml_tensor* BA_apply; + ggml_tensor* BA_remove; + + + bool lora_found = false; + // check if we have both A and B tensors and apply + if (cached_lora_adapter_apply->loraA_weights.find(base_name) != cached_lora_adapter_apply->loraA_weights.end() || + cached_lora_adapter_apply->loraB_weights.find(base_name) != cached_lora_adapter_apply->loraB_weights.end()) { + std::vector loraA_vec = cached_lora_adapter_apply->loraA_weights[base_name]; + std::vector loraB_vec = cached_lora_adapter_apply->loraB_weights[base_name]; + float scaling = cached_lora_adapter_apply->scaling; + + lora_metadata* metadata = cached_lora_adapter_apply->lora_metadata_map[base_name]; + + int ne0 = metadata->ne[0]; + int ne1 = metadata->ne[1]; + + ggml_tensor* loraA = ggml_new_tensor_2d(lora_ctx, GGML_TYPE_F32, ne0, ne1); // for now it's fine since lora calculations are always in F32 + ggml_tensor* loraB = ggml_new_tensor_2d(lora_ctx, GGML_TYPE_F32, ne0, ne1); + + loraA->data = (float*) loraA_vec.data(); + loraB->data = (float*) loraB_vec.data(); + + ggml_tensor * base_t; + + base_t = dest_t; + + if (ggml_is_quantized(base_t->type)) { + if (!warned) { + fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; + } + 
} + + // BA*s + BA_apply = ggml_mul_mat(lora_ctx, loraA, loraB); + + //print time taken till now + // fprintf(stderr, "time taken till matmul %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + + if (scaling != 1.0f) { + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + BA_apply = ggml_scale_inplace(lora_ctx, BA_apply, scale_tensor); + } + + lora_found = true; + } + + // check if we have both A and B tensors and remove + if (cached_lora_adapter_remove->loraA_weights.find(base_name) != cached_lora_adapter_remove->loraA_weights.end() || + cached_lora_adapter_remove->loraB_weights.find(base_name) != cached_lora_adapter_remove->loraB_weights.end()) { + std::vector loraA_vec = cached_lora_adapter_remove->loraA_weights[base_name]; + std::vector loraB_vec = cached_lora_adapter_remove->loraB_weights[base_name]; + float scaling = cached_lora_adapter_remove->scaling; + + lora_metadata* metadata = cached_lora_adapter_remove->lora_metadata_map[base_name]; + + int ne0 = metadata->ne[0]; + int ne1 = metadata->ne[1]; + + ggml_tensor* loraA = ggml_new_tensor_2d(lora_ctx, GGML_TYPE_F32, ne0, ne1); // for now it's fine since lora calculations are always in F32 + ggml_tensor* loraB = ggml_new_tensor_2d(lora_ctx, GGML_TYPE_F32, ne0, ne1); + + loraA->data = (float*) loraA_vec.data(); + loraB->data = (float*) loraB_vec.data(); + + ggml_tensor * base_t; + + base_t = dest_t; + + if (ggml_is_quantized(base_t->type)) { + if (!warned) { + fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; + } + } + + // BA*s + BA_remove = ggml_mul_mat(lora_ctx, loraA, loraB); + + //print time taken till now + // fprintf(stderr, "time taken till matmul %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + + scaling = -1.0f * scaling; + if (scaling != 1.0f) { + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + BA_remove = ggml_scale_inplace(lora_ctx, BA_remove, scale_tensor); + } + + // BA_remove = ggml_neg(lora_ctx, BA_remove); // woould like to use inplace implementation but unfortunately its not exposed in ggml.h + lora_found = true; + } + + + if (!lora_found) { + // fprintf(stderr, "%s: error: No lora tensors found for layer '%s'\n", __func__, base_name.c_str()); + continue; + } + + //print time taken till now + // fprintf(stderr, "time taken till scaling %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + + // w = w - BAs to unload the model + // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model + // obvious improvement would be just caching the weights on load and using them on unload + + //print time taken till now + // fprintf(stderr, "time taken till neg %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + + + + //print time taken till now + // fprintf(stderr, "time taken till add inplace %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + ggml_tensor* BA; + if (BA_apply && BA_remove) { + BA = ggml_add(lora_ctx, BA_apply, BA_remove); + } else if (BA_apply) { + BA = BA_apply; + } else { + BA = BA_remove; + } + + r = ggml_add(lora_ctx, dest_t, BA); + + struct ggml_cgraph gf = ggml_build_forward(r); + + //print time taken till now + // fprintf(stderr, "time taken till graph build forward %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - 
t_lora_cache_us)/1000.0); + + gf.n_threads = n_threads; + ggml_graph_compute(lora_ctx, &gf); + + //print time taken till now + // fprintf(stderr, "time taken till graph compute %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + + // we won't need these tensors again, reset the context to save memory + ggml_free(lora_ctx); + lora_ctx = ggml_init(params); + + n_tensors++; + if (n_tensors % 4 == 0) { + fprintf(stderr, "."); + } + + if (base_name == "layers.0.attention.wk.weight") { + // print_tensor(r); + print_ggml_float_tensor(dest_t, "TEST", "TEST", 10); + } + + //print time taken till now + // fprintf(stderr, "time taken to process cached layer %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + } + + // TODO: this should be in a destructor, it will leak on failure + ggml_free(lora_ctx); + + if (path_lora_to_remove) { + ctx->loaded_loras.erase(path_lora_to_remove); + } + + ctx->loaded_loras[path_lora_to_apply] = true; + + //TODO: Decide if I want to clear up the lora cache or not + + const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; + fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); + + return 0; +} + +// Set `remove_existing` to true if you want to remove the adapter from the model. +int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads, const bool remove_existing) { + int64_t t_lora_cache_us = ggml_time_us(); + + if (remove_existing) { fprintf(stderr, "%s: deactivating lora adapter from cache - please wait ...\n", __func__); } else { fprintf(stderr, "%s: applying lora adapter from cache - please wait ...\n", __func__); @@ -2854,12 +3083,12 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char //std::unordered_map> lora_tensors = cached_tensors.size() == 0 ? 
ctx->adapter_weights : cached_tensors; - if (ctx->lora_cache.find(lora_path) == ctx->lora_cache.end()) { - fprintf(stderr, "%s: error: cached lora '%s' not found\n", __func__, lora_path); + if (ctx->lora_cache.find(path_lora) == ctx->lora_cache.end()) { + fprintf(stderr, "%s: error: cached lora '%s' not found\n", __func__, path_lora); return 1; } - lora_adapter_weights_map* cached_lora_adapter = ctx->lora_cache[lora_path]; + lora_adapter_weights_map* cached_lora_adapter = ctx->lora_cache[path_lora]; const int64_t t_start_lora_us = ggml_time_us(); @@ -2879,6 +3108,8 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char model_tensors.insert(kv); } + //print time taken till now + //fprintf(stderr, "%s: time taken till copying base model weights = %8.2f ms\n", __func__, (ggml_time_us() - t_lora_cache_us)/1000.0); // load base model std::unique_ptr model_loader; @@ -2916,6 +3147,7 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char //TODO: Fix this to process loraA and loraB seperately for (auto it = cached_lora_adapter->lora_metadata_map.begin(); it != cached_lora_adapter->lora_metadata_map.end(); ++it) { + t_lora_cache_us = ggml_time_us(); const std::string& base_name = it->first; const lora_metadata* metadata = it->second; std::vector loraA_vec = cached_lora_adapter->loraA_weights[base_name]; @@ -2965,19 +3197,27 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char // BA*s ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); + //print time taken till now + // fprintf(stderr, "time taken till matmul %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + if (scaling != 1.0f) { ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); } + //print time taken till now + // fprintf(stderr, "time taken till scaling %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); // w = w - BAs to unload the model // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model // obvious improvement would be just caching the weights on load and using them on unload - if (deactivate_adapter) { + if (remove_existing) { BA = ggml_neg(lora_ctx, BA); // woould like to use inplace implementation but unfortunately its not exposed in ggml.h } + //print time taken till now + // fprintf(stderr, "time taken till neg %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + ggml_tensor * r; if (base_t == dest_t) { r = ggml_add_inplace(lora_ctx, dest_t, BA); @@ -2987,18 +3227,19 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char r = ggml_cpy(lora_ctx, r, dest_t); } + //print time taken till now + // fprintf(stderr, "time taken till add inplace %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + struct ggml_cgraph gf = ggml_build_forward(r); + + //print time taken till now + // fprintf(stderr, "time taken till graph build forward %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); - if (base_name == "layers.0.attention.wk.weight") { - std::string print_metadata = "Applying lora from cache"; - if (deactivate_adapter) { - print_metadata = "Deactivating lora from cache"; - } - print_ggml_float_tensor(dest_t, "layers.0.attention.wk.weight", print_metadata, 10); - - } + //print time taken 
till now + // fprintf(stderr, "time taken till graph compute %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); @@ -3008,6 +3249,9 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char if (n_tensors % 4 == 0) { fprintf(stderr, "."); } + + //print time taken till now + // fprintf(stderr, "time taken to process cached layer %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); } // TODO: this should be in a destructor, it will leak on failure @@ -3016,6 +3260,12 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char ggml_free(base_ctx); } + if (remove_existing) { + ctx->loaded_loras.erase(path_lora); + } else { + ctx->loaded_loras[path_lora] = true; + } + //TODO: Decide if I want to clear up the lora cache or not const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; @@ -3025,9 +3275,9 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char } -// Set `deactivate_adapter` to true if you want to remove the adapter from the model. -int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads, const bool deactivate_adapter) { - if (deactivate_adapter) { +// Set `remove_existing` to true if you want to remove the adapter from the model. +int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads, const bool remove_existing) { + if (remove_existing) { fprintf(stderr, "%s: deactivating lora adapter - please wait ...\n", __func__); } else { fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); @@ -3247,7 +3497,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * // w = w - BAs to unload the model // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model // obvious improvement would be just caching the weights on load and using them on unload - if (deactivate_adapter) { + if (remove_existing) { BA = ggml_neg(lora_ctx, BA); // woould like to use inplace implementation but unfortunately its not exposed in ggml.h } @@ -3264,7 +3514,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); - if (!deactivate_adapter) { + if (!remove_existing) { // The copying poses a runtime cost though, might need to find a faster way to do this, some kind of cache warmup at the time of process restart float* loraA_data = (float *) loraA->data; // can do this cause lora is guaranteed to be F32 for now @@ -3301,7 +3551,10 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * ggml_free(base_ctx); } - if (!deactivate_adapter) { + if (remove_existing) { + ctx->loaded_loras.erase(path_lora); + } else { + ctx->loaded_loras[path_lora] = true; ctx->lora_cache[path_lora] = lora_adapter_weights; } @@ -3322,7 +3575,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor int llama_remove_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { try { - return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads, true); // deactivate adapter by setting deactivate_adapter to true + 
return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads, true); // deactivate adapter by setting remove_existing to true } catch (const std::exception & err) { fprintf(stderr, "%s: failed to remove lora adapter: %s\n", __func__, err.what()); return 1; @@ -3348,6 +3601,16 @@ int llama_remove_lora_from_cache(struct llama_context * ctx, const char * path_l } } +int llama_swap_lora_from_cache(struct llama_context * ctx, const char * path_lora_to_apply, const char * path_base_model, int n_threads, const char * path_lora_to_remove) { + try { + return llama_swap_lora_from_cache_internal(ctx, path_lora_to_apply, path_base_model, n_threads, path_lora_to_remove); + } catch (const std::exception & err) { + fprintf(stderr, "%s: failed to remove cached lora adapter: %s\n", __func__, err.what()); + return 1; + } +} + + int llama_get_kv_cache_token_count(const struct llama_context * ctx) { return ctx->model.kv_self.n; } diff --git a/llama.h b/llama.h index d0d53d1a91cdc..9a27d59bd62dc 100644 --- a/llama.h +++ b/llama.h @@ -185,6 +185,9 @@ extern "C" { const char * path_lora, const char * path_base_model, int n_threads); + + LLAMA_API int llama_swap_lora_from_cache(struct llama_context * ctx, const char * path_lora_to_apply, const char * path_base_model, int n_threads, const char * path_lora_to_remove); + // Returns the number of tokens in the KV cache LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); From 9ab1248c7d24fc4013a6fb3474757a789ef08a97 Mon Sep 17 00:00:00 2001 From: TDM Date: Tue, 20 Jun 2023 20:25:37 +0530 Subject: [PATCH 5/5] Improve lora swap times by 300ms by removing neg ops, plus code cleanup --- examples/addon.node/addon.cpp | 5 +- llama.cpp | 139 +++++++++------------------------- llama.h | 2 +- 3 files changed, 37 insertions(+), 109 deletions(-) diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp index 3e8580e4fc031..1e4faf25473c3 100644 --- a/examples/addon.node/addon.cpp +++ b/examples/addon.node/addon.cpp @@ -61,11 +61,8 @@ Napi::Number swapLora(const Napi::CallbackInfo &info) fprintf(stderr, "Acquiring lock\n"); worker_mutex.lock(); - // fprintf(stderr, "Removing lora from Path: %s\n", lora.c_str()); - // llama_remove_lora_from_cache(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); - fprintf(stderr, "Swapping lora from Path: %s\n", lora.c_str()); - llama_swap_lora_from_cache(g_ctx, lora.c_str(), NULL, get_num_physical_cores(), lora.c_str()); + llama_swap_lora_from_cache(g_ctx, lora.c_str(), get_num_physical_cores(), lora.c_str()); worker_mutex.unlock(); return Napi::Number::New(info.Env(), 0); diff --git a/llama.cpp b/llama.cpp index 08c081e27ce6e..9cbdc1d7052e8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -934,7 +934,9 @@ void print_ggml_float_tensor(const struct ggml_tensor * tensor, std::string base for (int i = 0; i < num_elements_to_print; ++i) { fprintf(stderr, "%d : %f\n", i, ggml_fp16_to_fp32(((ggml_fp16_t*)data)[i])); } - } else { //Quantized data + } else { + + //FIX ME: this doesn't work correctly // dequantised data dequantized_data = (float*)malloc(num_elements_to_print * sizeof(float)); @@ -2843,13 +2845,9 @@ int llama_model_quantize( } } -int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * path_lora_to_apply, const char * path_base_model, int n_threads, const char * path_lora_to_remove) { - int64_t t_lora_cache_us = ggml_time_us(); - +int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * path_lora_to_apply, int n_threads, const 
char * path_lora_to_remove) { auto & model = ctx->model; - //std::unordered_map> lora_tensors = cached_tensors.size() == 0 ? ctx->adapter_weights : cached_tensors; - if (ctx->lora_cache.find(path_lora_to_apply) == ctx->lora_cache.end()) { fprintf(stderr, "%s: error: cached lora '%s' not found\n", __func__, path_lora_to_apply); return 1; @@ -2890,9 +2888,7 @@ int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * bool warned = false; int n_tensors = 0; - //TODO: Fix this to process loraA and loraB seperately for (auto it = model_tensors.begin(); it != model_tensors.end(); ++it) { - t_lora_cache_us = ggml_time_us(); const std::string& base_name = it->first; ggml_tensor * dest_t = it->second; @@ -2901,7 +2897,9 @@ int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * ggml_tensor* BA_remove; - bool lora_found = false; + bool lora_to_apply_found = false; + bool lora_to_remove_found = false; + // check if we have both A and B tensors and apply if (cached_lora_adapter_apply->loraA_weights.find(base_name) != cached_lora_adapter_apply->loraA_weights.end() || cached_lora_adapter_apply->loraB_weights.find(base_name) != cached_lora_adapter_apply->loraB_weights.end()) { @@ -2935,15 +2933,12 @@ int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * // BA*s BA_apply = ggml_mul_mat(lora_ctx, loraA, loraB); - //print time taken till now - // fprintf(stderr, "time taken till matmul %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - if (scaling != 1.0f) { ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); BA_apply = ggml_scale_inplace(lora_ctx, BA_apply, scale_tensor); } - lora_found = true; + lora_to_apply_found = true; } // check if we have both A and B tensors and remove @@ -2979,43 +2974,24 @@ int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * // BA*s BA_remove = ggml_mul_mat(lora_ctx, loraA, loraB); - //print time taken till now - // fprintf(stderr, "time taken till matmul %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - scaling = -1.0f * scaling; if (scaling != 1.0f) { ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); BA_remove = ggml_scale_inplace(lora_ctx, BA_remove, scale_tensor); } - // BA_remove = ggml_neg(lora_ctx, BA_remove); // woould like to use inplace implementation but unfortunately its not exposed in ggml.h - lora_found = true; + lora_to_remove_found = true; } - if (!lora_found) { - // fprintf(stderr, "%s: error: No lora tensors found for layer '%s'\n", __func__, base_name.c_str()); + if (!lora_to_apply_found && !lora_to_remove_found) { continue; } - //print time taken till now - // fprintf(stderr, "time taken till scaling %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - - // w = w - BAs to unload the model - // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model - // obvious improvement would be just caching the weights on load and using them on unload - - //print time taken till now - // fprintf(stderr, "time taken till neg %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - - - - //print time taken till now - // fprintf(stderr, "time taken till add inplace %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); ggml_tensor* BA; - if (BA_apply && BA_remove) { + if (lora_to_apply_found && lora_to_remove_found) { BA = ggml_add(lora_ctx, 
BA_apply, BA_remove); - } else if (BA_apply) { + } else if (lora_to_apply_found) { BA = BA_apply; } else { BA = BA_remove; @@ -3025,15 +3001,9 @@ int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * struct ggml_cgraph gf = ggml_build_forward(r); - //print time taken till now - // fprintf(stderr, "time taken till graph build forward %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); - //print time taken till now - // fprintf(stderr, "time taken till graph compute %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); lora_ctx = ggml_init(params); @@ -3042,14 +3012,6 @@ int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * if (n_tensors % 4 == 0) { fprintf(stderr, "."); } - - if (base_name == "layers.0.attention.wk.weight") { - // print_tensor(r); - print_ggml_float_tensor(dest_t, "TEST", "TEST", 10); - } - - //print time taken till now - // fprintf(stderr, "time taken to process cached layer %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); } // TODO: this should be in a destructor, it will leak on failure @@ -3061,7 +3023,6 @@ int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * ctx->loaded_loras[path_lora_to_apply] = true; - //TODO: Decide if I want to clear up the lora cache or not const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); @@ -3069,11 +3030,9 @@ int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * return 0; } -// Set `remove_existing` to true if you want to remove the adapter from the model. -int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads, const bool remove_existing) { - int64_t t_lora_cache_us = ggml_time_us(); - - if (remove_existing) { +// Set `is_delete` to true if you want to remove the adapter from the model. +int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads, const bool is_delete) { + if (is_delete) { fprintf(stderr, "%s: deactivating lora adapter from cache - please wait ...\n", __func__); } else { fprintf(stderr, "%s: applying lora adapter from cache - please wait ...\n", __func__); @@ -3081,8 +3040,6 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char auto & model = ctx->model; - //std::unordered_map> lora_tensors = cached_tensors.size() == 0 ? 
ctx->adapter_weights : cached_tensors; - if (ctx->lora_cache.find(path_lora) == ctx->lora_cache.end()) { fprintf(stderr, "%s: error: cached lora '%s' not found\n", __func__, path_lora); return 1; @@ -3108,9 +3065,6 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char model_tensors.insert(kv); } - //print time taken till now - //fprintf(stderr, "%s: time taken till copying base model weights = %8.2f ms\n", __func__, (ggml_time_us() - t_lora_cache_us)/1000.0); - // load base model std::unique_ptr model_loader; ggml_context * base_ctx = NULL; @@ -3147,7 +3101,6 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char //TODO: Fix this to process loraA and loraB seperately for (auto it = cached_lora_adapter->lora_metadata_map.begin(); it != cached_lora_adapter->lora_metadata_map.end(); ++it) { - t_lora_cache_us = ggml_time_us(); const std::string& base_name = it->first; const lora_metadata* metadata = it->second; std::vector loraA_vec = cached_lora_adapter->loraA_weights[base_name]; @@ -3197,26 +3150,17 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char // BA*s ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); - //print time taken till now - // fprintf(stderr, "time taken till matmul %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - - if (scaling != 1.0f) { - ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); - BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); - } - - //print time taken till now - // fprintf(stderr, "time taken till scaling %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - // w = w - BAs to unload the model // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model // obvious improvement would be just caching the weights on load and using them on unload - if (remove_existing) { - BA = ggml_neg(lora_ctx, BA); // woould like to use inplace implementation but unfortunately its not exposed in ggml.h + if (is_delete) { + scaling = -1.0f * scaling; } - //print time taken till now - // fprintf(stderr, "time taken till neg %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + if (scaling != 1.0f) { + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); + } ggml_tensor * r; if (base_t == dest_t) { @@ -3227,20 +3171,11 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char r = ggml_cpy(lora_ctx, r, dest_t); } - //print time taken till now - // fprintf(stderr, "time taken till add inplace %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - struct ggml_cgraph gf = ggml_build_forward(r); - //print time taken till now - // fprintf(stderr, "time taken till graph build forward %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); - //print time taken till now - // fprintf(stderr, "time taken till graph compute %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); lora_ctx = ggml_init(params); @@ -3249,9 +3184,6 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char if (n_tensors % 4 == 0) { fprintf(stderr, "."); } - - //print time taken till now - // fprintf(stderr, "time 
taken to process cached layer %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); } // TODO: this should be in a destructor, it will leak on failure @@ -3260,13 +3192,12 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char ggml_free(base_ctx); } - if (remove_existing) { + if (is_delete) { ctx->loaded_loras.erase(path_lora); } else { ctx->loaded_loras[path_lora] = true; } - //TODO: Decide if I want to clear up the lora cache or not const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); @@ -3275,9 +3206,9 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char } -// Set `remove_existing` to true if you want to remove the adapter from the model. -int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads, const bool remove_existing) { - if (remove_existing) { +// Set `is_delete` to true if you want to remove the adapter from the model. +int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads, const bool is_delete) { + if (is_delete) { fprintf(stderr, "%s: deactivating lora adapter - please wait ...\n", __func__); } else { fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); @@ -3489,18 +3420,18 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * // BA*s ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); - if (scaling != 1.0f) { - ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); - BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); - } - // w = w - BAs to unload the model // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model // obvious improvement would be just caching the weights on load and using them on unload - if (remove_existing) { + if (is_delete) { BA = ggml_neg(lora_ctx, BA); // woould like to use inplace implementation but unfortunately its not exposed in ggml.h } + if (scaling != 1.0f) { + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); + } + ggml_tensor * r; if (base_t == dest_t) { r = ggml_add_inplace(lora_ctx, dest_t, BA); @@ -3514,7 +3445,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); - if (!remove_existing) { + if (!is_delete) { // The copying poses a runtime cost though, might need to find a faster way to do this, some kind of cache warmup at the time of process restart float* loraA_data = (float *) loraA->data; // can do this cause lora is guaranteed to be F32 for now @@ -3551,7 +3482,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * ggml_free(base_ctx); } - if (remove_existing) { + if (is_delete) { ctx->loaded_loras.erase(path_lora); } else { ctx->loaded_loras[path_lora] = true; @@ -3601,9 +3532,9 @@ int llama_remove_lora_from_cache(struct llama_context * ctx, const char * path_l } } -int llama_swap_lora_from_cache(struct llama_context * ctx, const char * path_lora_to_apply, const char * path_base_model, int n_threads, const char * path_lora_to_remove) { +int llama_swap_lora_from_cache(struct llama_context * ctx, const char * path_lora_to_apply, int n_threads, const char * path_lora_to_remove) { try { - return 
llama_swap_lora_from_cache_internal(ctx, path_lora_to_apply, path_base_model, n_threads, path_lora_to_remove);
+        return llama_swap_lora_from_cache_internal(ctx, path_lora_to_apply, n_threads, path_lora_to_remove);
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to remove cached lora adapter: %s\n", __func__, err.what());
         return 1;
     }
 }
diff --git a/llama.h b/llama.h
index 9a27d59bd62dc..7b3c2b0b27152 100644
--- a/llama.h
+++ b/llama.h
@@ -186,7 +186,7 @@ extern "C" {
             const char * path_base_model,
             int n_threads);
 
-    LLAMA_API int llama_swap_lora_from_cache(struct llama_context * ctx, const char * path_lora_to_apply, const char * path_base_model, int n_threads, const char * path_lora_to_remove);
+    LLAMA_API int llama_swap_lora_from_cache(struct llama_context * ctx, const char * path_lora_to_apply, int n_threads, const char * path_lora_to_remove);
 
     // Returns the number of tokens in the KV cache
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
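
Usage note: after this series there are three ways to change the active adapter. llama_apply_lora_from_file and llama_remove_lora_from_file recompute everything from the adapter file; llama_apply_lora_from_cache and llama_remove_lora_from_cache reuse the A/B weights that an earlier from-file apply stored in the context; and llama_swap_lora_from_cache removes one cached adapter and applies another with a single graph per layer. The sketch below is a minimal caller-side helper showing how these calls could fit together. It is not part of the patch: the lora_switcher struct, the switch_to name, the std::set bookkeeping of which paths have been cached, and the nullptr base-model argument are assumptions made for illustration, since the patch keeps the cache private to llama_context and the caller has to remember which adapters it has already loaded from file.

// Hypothetical caller-side helper (not part of this patch series): tracks which
// adapter paths have already been cached by llama_apply_lora_from_file() and
// picks the cheapest routine when switching to a new adapter.
#include <set>
#include <string>

#include "llama.h"

struct lora_switcher {
    llama_context *       ctx       = nullptr;
    int                   n_threads = 1;
    std::string           active;   // path of the adapter currently applied, empty if none
    std::set<std::string> cached;   // paths whose A/B weights already sit in the context cache

    // switch the model to the adapter at `path`; returns 0 on success
    int switch_to(const std::string & path) {
        if (path == active) {
            return 0; // already applied, nothing to do
        }
        if (!active.empty() && cached.count(active) && cached.count(path)) {
            // both adapters are cached: fused remove + apply, one graph per layer
            int rc = llama_swap_lora_from_cache(ctx, path.c_str(), n_threads, active.c_str());
            if (rc == 0) { active = path; }
            return rc;
        }
        if (!active.empty()) {
            // otherwise remove the current adapter first
            int rc = cached.count(active)
                ? llama_remove_lora_from_cache(ctx, active.c_str(), /*path_base_model*/ nullptr, n_threads)
                : llama_remove_lora_from_file (ctx, active.c_str(), /*path_base_model*/ nullptr, n_threads);
            if (rc != 0) { return rc; }
            active.clear();
        }
        // apply the new adapter; the from-file path also fills the context cache
        int rc = cached.count(path)
            ? llama_apply_lora_from_cache(ctx, path.c_str(), /*path_base_model*/ nullptr, n_threads)
            : llama_apply_lora_from_file (ctx, path.c_str(), /*path_base_model*/ nullptr, n_threads);
        if (rc == 0) {
            active = path;
            cached.insert(path); // A/B weights and scaling are now stored for future swaps
        }
        return rc;
    }
};

With this arrangement the file parsing and the full apply pass only happen the first time an adapter is seen; afterwards, switching between two known adapters goes through llama_swap_lora_from_cache, which is the path the last patch speeds up by folding the sign flip for the adapter being removed into its scaling factor rather than running separate ggml_neg ops.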