From 591b1c17b57545e2ac342d3761727fb7247a57bd Mon Sep 17 00:00:00 2001 From: TDM Date: Sat, 17 Jun 2023 15:12:27 +0530 Subject: [PATCH 1/5] WIP: Add ability to remove loras and save them in the context cache --- examples/addon.node/addon.cpp | 5 +- llama.cpp | 274 +++++++++++++++++++++++++++++++++- llama.h | 21 +++ 3 files changed, 295 insertions(+), 5 deletions(-) diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp index cae06cc9e29e0..00e19d677d16a 100644 --- a/examples/addon.node/addon.cpp +++ b/examples/addon.node/addon.cpp @@ -61,7 +61,10 @@ Napi::Number swapLora(const Napi::CallbackInfo &info) fprintf(stderr, "Acquiring lock\n"); worker_mutex.lock(); - fprintf(stderr, "Swapping lora from Path: %s\n", lora.c_str()); + fprintf(stderr, "Removing lora from Path: %s\n", lora.c_str()); + llama_remove_lora_from_file(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); + + fprintf(stderr, "Applying lora from Path: %s\n", lora.c_str()); llama_apply_lora_from_file(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); worker_mutex.unlock(); diff --git a/llama.cpp b/llama.cpp index 4a7d01b3297b2..71ae271f117dd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -52,6 +52,9 @@ #define LLAMA_USE_SCRATCH #define LLAMA_MAX_SCRATCH_BUFFERS 16 +#include +#include + // available llama models enum e_model { MODEL_UNKNOWN, @@ -278,6 +281,13 @@ struct llama_context { llama_ctx_buffer buf_compute; llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + // cache lora adapter weights, specifically s*BA matmul + // key -> base model layer name, value -> s*BA + // directly caching the matmul to avoid calculations at runtime + // for now only caches the last loaded adapter weights and assumes they'll be overwritten by next call + // TODO: Free these weights on deconstructor + std::unordered_map> adapter_weights; + #ifdef GGML_USE_METAL ggml_metal_context * ctx_metal = NULL; #endif @@ -873,6 +883,54 @@ struct llama_model_loader { }; +const double epsilon = 1e-4; + +bool isNotZeroFloat(float val) { + return std::fabs(val) > epsilon; +} + +void print_ggml_float_tensor(const struct ggml_tensor * tensor, std::string base_name, std::string metadata, int max_elements_to_print) { + fprintf(stderr, "Layer Name: %s, Metadata: %s\n", base_name.c_str(), metadata.c_str()); + + fprintf(stderr, "Tensor Type %d\n", tensor->type); + int64_t num_elements = ggml_nelements(tensor); + + fprintf(stderr, "Data Length Num Elements %ld\n", num_elements); + fprintf(stderr, "Data Length Bytes %ld\n", ggml_nbytes(tensor)); + + void* data; + float* dequantized_data; + dequantize_row_q_t dequantize_row_fn; + + int64_t num_elements_to_print = max_elements_to_print > 0 ? 
max_elements_to_print : num_elements; + + if (tensor->type == 0) { // F32 + data = reinterpret_cast(tensor->data); + for (int i = 0; i < num_elements_to_print; ++i) { + fprintf(stderr, "%d : %f\n", i, ((float*)data)[i]); + } + } else if (tensor->type == 1) { // F16 + data = reinterpret_cast(tensor->data); + for (int i = 0; i < num_elements_to_print; ++i) { + fprintf(stderr, "%d : %f\n", i, ggml_fp16_to_fp32(((ggml_fp16_t*)data)[i])); + } + } else { //Quantized data + // dequantised data + dequantized_data = (float*)malloc(num_elements_to_print * sizeof(float)); + + dequantize_row_fn = ggml_internal_get_quantize_fn(tensor->type).dequantize_row_q; + if (dequantize_row_fn == nullptr) { + fprintf(stderr, "Quantization type %d not supported for print tensors\n", tensor->type); + return; + } + dequantize_row_fn(tensor->data, dequantized_data, num_elements_to_print); + for (int i = 0; i < num_elements_to_print; ++i) { + fprintf(stderr, "%d : %f\n", i, dequantized_data[i]); + } + } + + fprintf(stderr, "\n"); +} // // kv cache @@ -2765,8 +2823,168 @@ int llama_model_quantize( } } -int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { - fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); +// Set `deactivate_adapter` to true if you want to remove the adapter from the model. +int llama_apply_lora_from_cache_internal(struct llama_context * ctx, std::unordered_map& cached_tensors, const char * path_base_model, int n_threads, const bool deactivate_adapter) { + if (deactivate_adapter) { + fprintf(stderr, "%s: deactivating lora adapter - please wait ...\n", __func__); + } else { + fprintf(stderr, "%s: applying lora adapter - please wait ...\n", __func__); + } + + auto & model = ctx->model; + + //std::unordered_map> lora_tensors = cached_tensors.size() == 0 ? 
ctx->adapter_weights : cached_tensors; + + std::unordered_map> lora_tensors = ctx->adapter_weights; + + if (lora_tensors.size() == 0) { + fprintf(stderr, "%s: no tensors to apply\n", __func__); + return 0; + } + + const int64_t t_start_lora_us = ggml_time_us(); + + // create a temporary ggml context to store the lora tensors + // todo: calculate size from biggest possible tensor + std::vector lora_buf(1024ull * 1024ull * 1024ull); + struct ggml_init_params params; + params.mem_size = lora_buf.size(); + params.mem_buffer = lora_buf.data(); + params.no_alloc = false; + + ggml_context * lora_ctx = ggml_init(params); + + // create a name -> tensor map of the model to accelerate lookups + std::unordered_map model_tensors; + for (auto & kv: model.tensors_by_name) { + model_tensors.insert(kv); + } + + + // load base model + std::unique_ptr model_loader; + ggml_context * base_ctx = NULL; + llama_buffer base_buf; + if (path_base_model) { + fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model); + model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false)); + + size_t ctx_size; + size_t mmapped_size; + model_loader->calc_sizes(&ctx_size, &mmapped_size); + base_buf.resize(ctx_size); + + ggml_init_params base_params; + base_params.mem_size = base_buf.size; + base_params.mem_buffer = base_buf.addr; + base_params.no_alloc = model_loader->use_mmap; + + base_ctx = ggml_init(base_params); + + model_loader->ggml_ctx = base_ctx; + + // maybe this should in llama_model_loader + if (model_loader->use_mmap) { + model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0)); + } + } + + // read tensors and apply + bool warned = false; + int n_tensors = 0; + + for (auto it = lora_tensors.begin(); it != lora_tensors.end(); ++it) { + const std::string& base_name = it->first; + struct std::vector BA_vector = it->second; + ggml_tensor* BA = ggml_new_tensor_2d(lora_ctx, GGML_TYPE_F32, BA_vector.size() / 4096, 4096); + BA->data = (float*) BA_vector.data(); + + // check if we have both A and B tensors and apply + if (model_tensors.find(base_name) == model_tensors.end()) { + fprintf(stderr, "%s: error: tensor '%s' not found in model\n", __func__, base_name.c_str()); + continue; + } + + ggml_tensor * dest_t = model_tensors[base_name]; + ggml_tensor * base_t; + if (model_loader) { + // load from base model + if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) { + fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); + return 1; + } + size_t idx = model_loader->tensors_map.name_to_idx[base_name]; + llama_load_tensor & lt = model_loader->tensors_map.tensors[idx]; + base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); + lt.data = (uint8_t *) lt.ggml_tensor->data; + model_loader->load_data_for(lt); + lt.ggml_tensor->data = lt.data; + } + else { + base_t = dest_t; + } + + if (ggml_is_quantized(base_t->type)) { + if (!warned) { + fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; + } + } + + // w = w - BAs to unload the model + // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model + // obvious improvement would be just caching the weights on load and using them on 
unload + if (deactivate_adapter) { + BA = ggml_neg(lora_ctx, BA); // woould like to use inplace implementation but unfortunately its not exposed in ggml.h + } + + ggml_tensor * r; + if (base_t == dest_t) { + r = ggml_add_inplace(lora_ctx, dest_t, BA); + } + else { + r = ggml_add(lora_ctx, base_t, BA); + r = ggml_cpy(lora_ctx, r, dest_t); + } + + struct ggml_cgraph gf = ggml_build_forward(r); + gf.n_threads = n_threads; + ggml_graph_compute(lora_ctx, &gf); + + + // we won't need these tensors again, reset the context to save memory + ggml_free(lora_ctx); + lora_ctx = ggml_init(params); + + n_tensors++; + if (n_tensors % 4 == 0) { + fprintf(stderr, "."); + } + } + + // TODO: this should be in a destructor, it will leak on failure + ggml_free(lora_ctx); + if (base_ctx) { + ggml_free(base_ctx); + } + + //TODO: Decide if I want to clear up the lora cache or not + + const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; + fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); + + return 0; +} + + +// Set `deactivate_adapter` to true if you want to remove the adapter from the model. +int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads, const bool deactivate_adapter) { + if (deactivate_adapter) { + fprintf(stderr, "%s: deactivating lora adapter - please wait ...\n", __func__); + } else { + fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); + } auto & model = ctx->model; @@ -2853,6 +3071,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * // read tensors and apply bool warned = false; int n_tensors = 0; + while (true) { int32_t n_dims; int32_t length; @@ -2965,7 +3184,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * return 1; } - // w = w + BA*s + // BA*s ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); if (scaling != 1.0f) { @@ -2973,6 +3192,13 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); } + // w = w - BAs to unload the model + // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model + // obvious improvement would be just caching the weights on load and using them on unload + if (deactivate_adapter) { + BA = ggml_neg(lora_ctx, BA); // woould like to use inplace implementation but unfortunately its not exposed in ggml.h + } + ggml_tensor * r; if (base_t == dest_t) { r = ggml_add_inplace(lora_ctx, dest_t, BA); @@ -2986,6 +3212,19 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); + if (!deactivate_adapter) { + const int64_t t_copy_lora_us = ggml_time_us(); + + // The copying poses a runtime cost though, might need to find a faster way to do this, some kind of cache warmup at the time of process restart + float* BA_data = (float *) BA->data; // can do this cause BA is guaranteed to be F32 for now + + std::vector BA_data_copy(BA_data, BA_data + ggml_nelements(BA)); + ctx->adapter_weights[base_name] = BA_data_copy; + + const int64_t t_copy_lora_us_end = ggml_time_us(); + + fprintf(stderr, "%s: copied lora tensor '%s' in %.2f ms\n", __func__, base_name.c_str(), (t_copy_lora_us_end - t_copy_lora_us) / 1000.0); + } // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); lora_ctx = ggml_init(params); @@ 
-3012,7 +3251,34 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { try { - return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads); + return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads, false); + } catch (const std::exception & err) { + fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); + return 1; + } +} + +int llama_remove_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { + try { + return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads, true); // deactivate adapter by setting deactivate_adapter to true + } catch (const std::exception & err) { + fprintf(stderr, "%s: failed to remove lora adapter: %s\n", __func__, err.what()); + return 1; + } +} + +int llama_apply_lora_from_cache(struct llama_context * ctx, std::unordered_map lora_cache, const char * path_base_model, int n_threads) { + try { + return llama_apply_lora_from_cache_internal(ctx, lora_cache, path_base_model, n_threads, false); + } catch (const std::exception & err) { + fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); + return 1; + } +} + +int llama_remove_lora_from_cache(struct llama_context * ctx, std::unordered_map lora_cache, const char * path_base_model, int n_threads) { + try { + return llama_apply_lora_from_cache_internal(ctx, lora_cache, path_base_model, n_threads, true); } catch (const std::exception & err) { fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); return 1; diff --git a/llama.h b/llama.h index 1241ba6c0ec44..e38cd0c732814 100644 --- a/llama.h +++ b/llama.h @@ -11,6 +11,9 @@ #include #include #include +#include +#include +#include #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) @@ -164,6 +167,24 @@ extern "C" { const char * path_lora, const char * path_base_model, int n_threads); + + LLAMA_API int llama_remove_lora_from_file( + struct llama_context * ctx, + const char * path_lora, + const char * path_base_model, + int n_threads); + + LLAMA_API int llama_apply_lora_from_cache( + struct llama_context * ctx, + std::unordered_map< std::string, ggml_tensor* > lora_cache, + const char * path_base_model, + int n_threads); + + LLAMA_API int llama_remove_lora_from_cache( + struct llama_context * ctx, + std::unordered_map< std::string, ggml_tensor* > lora_cache, + const char * path_base_model, + int n_threads); // Returns the number of tokens in the KV cache LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); From 9f7ec8f238ba1a0581f5b3f3a456bb0d75e23f11 Mon Sep 17 00:00:00 2001 From: TDM Date: Sat, 17 Jun 2023 17:57:44 +0530 Subject: [PATCH 2/5] Cache individual adapters instead of matmul result --- llama.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/llama.cpp b/llama.cpp index 71ae271f117dd..0eb39abbd9992 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2893,6 +2893,7 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, std::unorde bool warned = false; int n_tensors = 0; + //TODO: Fix this to process loraA and loraB seperately for (auto it = lora_tensors.begin(); it != lora_tensors.end(); ++it) { const std::string& base_name = it->first; struct std::vector BA_vector = it->second; @@ -3212,19 +3213,19 @@ int 
llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); + //TODO: Should use a lora_cache struct where we can also store matrix dimensions & other metadata that are required for calculations once load if (!deactivate_adapter) { - const int64_t t_copy_lora_us = ggml_time_us(); - // The copying poses a runtime cost though, might need to find a faster way to do this, some kind of cache warmup at the time of process restart - float* BA_data = (float *) BA->data; // can do this cause BA is guaranteed to be F32 for now + float* loraA_data = (float *) loraA->data; // can do this cause lora is guaranteed to be F32 for now - std::vector BA_data_copy(BA_data, BA_data + ggml_nelements(BA)); - ctx->adapter_weights[base_name] = BA_data_copy; - - const int64_t t_copy_lora_us_end = ggml_time_us(); + std::vector loraA_data_copy(loraA_data, loraA_data + ggml_nelements(loraA)); + ctx->adapter_weights[base_name + ".loraA"] = loraA_data_copy; - fprintf(stderr, "%s: copied lora tensor '%s' in %.2f ms\n", __func__, base_name.c_str(), (t_copy_lora_us_end - t_copy_lora_us) / 1000.0); + float* loraB_data = (float *) loraB->data; // can do this cause lora is guaranteed to be F32 for now + std::vector loraB_data_copy(loraB_data, loraB_data + ggml_nelements(loraB)); + ctx->adapter_weights[base_name + ".loraB"] = loraB_data_copy; } + // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); lora_ctx = ggml_init(params); From edf7d285b550881c171438efb87fecb18b4348e4 Mon Sep 17 00:00:00 2001 From: TDM Date: Sat, 17 Jun 2023 20:49:24 +0530 Subject: [PATCH 3/5] Cache metadata of individual adapaters and change the load method to use cache --- examples/addon.node/addon.cpp | 4 +- llama.cpp | 114 ++++++++++++++++++++++++++-------- llama.h | 4 +- 3 files changed, 92 insertions(+), 30 deletions(-) diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp index 00e19d677d16a..ac3f115f58e4b 100644 --- a/examples/addon.node/addon.cpp +++ b/examples/addon.node/addon.cpp @@ -62,10 +62,10 @@ Napi::Number swapLora(const Napi::CallbackInfo &info) worker_mutex.lock(); fprintf(stderr, "Removing lora from Path: %s\n", lora.c_str()); - llama_remove_lora_from_file(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); + llama_remove_lora_from_cache(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); fprintf(stderr, "Applying lora from Path: %s\n", lora.c_str()); - llama_apply_lora_from_file(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); + llama_apply_lora_from_cache(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); worker_mutex.unlock(); return Napi::Number::New(info.Env(), 0); diff --git a/llama.cpp b/llama.cpp index 0eb39abbd9992..c3e2aad895148 100644 --- a/llama.cpp +++ b/llama.cpp @@ -249,6 +249,26 @@ struct llama_vocab { std::vector id_to_token; }; +struct lora_metadata { + std::string name; + int64_t ne[2]; + ggml_type type; +}; + +// TODO: move outside llama to addon.cpp for easy merge in the future +// contains layer name to weight mapping +struct lora_adapter_weights_map { + int32_t lora_alpha; + int32_t lora_r; + float scaling; + + std::unordered_map lora_metadata_map; + std::unordered_map > loraA_weights; + std::unordered_map > loraB_weights; + // strings are being kept thrice, mem usage can be reduced further using a single map +}; + + struct llama_context { std::mt19937 rng; @@ -282,11 +302,10 @@ struct llama_context { llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; // cache 
lora adapter weights, specifically s*BA matmul - // key -> base model layer name, value -> s*BA - // directly caching the matmul to avoid calculations at runtime - // for now only caches the last loaded adapter weights and assumes they'll be overwritten by next call + // key -> lora model path // TODO: Free these weights on deconstructor - std::unordered_map> adapter_weights; + std::unordered_map lora_cache; + #ifdef GGML_USE_METAL ggml_metal_context * ctx_metal = NULL; @@ -2824,23 +2843,23 @@ int llama_model_quantize( } // Set `deactivate_adapter` to true if you want to remove the adapter from the model. -int llama_apply_lora_from_cache_internal(struct llama_context * ctx, std::unordered_map& cached_tensors, const char * path_base_model, int n_threads, const bool deactivate_adapter) { +int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char * lora_path, const char * path_base_model, int n_threads, const bool deactivate_adapter) { if (deactivate_adapter) { - fprintf(stderr, "%s: deactivating lora adapter - please wait ...\n", __func__); + fprintf(stderr, "%s: deactivating lora adapter from cache - please wait ...\n", __func__); } else { - fprintf(stderr, "%s: applying lora adapter - please wait ...\n", __func__); + fprintf(stderr, "%s: applying lora adapter from cache - please wait ...\n", __func__); } auto & model = ctx->model; //std::unordered_map> lora_tensors = cached_tensors.size() == 0 ? ctx->adapter_weights : cached_tensors; - std::unordered_map> lora_tensors = ctx->adapter_weights; - - if (lora_tensors.size() == 0) { - fprintf(stderr, "%s: no tensors to apply\n", __func__); - return 0; + if (ctx->lora_cache.find(lora_path) == ctx->lora_cache.end()) { + fprintf(stderr, "%s: error: cached lora '%s' not found\n", __func__, lora_path); + return 1; } + + lora_adapter_weights_map* cached_lora_adapter = ctx->lora_cache[lora_path]; const int64_t t_start_lora_us = ggml_time_us(); @@ -2893,12 +2912,22 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, std::unorde bool warned = false; int n_tensors = 0; + float scaling = cached_lora_adapter->scaling; + //TODO: Fix this to process loraA and loraB seperately - for (auto it = lora_tensors.begin(); it != lora_tensors.end(); ++it) { + for (auto it = cached_lora_adapter->lora_metadata_map.begin(); it != cached_lora_adapter->lora_metadata_map.end(); ++it) { const std::string& base_name = it->first; - struct std::vector BA_vector = it->second; - ggml_tensor* BA = ggml_new_tensor_2d(lora_ctx, GGML_TYPE_F32, BA_vector.size() / 4096, 4096); - BA->data = (float*) BA_vector.data(); + const lora_metadata* metadata = it->second; + std::vector loraA_vec = cached_lora_adapter->loraA_weights[base_name]; + std::vector loraB_vec = cached_lora_adapter->loraB_weights[base_name]; + int ne0 = metadata->ne[0]; + int ne1 = metadata->ne[1]; + + ggml_tensor* loraA = ggml_new_tensor_2d(lora_ctx, GGML_TYPE_F32, ne0, ne1); // for now it's fine since lora calculations are always in F32 + ggml_tensor* loraB = ggml_new_tensor_2d(lora_ctx, GGML_TYPE_F32, ne0, ne1); + + loraA->data = (float*) loraA_vec.data(); + loraB->data = (float*) loraB_vec.data(); // check if we have both A and B tensors and apply if (model_tensors.find(base_name) == model_tensors.end()) { @@ -2933,6 +2962,15 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, std::unorde } } + // BA*s + ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); + + if (scaling != 1.0f) { + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + BA = 
ggml_scale_inplace(lora_ctx, BA, scale_tensor); + } + + // w = w - BAs to unload the model // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model // obvious improvement would be just caching the weights on load and using them on unload @@ -2953,6 +2991,14 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, std::unorde gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); + if (base_name == "layers.0.attention.wk.weight") { + std::string print_metadata = "Applying lora from cache"; + if (deactivate_adapter) { + print_metadata = "Deactivating lora from cache"; + } + print_ggml_float_tensor(dest_t, "layers.0.attention.wk.weight", print_metadata, 10); + + } // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); @@ -3073,6 +3119,11 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * bool warned = false; int n_tensors = 0; + lora_adapter_weights_map* lora_adapter_weights = new lora_adapter_weights_map(); + lora_adapter_weights->lora_alpha = lora_alpha; + lora_adapter_weights->lora_r = lora_r; + lora_adapter_weights->scaling = scaling; + while (true) { int32_t n_dims; int32_t length; @@ -3213,17 +3264,23 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); - //TODO: Should use a lora_cache struct where we can also store matrix dimensions & other metadata that are required for calculations once load if (!deactivate_adapter) { // The copying poses a runtime cost though, might need to find a faster way to do this, some kind of cache warmup at the time of process restart - float* loraA_data = (float *) loraA->data; // can do this cause lora is guaranteed to be F32 for now + float* loraA_data = (float *) loraA->data; // can do this cause lora is guaranteed to be F32 for now std::vector loraA_data_copy(loraA_data, loraA_data + ggml_nelements(loraA)); - ctx->adapter_weights[base_name + ".loraA"] = loraA_data_copy; float* loraB_data = (float *) loraB->data; // can do this cause lora is guaranteed to be F32 for now std::vector loraB_data_copy(loraB_data, loraB_data + ggml_nelements(loraB)); - ctx->adapter_weights[base_name + ".loraB"] = loraB_data_copy; + + lora_metadata* lora_layer_metadata = new lora_metadata(); + lora_layer_metadata->ne[0] = loraA->ne[0]; + lora_layer_metadata->ne[1] = loraA->ne[1]; + lora_layer_metadata->type = loraA->type; + + lora_adapter_weights->loraA_weights[base_name] = loraA_data_copy; + lora_adapter_weights->loraB_weights[base_name] = loraB_data_copy; + lora_adapter_weights->lora_metadata_map[base_name] = lora_layer_metadata; } // we won't need these tensors again, reset the context to save memory @@ -3244,6 +3301,10 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * ggml_free(base_ctx); } + if (!deactivate_adapter) { + ctx->lora_cache[path_lora] = lora_adapter_weights; + } + const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); @@ -3268,20 +3329,21 @@ int llama_remove_lora_from_file(struct llama_context * ctx, const char * path_lo } } -int llama_apply_lora_from_cache(struct llama_context * ctx, std::unordered_map lora_cache, const char * path_base_model, int n_threads) { +int llama_apply_lora_from_cache(struct llama_context * ctx, const char * path_lora ,const char * path_base_model, int n_threads) { try { - return 
llama_apply_lora_from_cache_internal(ctx, lora_cache, path_base_model, n_threads, false); + return llama_apply_lora_from_cache_internal(ctx, path_lora, path_base_model, n_threads, false); } catch (const std::exception & err) { - fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); + fprintf(stderr, "%s: failed to apply cached lora adapter: %s\n", __func__, err.what()); return 1; } } -int llama_remove_lora_from_cache(struct llama_context * ctx, std::unordered_map lora_cache, const char * path_base_model, int n_threads) { + +int llama_remove_lora_from_cache(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { try { - return llama_apply_lora_from_cache_internal(ctx, lora_cache, path_base_model, n_threads, true); + return llama_apply_lora_from_cache_internal(ctx, path_lora, path_base_model, n_threads, true); } catch (const std::exception & err) { - fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); + fprintf(stderr, "%s: failed to remove cached lora adapter: %s\n", __func__, err.what()); return 1; } } diff --git a/llama.h b/llama.h index e38cd0c732814..d0d53d1a91cdc 100644 --- a/llama.h +++ b/llama.h @@ -176,13 +176,13 @@ extern "C" { LLAMA_API int llama_apply_lora_from_cache( struct llama_context * ctx, - std::unordered_map< std::string, ggml_tensor* > lora_cache, + const char * path_lora, const char * path_base_model, int n_threads); LLAMA_API int llama_remove_lora_from_cache( struct llama_context * ctx, - std::unordered_map< std::string, ggml_tensor* > lora_cache, + const char * path_lora, const char * path_base_model, int n_threads); From 1309f2504323e3f386a714d97adeba7a009bbff0 Mon Sep 17 00:00:00 2001 From: TDM Date: Mon, 19 Jun 2023 23:19:25 +0530 Subject: [PATCH 4/5] Add ability to swap lora in a single method by combining graph calculations --- examples/addon.node/addon.cpp | 8 +- llama.cpp | 307 +++++++++++++++++++++++++++++++--- llama.h | 3 + 3 files changed, 292 insertions(+), 26 deletions(-) diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp index ac3f115f58e4b..3e8580e4fc031 100644 --- a/examples/addon.node/addon.cpp +++ b/examples/addon.node/addon.cpp @@ -61,11 +61,11 @@ Napi::Number swapLora(const Napi::CallbackInfo &info) fprintf(stderr, "Acquiring lock\n"); worker_mutex.lock(); - fprintf(stderr, "Removing lora from Path: %s\n", lora.c_str()); - llama_remove_lora_from_cache(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); + // fprintf(stderr, "Removing lora from Path: %s\n", lora.c_str()); + // llama_remove_lora_from_cache(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); - fprintf(stderr, "Applying lora from Path: %s\n", lora.c_str()); - llama_apply_lora_from_cache(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); + fprintf(stderr, "Swapping lora from Path: %s\n", lora.c_str()); + llama_swap_lora_from_cache(g_ctx, lora.c_str(), NULL, get_num_physical_cores(), lora.c_str()); worker_mutex.unlock(); return Napi::Number::New(info.Env(), 0); diff --git a/llama.cpp b/llama.cpp index c3e2aad895148..08c081e27ce6e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -305,6 +305,7 @@ struct llama_context { // key -> lora model path // TODO: Free these weights on deconstructor std::unordered_map lora_cache; + std::unordered_map loaded_loras; #ifdef GGML_USE_METAL @@ -2842,9 +2843,237 @@ int llama_model_quantize( } } -// Set `deactivate_adapter` to true if you want to remove the adapter from the model. 
-int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char * lora_path, const char * path_base_model, int n_threads, const bool deactivate_adapter) { - if (deactivate_adapter) { +int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * path_lora_to_apply, const char * path_base_model, int n_threads, const char * path_lora_to_remove) { + int64_t t_lora_cache_us = ggml_time_us(); + + auto & model = ctx->model; + + //std::unordered_map> lora_tensors = cached_tensors.size() == 0 ? ctx->adapter_weights : cached_tensors; + + if (ctx->lora_cache.find(path_lora_to_apply) == ctx->lora_cache.end()) { + fprintf(stderr, "%s: error: cached lora '%s' not found\n", __func__, path_lora_to_apply); + return 1; + } + + lora_adapter_weights_map* cached_lora_adapter_apply = ctx->lora_cache[path_lora_to_apply]; + lora_adapter_weights_map* cached_lora_adapter_remove = ctx->lora_cache[path_lora_to_remove]; + + if (cached_lora_adapter_apply->loraA_weights.size() == 0) { + fprintf(stderr, "%s: error: cached lora '%s' is empty\n", __func__, path_lora_to_apply); + return 1; + } + + if (cached_lora_adapter_remove->loraA_weights.size() == 0) { + fprintf(stderr, "%s: error: cached lora '%s' is empty\n", __func__, path_lora_to_remove); + return 1; + } + + const int64_t t_start_lora_us = ggml_time_us(); + + // create a temporary ggml context to store the lora tensors + // todo: calculate size from biggest possible tensor + std::vector lora_buf(1024ull * 1024ull * 1024ull); + struct ggml_init_params params; + params.mem_size = lora_buf.size(); + params.mem_buffer = lora_buf.data(); + params.no_alloc = false; + + ggml_context * lora_ctx = ggml_init(params); + + // create a name -> tensor map of the model to accelerate lookups + std::unordered_map model_tensors; + for (auto & kv: model.tensors_by_name) { + model_tensors.insert(kv); + } + + // read tensors and apply + bool warned = false; + int n_tensors = 0; + + //TODO: Fix this to process loraA and loraB seperately + for (auto it = model_tensors.begin(); it != model_tensors.end(); ++it) { + t_lora_cache_us = ggml_time_us(); + const std::string& base_name = it->first; + ggml_tensor * dest_t = it->second; + + ggml_tensor * r; + ggml_tensor* BA_apply; + ggml_tensor* BA_remove; + + + bool lora_found = false; + // check if we have both A and B tensors and apply + if (cached_lora_adapter_apply->loraA_weights.find(base_name) != cached_lora_adapter_apply->loraA_weights.end() || + cached_lora_adapter_apply->loraB_weights.find(base_name) != cached_lora_adapter_apply->loraB_weights.end()) { + std::vector loraA_vec = cached_lora_adapter_apply->loraA_weights[base_name]; + std::vector loraB_vec = cached_lora_adapter_apply->loraB_weights[base_name]; + float scaling = cached_lora_adapter_apply->scaling; + + lora_metadata* metadata = cached_lora_adapter_apply->lora_metadata_map[base_name]; + + int ne0 = metadata->ne[0]; + int ne1 = metadata->ne[1]; + + ggml_tensor* loraA = ggml_new_tensor_2d(lora_ctx, GGML_TYPE_F32, ne0, ne1); // for now it's fine since lora calculations are always in F32 + ggml_tensor* loraB = ggml_new_tensor_2d(lora_ctx, GGML_TYPE_F32, ne0, ne1); + + loraA->data = (float*) loraA_vec.data(); + loraB->data = (float*) loraB_vec.data(); + + ggml_tensor * base_t; + + base_t = dest_t; + + if (ggml_is_quantized(base_t->type)) { + if (!warned) { + fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; + } + 
} + + // BA*s + BA_apply = ggml_mul_mat(lora_ctx, loraA, loraB); + + //print time taken till now + // fprintf(stderr, "time taken till matmul %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + + if (scaling != 1.0f) { + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + BA_apply = ggml_scale_inplace(lora_ctx, BA_apply, scale_tensor); + } + + lora_found = true; + } + + // check if we have both A and B tensors and remove + if (cached_lora_adapter_remove->loraA_weights.find(base_name) != cached_lora_adapter_remove->loraA_weights.end() || + cached_lora_adapter_remove->loraB_weights.find(base_name) != cached_lora_adapter_remove->loraB_weights.end()) { + std::vector loraA_vec = cached_lora_adapter_remove->loraA_weights[base_name]; + std::vector loraB_vec = cached_lora_adapter_remove->loraB_weights[base_name]; + float scaling = cached_lora_adapter_remove->scaling; + + lora_metadata* metadata = cached_lora_adapter_remove->lora_metadata_map[base_name]; + + int ne0 = metadata->ne[0]; + int ne1 = metadata->ne[1]; + + ggml_tensor* loraA = ggml_new_tensor_2d(lora_ctx, GGML_TYPE_F32, ne0, ne1); // for now it's fine since lora calculations are always in F32 + ggml_tensor* loraB = ggml_new_tensor_2d(lora_ctx, GGML_TYPE_F32, ne0, ne1); + + loraA->data = (float*) loraA_vec.data(); + loraB->data = (float*) loraB_vec.data(); + + ggml_tensor * base_t; + + base_t = dest_t; + + if (ggml_is_quantized(base_t->type)) { + if (!warned) { + fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; + } + } + + // BA*s + BA_remove = ggml_mul_mat(lora_ctx, loraA, loraB); + + //print time taken till now + // fprintf(stderr, "time taken till matmul %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + + scaling = -1.0f * scaling; + if (scaling != 1.0f) { + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + BA_remove = ggml_scale_inplace(lora_ctx, BA_remove, scale_tensor); + } + + // BA_remove = ggml_neg(lora_ctx, BA_remove); // woould like to use inplace implementation but unfortunately its not exposed in ggml.h + lora_found = true; + } + + + if (!lora_found) { + // fprintf(stderr, "%s: error: No lora tensors found for layer '%s'\n", __func__, base_name.c_str()); + continue; + } + + //print time taken till now + // fprintf(stderr, "time taken till scaling %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + + // w = w - BAs to unload the model + // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model + // obvious improvement would be just caching the weights on load and using them on unload + + //print time taken till now + // fprintf(stderr, "time taken till neg %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + + + + //print time taken till now + // fprintf(stderr, "time taken till add inplace %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + ggml_tensor* BA; + if (BA_apply && BA_remove) { + BA = ggml_add(lora_ctx, BA_apply, BA_remove); + } else if (BA_apply) { + BA = BA_apply; + } else { + BA = BA_remove; + } + + r = ggml_add(lora_ctx, dest_t, BA); + + struct ggml_cgraph gf = ggml_build_forward(r); + + //print time taken till now + // fprintf(stderr, "time taken till graph build forward %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - 
t_lora_cache_us)/1000.0); + + gf.n_threads = n_threads; + ggml_graph_compute(lora_ctx, &gf); + + //print time taken till now + // fprintf(stderr, "time taken till graph compute %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + + // we won't need these tensors again, reset the context to save memory + ggml_free(lora_ctx); + lora_ctx = ggml_init(params); + + n_tensors++; + if (n_tensors % 4 == 0) { + fprintf(stderr, "."); + } + + if (base_name == "layers.0.attention.wk.weight") { + // print_tensor(r); + print_ggml_float_tensor(dest_t, "TEST", "TEST", 10); + } + + //print time taken till now + // fprintf(stderr, "time taken to process cached layer %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + } + + // TODO: this should be in a destructor, it will leak on failure + ggml_free(lora_ctx); + + if (path_lora_to_remove) { + ctx->loaded_loras.erase(path_lora_to_remove); + } + + ctx->loaded_loras[path_lora_to_apply] = true; + + //TODO: Decide if I want to clear up the lora cache or not + + const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; + fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); + + return 0; +} + +// Set `remove_existing` to true if you want to remove the adapter from the model. +int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads, const bool remove_existing) { + int64_t t_lora_cache_us = ggml_time_us(); + + if (remove_existing) { fprintf(stderr, "%s: deactivating lora adapter from cache - please wait ...\n", __func__); } else { fprintf(stderr, "%s: applying lora adapter from cache - please wait ...\n", __func__); @@ -2854,12 +3083,12 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char //std::unordered_map> lora_tensors = cached_tensors.size() == 0 ? 
ctx->adapter_weights : cached_tensors; - if (ctx->lora_cache.find(lora_path) == ctx->lora_cache.end()) { - fprintf(stderr, "%s: error: cached lora '%s' not found\n", __func__, lora_path); + if (ctx->lora_cache.find(path_lora) == ctx->lora_cache.end()) { + fprintf(stderr, "%s: error: cached lora '%s' not found\n", __func__, path_lora); return 1; } - lora_adapter_weights_map* cached_lora_adapter = ctx->lora_cache[lora_path]; + lora_adapter_weights_map* cached_lora_adapter = ctx->lora_cache[path_lora]; const int64_t t_start_lora_us = ggml_time_us(); @@ -2879,6 +3108,8 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char model_tensors.insert(kv); } + //print time taken till now + //fprintf(stderr, "%s: time taken till copying base model weights = %8.2f ms\n", __func__, (ggml_time_us() - t_lora_cache_us)/1000.0); // load base model std::unique_ptr model_loader; @@ -2916,6 +3147,7 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char //TODO: Fix this to process loraA and loraB seperately for (auto it = cached_lora_adapter->lora_metadata_map.begin(); it != cached_lora_adapter->lora_metadata_map.end(); ++it) { + t_lora_cache_us = ggml_time_us(); const std::string& base_name = it->first; const lora_metadata* metadata = it->second; std::vector loraA_vec = cached_lora_adapter->loraA_weights[base_name]; @@ -2965,19 +3197,27 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char // BA*s ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); + //print time taken till now + // fprintf(stderr, "time taken till matmul %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + if (scaling != 1.0f) { ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); } + //print time taken till now + // fprintf(stderr, "time taken till scaling %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); // w = w - BAs to unload the model // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model // obvious improvement would be just caching the weights on load and using them on unload - if (deactivate_adapter) { + if (remove_existing) { BA = ggml_neg(lora_ctx, BA); // woould like to use inplace implementation but unfortunately its not exposed in ggml.h } + //print time taken till now + // fprintf(stderr, "time taken till neg %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + ggml_tensor * r; if (base_t == dest_t) { r = ggml_add_inplace(lora_ctx, dest_t, BA); @@ -2987,18 +3227,19 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char r = ggml_cpy(lora_ctx, r, dest_t); } + //print time taken till now + // fprintf(stderr, "time taken till add inplace %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + struct ggml_cgraph gf = ggml_build_forward(r); + + //print time taken till now + // fprintf(stderr, "time taken till graph build forward %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); - if (base_name == "layers.0.attention.wk.weight") { - std::string print_metadata = "Applying lora from cache"; - if (deactivate_adapter) { - print_metadata = "Deactivating lora from cache"; - } - print_ggml_float_tensor(dest_t, "layers.0.attention.wk.weight", print_metadata, 10); - - } + //print time taken 
till now + // fprintf(stderr, "time taken till graph compute %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); @@ -3008,6 +3249,9 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char if (n_tensors % 4 == 0) { fprintf(stderr, "."); } + + //print time taken till now + // fprintf(stderr, "time taken to process cached layer %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); } // TODO: this should be in a destructor, it will leak on failure @@ -3016,6 +3260,12 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char ggml_free(base_ctx); } + if (remove_existing) { + ctx->loaded_loras.erase(path_lora); + } else { + ctx->loaded_loras[path_lora] = true; + } + //TODO: Decide if I want to clear up the lora cache or not const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; @@ -3025,9 +3275,9 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char } -// Set `deactivate_adapter` to true if you want to remove the adapter from the model. -int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads, const bool deactivate_adapter) { - if (deactivate_adapter) { +// Set `remove_existing` to true if you want to remove the adapter from the model. +int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads, const bool remove_existing) { + if (remove_existing) { fprintf(stderr, "%s: deactivating lora adapter - please wait ...\n", __func__); } else { fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); @@ -3247,7 +3497,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * // w = w - BAs to unload the model // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model // obvious improvement would be just caching the weights on load and using them on unload - if (deactivate_adapter) { + if (remove_existing) { BA = ggml_neg(lora_ctx, BA); // woould like to use inplace implementation but unfortunately its not exposed in ggml.h } @@ -3264,7 +3514,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); - if (!deactivate_adapter) { + if (!remove_existing) { // The copying poses a runtime cost though, might need to find a faster way to do this, some kind of cache warmup at the time of process restart float* loraA_data = (float *) loraA->data; // can do this cause lora is guaranteed to be F32 for now @@ -3301,7 +3551,10 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * ggml_free(base_ctx); } - if (!deactivate_adapter) { + if (remove_existing) { + ctx->loaded_loras.erase(path_lora); + } else { + ctx->loaded_loras[path_lora] = true; ctx->lora_cache[path_lora] = lora_adapter_weights; } @@ -3322,7 +3575,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor int llama_remove_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { try { - return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads, true); // deactivate adapter by setting deactivate_adapter to true + 
return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads, true); // deactivate adapter by setting remove_existing to true } catch (const std::exception & err) { fprintf(stderr, "%s: failed to remove lora adapter: %s\n", __func__, err.what()); return 1; @@ -3348,6 +3601,16 @@ int llama_remove_lora_from_cache(struct llama_context * ctx, const char * path_l } } +int llama_swap_lora_from_cache(struct llama_context * ctx, const char * path_lora_to_apply, const char * path_base_model, int n_threads, const char * path_lora_to_remove) { + try { + return llama_swap_lora_from_cache_internal(ctx, path_lora_to_apply, path_base_model, n_threads, path_lora_to_remove); + } catch (const std::exception & err) { + fprintf(stderr, "%s: failed to remove cached lora adapter: %s\n", __func__, err.what()); + return 1; + } +} + + int llama_get_kv_cache_token_count(const struct llama_context * ctx) { return ctx->model.kv_self.n; } diff --git a/llama.h b/llama.h index d0d53d1a91cdc..9a27d59bd62dc 100644 --- a/llama.h +++ b/llama.h @@ -185,6 +185,9 @@ extern "C" { const char * path_lora, const char * path_base_model, int n_threads); + + LLAMA_API int llama_swap_lora_from_cache(struct llama_context * ctx, const char * path_lora_to_apply, const char * path_base_model, int n_threads, const char * path_lora_to_remove); + // Returns the number of tokens in the KV cache LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); From 9ab1248c7d24fc4013a6fb3474757a789ef08a97 Mon Sep 17 00:00:00 2001 From: TDM Date: Tue, 20 Jun 2023 20:25:37 +0530 Subject: [PATCH 5/5] Improve lora swap times by 300ms by removing neg ops, plus code cleanup --- examples/addon.node/addon.cpp | 5 +- llama.cpp | 139 +++++++++------------------------- llama.h | 2 +- 3 files changed, 37 insertions(+), 109 deletions(-) diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp index 3e8580e4fc031..1e4faf25473c3 100644 --- a/examples/addon.node/addon.cpp +++ b/examples/addon.node/addon.cpp @@ -61,11 +61,8 @@ Napi::Number swapLora(const Napi::CallbackInfo &info) fprintf(stderr, "Acquiring lock\n"); worker_mutex.lock(); - // fprintf(stderr, "Removing lora from Path: %s\n", lora.c_str()); - // llama_remove_lora_from_cache(g_ctx, lora.c_str(), NULL, get_num_physical_cores()); - fprintf(stderr, "Swapping lora from Path: %s\n", lora.c_str()); - llama_swap_lora_from_cache(g_ctx, lora.c_str(), NULL, get_num_physical_cores(), lora.c_str()); + llama_swap_lora_from_cache(g_ctx, lora.c_str(), get_num_physical_cores(), lora.c_str()); worker_mutex.unlock(); return Napi::Number::New(info.Env(), 0); diff --git a/llama.cpp b/llama.cpp index 08c081e27ce6e..9cbdc1d7052e8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -934,7 +934,9 @@ void print_ggml_float_tensor(const struct ggml_tensor * tensor, std::string base for (int i = 0; i < num_elements_to_print; ++i) { fprintf(stderr, "%d : %f\n", i, ggml_fp16_to_fp32(((ggml_fp16_t*)data)[i])); } - } else { //Quantized data + } else { + + //FIX ME: this doesn't work correctly // dequantised data dequantized_data = (float*)malloc(num_elements_to_print * sizeof(float)); @@ -2843,13 +2845,9 @@ int llama_model_quantize( } } -int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * path_lora_to_apply, const char * path_base_model, int n_threads, const char * path_lora_to_remove) { - int64_t t_lora_cache_us = ggml_time_us(); - +int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * path_lora_to_apply, int n_threads, const 
char * path_lora_to_remove) { auto & model = ctx->model; - //std::unordered_map> lora_tensors = cached_tensors.size() == 0 ? ctx->adapter_weights : cached_tensors; - if (ctx->lora_cache.find(path_lora_to_apply) == ctx->lora_cache.end()) { fprintf(stderr, "%s: error: cached lora '%s' not found\n", __func__, path_lora_to_apply); return 1; @@ -2890,9 +2888,7 @@ int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * bool warned = false; int n_tensors = 0; - //TODO: Fix this to process loraA and loraB seperately for (auto it = model_tensors.begin(); it != model_tensors.end(); ++it) { - t_lora_cache_us = ggml_time_us(); const std::string& base_name = it->first; ggml_tensor * dest_t = it->second; @@ -2901,7 +2897,9 @@ int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * ggml_tensor* BA_remove; - bool lora_found = false; + bool lora_to_apply_found = false; + bool lora_to_remove_found = false; + // check if we have both A and B tensors and apply if (cached_lora_adapter_apply->loraA_weights.find(base_name) != cached_lora_adapter_apply->loraA_weights.end() || cached_lora_adapter_apply->loraB_weights.find(base_name) != cached_lora_adapter_apply->loraB_weights.end()) { @@ -2935,15 +2933,12 @@ int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * // BA*s BA_apply = ggml_mul_mat(lora_ctx, loraA, loraB); - //print time taken till now - // fprintf(stderr, "time taken till matmul %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - if (scaling != 1.0f) { ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); BA_apply = ggml_scale_inplace(lora_ctx, BA_apply, scale_tensor); } - lora_found = true; + lora_to_apply_found = true; } // check if we have both A and B tensors and remove @@ -2979,43 +2974,24 @@ int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * // BA*s BA_remove = ggml_mul_mat(lora_ctx, loraA, loraB); - //print time taken till now - // fprintf(stderr, "time taken till matmul %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - scaling = -1.0f * scaling; if (scaling != 1.0f) { ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); BA_remove = ggml_scale_inplace(lora_ctx, BA_remove, scale_tensor); } - // BA_remove = ggml_neg(lora_ctx, BA_remove); // woould like to use inplace implementation but unfortunately its not exposed in ggml.h - lora_found = true; + lora_to_remove_found = true; } - if (!lora_found) { - // fprintf(stderr, "%s: error: No lora tensors found for layer '%s'\n", __func__, base_name.c_str()); + if (!lora_to_apply_found && !lora_to_remove_found) { continue; } - //print time taken till now - // fprintf(stderr, "time taken till scaling %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - - // w = w - BAs to unload the model - // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model - // obvious improvement would be just caching the weights on load and using them on unload - - //print time taken till now - // fprintf(stderr, "time taken till neg %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - - - - //print time taken till now - // fprintf(stderr, "time taken till add inplace %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); ggml_tensor* BA; - if (BA_apply && BA_remove) { + if (lora_to_apply_found && lora_to_remove_found) { BA = ggml_add(lora_ctx, 
BA_apply, BA_remove); - } else if (BA_apply) { + } else if (lora_to_apply_found) { BA = BA_apply; } else { BA = BA_remove; @@ -3025,15 +3001,9 @@ int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * struct ggml_cgraph gf = ggml_build_forward(r); - //print time taken till now - // fprintf(stderr, "time taken till graph build forward %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); - //print time taken till now - // fprintf(stderr, "time taken till graph compute %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); lora_ctx = ggml_init(params); @@ -3042,14 +3012,6 @@ int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * if (n_tensors % 4 == 0) { fprintf(stderr, "."); } - - if (base_name == "layers.0.attention.wk.weight") { - // print_tensor(r); - print_ggml_float_tensor(dest_t, "TEST", "TEST", 10); - } - - //print time taken till now - // fprintf(stderr, "time taken to process cached layer %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); } // TODO: this should be in a destructor, it will leak on failure @@ -3061,7 +3023,6 @@ int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * ctx->loaded_loras[path_lora_to_apply] = true; - //TODO: Decide if I want to clear up the lora cache or not const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); @@ -3069,11 +3030,9 @@ int llama_swap_lora_from_cache_internal(struct llama_context * ctx, const char * return 0; } -// Set `remove_existing` to true if you want to remove the adapter from the model. -int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads, const bool remove_existing) { - int64_t t_lora_cache_us = ggml_time_us(); - - if (remove_existing) { +// Set `is_delete` to true if you want to remove the adapter from the model. +int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads, const bool is_delete) { + if (is_delete) { fprintf(stderr, "%s: deactivating lora adapter from cache - please wait ...\n", __func__); } else { fprintf(stderr, "%s: applying lora adapter from cache - please wait ...\n", __func__); @@ -3081,8 +3040,6 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char auto & model = ctx->model; - //std::unordered_map> lora_tensors = cached_tensors.size() == 0 ? 
ctx->adapter_weights : cached_tensors; - if (ctx->lora_cache.find(path_lora) == ctx->lora_cache.end()) { fprintf(stderr, "%s: error: cached lora '%s' not found\n", __func__, path_lora); return 1; @@ -3108,9 +3065,6 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char model_tensors.insert(kv); } - //print time taken till now - //fprintf(stderr, "%s: time taken till copying base model weights = %8.2f ms\n", __func__, (ggml_time_us() - t_lora_cache_us)/1000.0); - // load base model std::unique_ptr model_loader; ggml_context * base_ctx = NULL; @@ -3147,7 +3101,6 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char //TODO: Fix this to process loraA and loraB seperately for (auto it = cached_lora_adapter->lora_metadata_map.begin(); it != cached_lora_adapter->lora_metadata_map.end(); ++it) { - t_lora_cache_us = ggml_time_us(); const std::string& base_name = it->first; const lora_metadata* metadata = it->second; std::vector loraA_vec = cached_lora_adapter->loraA_weights[base_name]; @@ -3197,26 +3150,17 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char // BA*s ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); - //print time taken till now - // fprintf(stderr, "time taken till matmul %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - - if (scaling != 1.0f) { - ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); - BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); - } - - //print time taken till now - // fprintf(stderr, "time taken till scaling %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - // w = w - BAs to unload the model // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model // obvious improvement would be just caching the weights on load and using them on unload - if (remove_existing) { - BA = ggml_neg(lora_ctx, BA); // woould like to use inplace implementation but unfortunately its not exposed in ggml.h + if (is_delete) { + scaling = -1.0f * scaling; } - //print time taken till now - // fprintf(stderr, "time taken till neg %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); + if (scaling != 1.0f) { + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); + } ggml_tensor * r; if (base_t == dest_t) { @@ -3227,20 +3171,11 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char r = ggml_cpy(lora_ctx, r, dest_t); } - //print time taken till now - // fprintf(stderr, "time taken till add inplace %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - struct ggml_cgraph gf = ggml_build_forward(r); - //print time taken till now - // fprintf(stderr, "time taken till graph build forward %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); - //print time taken till now - // fprintf(stderr, "time taken till graph compute %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); - // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); lora_ctx = ggml_init(params); @@ -3249,9 +3184,6 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char if (n_tensors % 4 == 0) { fprintf(stderr, "."); } - - //print time taken till now - // fprintf(stderr, "time 
taken to process cached layer %s = %8.2f ms\n", base_name.c_str(), (ggml_time_us() - t_lora_cache_us)/1000.0); } // TODO: this should be in a destructor, it will leak on failure @@ -3260,13 +3192,12 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char ggml_free(base_ctx); } - if (remove_existing) { + if (is_delete) { ctx->loaded_loras.erase(path_lora); } else { ctx->loaded_loras[path_lora] = true; } - //TODO: Decide if I want to clear up the lora cache or not const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); @@ -3275,9 +3206,9 @@ int llama_apply_lora_from_cache_internal(struct llama_context * ctx, const char } -// Set `remove_existing` to true if you want to remove the adapter from the model. -int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads, const bool remove_existing) { - if (remove_existing) { +// Set `is_delete` to true if you want to remove the adapter from the model. +int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads, const bool is_delete) { + if (is_delete) { fprintf(stderr, "%s: deactivating lora adapter - please wait ...\n", __func__); } else { fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); @@ -3489,18 +3420,18 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * // BA*s ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); - if (scaling != 1.0f) { - ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); - BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); - } - // w = w - BAs to unload the model // this has an obvious flaw that you need to pass exactly the same adapter weights with same layer names to unload the model // obvious improvement would be just caching the weights on load and using them on unload - if (remove_existing) { + if (is_delete) { BA = ggml_neg(lora_ctx, BA); // woould like to use inplace implementation but unfortunately its not exposed in ggml.h } + if (scaling != 1.0f) { + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); + } + ggml_tensor * r; if (base_t == dest_t) { r = ggml_add_inplace(lora_ctx, dest_t, BA); @@ -3514,7 +3445,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * gf.n_threads = n_threads; ggml_graph_compute(lora_ctx, &gf); - if (!remove_existing) { + if (!is_delete) { // The copying poses a runtime cost though, might need to find a faster way to do this, some kind of cache warmup at the time of process restart float* loraA_data = (float *) loraA->data; // can do this cause lora is guaranteed to be F32 for now @@ -3551,7 +3482,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * ggml_free(base_ctx); } - if (remove_existing) { + if (is_delete) { ctx->loaded_loras.erase(path_lora); } else { ctx->loaded_loras[path_lora] = true; @@ -3601,9 +3532,9 @@ int llama_remove_lora_from_cache(struct llama_context * ctx, const char * path_l } } -int llama_swap_lora_from_cache(struct llama_context * ctx, const char * path_lora_to_apply, const char * path_base_model, int n_threads, const char * path_lora_to_remove) { +int llama_swap_lora_from_cache(struct llama_context * ctx, const char * path_lora_to_apply, int n_threads, const char * path_lora_to_remove) { try { - return 
llama_swap_lora_from_cache_internal(ctx, path_lora_to_apply, path_base_model, n_threads, path_lora_to_remove);
+        return llama_swap_lora_from_cache_internal(ctx, path_lora_to_apply, n_threads, path_lora_to_remove);
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to remove cached lora adapter: %s\n", __func__, err.what());
         return 1;
     }
 }
diff --git a/llama.h b/llama.h
index 9a27d59bd62dc..7b3c2b0b27152 100644
--- a/llama.h
+++ b/llama.h
@@ -186,7 +186,7 @@ extern "C" {
             const char * path_base_model,
             int n_threads);
 
-    LLAMA_API int llama_swap_lora_from_cache(struct llama_context * ctx, const char * path_lora_to_apply, const char * path_base_model, int n_threads, const char * path_lora_to_remove);
+    LLAMA_API int llama_swap_lora_from_cache(struct llama_context * ctx, const char * path_lora_to_apply, int n_threads, const char * path_lora_to_remove);
 
     // Returns the number of tokens in the KV cache
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
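
Usage note: after this series there are three ways to change the active adapter. llama_apply_lora_from_file and llama_remove_lora_from_file recompute everything from the adapter file; llama_apply_lora_from_cache and llama_remove_lora_from_cache reuse the A/B weights that an earlier from-file apply stored in the context; and llama_swap_lora_from_cache removes one cached adapter and applies another with a single graph per layer. The sketch below is a minimal caller-side helper showing how these calls could fit together. It is not part of the patch: the lora_switcher struct, the switch_to name, the std::set bookkeeping of which paths have been cached, and the nullptr base-model argument are assumptions made for illustration, since the patch keeps the cache private to llama_context and the caller has to remember which adapters it has already loaded from file.

// Hypothetical caller-side helper (not part of this patch series): tracks which
// adapter paths have already been cached by llama_apply_lora_from_file() and
// picks the cheapest routine when switching to a new adapter.
#include <set>
#include <string>

#include "llama.h"

struct lora_switcher {
    llama_context *       ctx       = nullptr;
    int                   n_threads = 1;
    std::string           active;   // path of the adapter currently applied, empty if none
    std::set<std::string> cached;   // paths whose A/B weights already sit in the context cache

    // switch the model to the adapter at `path`; returns 0 on success
    int switch_to(const std::string & path) {
        if (path == active) {
            return 0; // already applied, nothing to do
        }
        if (!active.empty() && cached.count(active) && cached.count(path)) {
            // both adapters are cached: fused remove + apply, one graph per layer
            int rc = llama_swap_lora_from_cache(ctx, path.c_str(), n_threads, active.c_str());
            if (rc == 0) { active = path; }
            return rc;
        }
        if (!active.empty()) {
            // otherwise remove the current adapter first
            int rc = cached.count(active)
                ? llama_remove_lora_from_cache(ctx, active.c_str(), /*path_base_model*/ nullptr, n_threads)
                : llama_remove_lora_from_file (ctx, active.c_str(), /*path_base_model*/ nullptr, n_threads);
            if (rc != 0) { return rc; }
            active.clear();
        }
        // apply the new adapter; the from-file path also fills the context cache
        int rc = cached.count(path)
            ? llama_apply_lora_from_cache(ctx, path.c_str(), /*path_base_model*/ nullptr, n_threads)
            : llama_apply_lora_from_file (ctx, path.c_str(), /*path_base_model*/ nullptr, n_threads);
        if (rc == 0) {
            active = path;
            cached.insert(path); // A/B weights and scaling are now stored for future swaps
        }
        return rc;
    }
};

With this arrangement the file parsing and the full apply pass only happen the first time an adapter is seen; afterwards, switching between two known adapters goes through llama_swap_lora_from_cache, which is the path the last patch speeds up by folding the sign flip for the adapter being removed into its scaling factor rather than running separate ggml_neg ops.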