From 527b6fba1d237befb324fd846bda7418c0fa394d Mon Sep 17 00:00:00 2001 From: Didzis Gosko Date: Sat, 24 Jun 2023 11:47:58 +0300 Subject: [PATCH 01/11] llama : make model stateless and context stateful (llama_state) (#1797) * llama : make model stateless and context stateful * llama : minor cleanup * llama : update internal API declaration * Apply suggestions from code review fix style Co-authored-by: Georgi Gerganov * Missing model memory release * Fix style * Add deprecated warning for public API function llama_init_from_file * Update public API use cases: move away from deprecated llama_init_from_file * Deprecate public API function llama_apply_lora_from_file --------- Co-authored-by: Georgi Gerganov --- examples/common.cpp | 22 ++- examples/common.h | 3 +- examples/embedding/embedding.cpp | 6 +- examples/main/main.cpp | 8 +- examples/perplexity/perplexity.cpp | 6 +- examples/quantize-stats/quantize-stats.cpp | 15 +- examples/save-load-state/save-load-state.cpp | 29 ++- examples/server/server.cpp | 9 +- examples/simple/simple.cpp | 8 +- .../train-text-from-scratch.cpp | 5 +- llama.cpp | 172 ++++++++++++------ llama.h | 35 +++- tests/test-tokenizer-0.cpp | 16 +- 13 files changed, 243 insertions(+), 91 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index fed24e027d8a8..6ac4845559172 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -536,7 +536,7 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s return res; } -struct llama_context * llama_init_from_gpt_params(const gpt_params & params) { +std::tuple llama_init_from_gpt_params(const gpt_params & params) { auto lparams = llama_context_default_params(); lparams.n_ctx = params.n_ctx; @@ -552,25 +552,33 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) { lparams.logits_all = params.perplexity; lparams.embedding = params.embedding; - llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams); + llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams); + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return std::make_tuple(nullptr, nullptr); + } + llama_context * lctx = llama_new_context_with_model(model, lparams); if (lctx == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); - return NULL; + fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); } if (!params.lora_adapter.empty()) { - int err = llama_apply_lora_from_file(lctx, + int err = llama_model_apply_lora_from_file(model, params.lora_adapter.c_str(), params.lora_base.empty() ? NULL : params.lora_base.c_str(), params.n_threads); if (err != 0) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); - return NULL; + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); } } - return lctx; + return std::make_tuple(model, lctx); } void console_init(console_state & con_st) { diff --git a/examples/common.h b/examples/common.h index 6c2953cb2a7c6..713320179e2be 100644 --- a/examples/common.h +++ b/examples/common.h @@ -9,6 +9,7 @@ #include #include #include +#include #if !defined (_WIN32) #include @@ -95,7 +96,7 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s // Model utils // -struct llama_context * llama_init_from_gpt_params(const gpt_params & params); +std::tuple llama_init_from_gpt_params(const gpt_params & params); // // Console utils diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 860f99f672c9c..369eac1d1c391 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -37,11 +37,12 @@ int main(int argc, char ** argv) { llama_init_backend(); + llama_model * model; llama_context * ctx; // load the model - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) { + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; } @@ -90,6 +91,7 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); + llama_free_model(model); return 0; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 941312f9cc756..c1e6bf126804e 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -107,12 +107,13 @@ int main(int argc, char ** argv) { llama_init_backend(); + llama_model * model; llama_context * ctx; g_ctx = &ctx; // load the model and apply lora adapter, if any - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) { + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; } @@ -139,6 +140,7 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); + llama_free_model(model); return 0; } @@ -147,6 +149,7 @@ int main(int argc, char ** argv) { if (params.export_cgraph) { llama_eval_export(ctx, "llama.ggml"); llama_free(ctx); + llama_free_model(model); return 0; } @@ -666,6 +669,7 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); + llama_free_model(model); return 0; } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index ae8cfe0afc0b7..b59f5971e3dd2 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -149,11 +149,12 @@ int main(int argc, char ** argv) { llama_init_backend(); + llama_model * model; llama_context * ctx; // load the model and apply lora adapter, if any - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) { + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; } @@ -169,6 +170,7 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); + llama_free_model(model); return 0; } diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 6b8018ee28432..9cea472dedb82 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -320,6 +320,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "Loading model\n"); const int64_t t_main_start_us = ggml_time_us(); + llama_model * model; llama_context * ctx; { @@ -330,12 +331,20 @@ int main(int argc, char ** argv) { lparams.f16_kv = false; lparams.use_mlock = false; - ctx = llama_init_from_file(params.model.c_str(), lparams); + model = llama_load_model_from_file(params.model.c_str(), lparams); - if (ctx == NULL) { + if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); return 1; } + + ctx = llama_new_context_with_model(model, lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); + llama_free_model(model); + return 1; + } } const auto &tensors = llama_internal_get_tensor_map(ctx); @@ -357,6 +366,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: error: Quantization should be tested with a float model, " "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type); llama_free(ctx); + llama_free_model(model); return 1; } included_layers++; @@ -415,6 +425,7 @@ int main(int argc, char ** argv) { llama_free(ctx); + llama_free_model(model); // report timing { const int64_t t_main_end_us = ggml_time_us(); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index da4d37ad03de7..4c868850317fe 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -35,12 +35,22 @@ int main(int argc, char ** argv) { auto last_n_tokens_data = std::vector(params.repeat_last_n, 0); // init - auto ctx = llama_init_from_file(params.model.c_str(), lparams); + auto model = llama_load_model_from_file(params.model.c_str(), lparams); + if (model == nullptr) { + return 1; + } + auto ctx = llama_new_context_with_model(model, lparams); + if (ctx == nullptr) { + llama_free_model(model); + return 1; + } auto tokens = std::vector(params.n_ctx); auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true); if (n_prompt_tokens < 1) { fprintf(stderr, "%s : failed to tokenize prompt\n", __func__); + llama_free(ctx); + llama_free_model(model); return 1; } @@ -84,6 +94,8 @@ int main(int argc, char ** argv) { printf("%s", next_token_str); if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); + llama_free(ctx); + llama_free_model(model); return 1; } n_past += 1; @@ -91,23 +103,27 @@ int main(int argc, char ** argv) { printf("\n\n"); - // free old model + // free old context llama_free(ctx); - // load new model - auto ctx2 = llama_init_from_file(params.model.c_str(), lparams); + // make new context + auto ctx2 = llama_new_context_with_model(model, lparams); // Load state (rng, logits, embedding and kv_cache) from file { FILE *fp_read = fopen("dump_state.bin", "rb"); if (state_size != llama_get_state_size(ctx2)) { fprintf(stderr, "\n%s : failed to validate state size\n", __func__); + llama_free(ctx2); + llama_free_model(model); return 1; } const size_t ret = fread(state_mem, 1, state_size, fp_read); if (ret != state_size) { fprintf(stderr, "\n%s : failed to read state\n", __func__); + llama_free(ctx2); + llama_free_model(model); return 1; } @@ -138,6 +154,8 @@ int main(int argc, char ** argv) { printf("%s", next_token_str); if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); + llama_free(ctx2); + llama_free_model(model); return 1; } n_past += 1; @@ -145,5 +163,8 @@ int main(int argc, char ** argv) { printf("\n\n"); + llama_free(ctx2); + llama_free_model(model); + return 0; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c0984aadb92ba..de22d301342d6 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -115,6 +115,7 @@ struct llama_server_context { std::vector embd; std::vector last_n_tokens; + llama_model * model = nullptr; llama_context * ctx = nullptr; gpt_params params; @@ -130,6 +131,10 @@ struct llama_server_context { llama_free(ctx); ctx = nullptr; } + if (model) { + llama_free_model(model); + model = nullptr; + } } void rewind() { @@ -150,8 +155,8 @@ struct llama_server_context { bool loadModel(const gpt_params & params_) { params = params_; - ctx = llama_init_from_gpt_params(params); - if (ctx == nullptr) { + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == nullptr) { LOG_ERROR("unable to load model", { { "model", params_.model } }); return false; } diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 76f991cdc028f..fc45c93406bc4 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -68,11 +68,12 @@ int main(int argc, char ** argv) llama_init_backend(); - llama_context * ctx ; + llama_model * model; + llama_context * ctx; - ctx = llama_init_from_gpt_params( params ); + std::tie(model, ctx) = llama_init_from_gpt_params( params ); - if ( ctx == NULL ) + if ( model == NULL ) { fprintf( stderr , "%s: error: unable to load model\n" , __func__ ); return 1; @@ -170,6 +171,7 @@ int main(int argc, char ** argv) } // wend of main loop llama_free( ctx ); + llama_free_model( model ); return 0; } diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 7ec85951adc57..61c829e5c0f8a 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3054,7 +3054,8 @@ int main(int argc, char ** argv) { struct llama_context_params llama_params = llama_context_default_params(); llama_params.vocab_only = true; - struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params); + struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params); + struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); struct llama_vocab vocab; { @@ -3395,6 +3396,8 @@ int main(int argc, char ** argv) { delete[] compute_addr; delete[] compute_buf_0; delete[] compute_buf_1; + llama_free(lctx); + llama_free_model(lmodel); ggml_free(model.ctx); return 0; diff --git a/llama.cpp b/llama.cpp index e597f5048234b..a528eef4a9020 100644 --- a/llama.cpp +++ b/llama.cpp @@ -182,6 +182,19 @@ struct llama_kv_cache { } }; +struct llama_vocab { + using id = int32_t; + using token = std::string; + + struct token_score { + token tok; + float score; + }; + + std::unordered_map token_to_id; + std::vector id_to_token; +}; + struct llama_model { e_model type = MODEL_UNKNOWN; @@ -198,10 +211,6 @@ struct llama_model { // context struct ggml_context * ctx = NULL; - // key + value cache for the self attention - // TODO: move to llama_state - struct llama_kv_cache kv_self; - // the model memory buffer llama_ctx_buffer buf; @@ -215,6 +224,11 @@ struct llama_model { // for quantize-stats only std::vector> tensors_by_name; + int64_t t_load_us = 0; + int64_t t_start_us = 0; + + llama_vocab vocab; + ~llama_model() { if (ctx) { ggml_free(ctx); @@ -233,24 +247,11 @@ struct llama_model { } }; -struct llama_vocab { - using id = int32_t; - using token = std::string; - - struct token_score { - token tok; - float score; - }; - - std::unordered_map token_to_id; - std::vector id_to_token; -}; - struct llama_context { + llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {} + std::mt19937 rng; - int64_t t_load_us = 0; - int64_t t_start_us = 0; bool has_evaluated_once = false; int64_t t_sample_us = 0; @@ -261,8 +262,16 @@ struct llama_context { int32_t n_eval = 0; // number of eval calls int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) - llama_model model; - llama_vocab vocab; + const llama_model & model; + const llama_vocab & vocab; + + bool model_owner = false; + + int64_t t_load_us; + int64_t t_start_us; + + // key + value cache for the self attention + struct llama_kv_cache kv_self; size_t mem_per_token = 0; @@ -1033,7 +1042,8 @@ static const char *llama_model_type_name(e_model type) { static void llama_model_load_internal( const std::string & fname, - llama_context & lctx, + llama_model & model, + llama_vocab & vocab, int n_ctx, int n_batch, int n_gpu_layers, @@ -1047,12 +1057,11 @@ static void llama_model_load_internal( llama_progress_callback progress_callback, void * progress_callback_user_data) { - lctx.t_start_us = ggml_time_us(); + model.t_start_us = ggml_time_us(); std::unique_ptr ml(new llama_model_loader(fname, use_mmap, vocab_only)); - lctx.vocab = std::move(ml->file_loaders.at(0)->vocab); - auto & model = lctx.model; + vocab = std::move(ml->file_loaders.at(0)->vocab); model.hparams = ml->file_loaders.at(0)->hparams; model.n_gpu_layers = n_gpu_layers; llama_file_version file_version = ml->file_loaders.at(0)->file_version; @@ -1122,15 +1131,15 @@ static void llama_model_load_internal( // create the ggml context { - lctx.model.buf.resize(ctx_size); + model.buf.resize(ctx_size); if (use_mlock) { - lctx.model.mlock_buf.init(lctx.model.buf.addr); - lctx.model.mlock_buf.grow_to(lctx.model.buf.size); + model.mlock_buf.init(model.buf.addr); + model.mlock_buf.grow_to(model.buf.size); } struct ggml_init_params params = { - /*.mem_size =*/ lctx.model.buf.size, - /*.mem_buffer =*/ lctx.model.buf.addr, + /*.mem_size =*/ model.buf.size, + /*.mem_buffer =*/ model.buf.addr, /*.no_alloc =*/ ml->use_mmap, }; @@ -1311,7 +1320,7 @@ static void llama_model_load_internal( } #endif - ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); + ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); if (progress_callback) { progress_callback(1.0f, progress_callback_user_data); @@ -1321,12 +1330,13 @@ static void llama_model_load_internal( // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration - lctx.t_load_us = ggml_time_us() - lctx.t_start_us; + model.t_load_us = ggml_time_us() - model.t_start_us; } static bool llama_model_load( const std::string & fname, - llama_context & lctx, + llama_model & model, + llama_vocab & vocab, int n_ctx, int n_batch, int n_gpu_layers, @@ -1340,7 +1350,7 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type, + llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type, use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); return true; } catch (const std::exception & err) { @@ -1378,7 +1388,7 @@ static bool llama_eval_internal( const auto & model = lctx.model; const auto & hparams = model.hparams; - const auto & kv_self = model.kv_self; + const auto & kv_self = lctx.kv_self; LLAMA_ASSERT(!!kv_self.ctx); @@ -1726,7 +1736,7 @@ static bool llama_eval_internal( //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N); // update kv token count - lctx.model.kv_self.n = n_past + N; + lctx.kv_self.n = n_past + N; // extract logits { @@ -2634,12 +2644,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // interface implementation // -struct llama_context * llama_init_from_file( +struct llama_model * llama_load_model_from_file( const char * path_model, struct llama_context_params params) { ggml_time_init(); - llama_context * ctx = new llama_context; + llama_model * model = new llama_model; + + ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; + + if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers, + params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock, + params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { + delete model; + fprintf(stderr, "%s: failed to load model\n", __func__); + return nullptr; + } + + return model; +} + +void llama_free_model(struct llama_model * model) { + delete model; +} + +struct llama_context * llama_new_context_with_model( + struct llama_model * model, + struct llama_context_params params) { + + if (!model) { + return nullptr; + } + + llama_context * ctx = new llama_context(*model, model->vocab); if (params.seed < 0) { params.seed = time(NULL); @@ -2667,24 +2704,16 @@ struct llama_context * llama_init_from_file( ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu, - params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock, - params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { - fprintf(stderr, "%s: failed to load model\n", __func__); - llama_free(ctx); - return nullptr; - } - // reserve memory for context buffers if (!params.vocab_only) { - if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { + if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; } { - const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v); + const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } @@ -2736,8 +2765,8 @@ struct llama_context * llama_init_from_file( LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0)); LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); @@ -2748,7 +2777,23 @@ struct llama_context * llama_init_from_file( return ctx; } +struct llama_context * llama_init_from_file( + const char * path_model, + struct llama_context_params params) { + + struct llama_model * model = llama_load_model_from_file(path_model, params); + if (!model) { + return nullptr; + } + struct llama_context * ctx = llama_new_context_with_model(model, params); + ctx->model_owner = true; + return ctx; +} + void llama_free(struct llama_context * ctx) { + if (ctx->model_owner) { + delete &ctx->model; + } delete ctx; } @@ -2765,11 +2810,9 @@ int llama_model_quantize( } } -int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { +int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) { fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); - auto & model = ctx->model; - const int64_t t_start_lora_us = ggml_time_us(); auto fin = std::ifstream(path_lora, std::ios::binary); @@ -3012,7 +3055,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { try { - return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads); + return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads); + } catch (const std::exception & err) { + fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); + return 1; + } +} + +int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) { + try { + return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads); } catch (const std::exception & err) { fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); return 1; @@ -3020,7 +3072,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor } int llama_get_kv_cache_token_count(const struct llama_context * ctx) { - return ctx->model.kv_self.n; + return ctx->kv_self.n; } #define LLAMA_MAX_RNG_STATE (64*1024) @@ -3045,7 +3097,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) { const size_t s_embedding = ctx->embedding.size() * sizeof(float); const size_t s_kv_size = sizeof(size_t); const size_t s_kv_ntok = sizeof(int); - const size_t s_kv = ctx->model.kv_self.buf.size; + const size_t s_kv = ctx->kv_self.buf.size; const size_t s_total = ( + s_rng_size @@ -3111,7 +3163,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { // copy kv cache { - const auto & kv_self = ctx->model.kv_self; + const auto & kv_self = ctx->kv_self; const auto & hparams = ctx->model.hparams; const int n_layer = hparams.n_layer; const int n_embd = hparams.n_embd; @@ -3215,7 +3267,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { // set kv cache { - const auto & kv_self = ctx->model.kv_self; + const auto & kv_self = ctx->kv_self; const auto & hparams = ctx->model.hparams; const int n_layer = hparams.n_layer; const int n_embd = hparams.n_embd; @@ -3259,7 +3311,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_free(cpy_ctx); } - ctx->model.kv_self.n = kv_ntok; + ctx->kv_self.n = kv_ntok; } const size_t nread = inp - src; @@ -3506,6 +3558,6 @@ const char * llama_print_system_info(void) { } // For internal test use -std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { +const std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { return ctx->model.tensors_by_name; } diff --git a/llama.h b/llama.h index 0de530d456932..a833a7f4d66cc 100644 --- a/llama.h +++ b/llama.h @@ -26,6 +26,14 @@ # define LLAMA_API #endif +#ifdef __GNUC__ +# define DEPRECATED(func, hint) func __attribute__((deprecated(hint))) +#elif defined(_MSC_VER) +# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func +#else +# define DEPRECATED(func, hint) func +#endif + #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' @@ -53,6 +61,7 @@ extern "C" { // TODO: show sample usage // + struct llama_model; struct llama_context; typedef int llama_token; @@ -136,12 +145,23 @@ extern "C" { LLAMA_API int64_t llama_time_us(); + LLAMA_API struct llama_model * llama_load_model_from_file( + const char * path_model, + struct llama_context_params params); + + LLAMA_API void llama_free_model(struct llama_model * model); + + LLAMA_API struct llama_context * llama_new_context_with_model( + struct llama_model * model, + struct llama_context_params params); + // Various functions for loading a ggml llama model. // Allocate (almost) all memory needed for the model. // Return NULL on failure - LLAMA_API struct llama_context * llama_init_from_file( + LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file( const char * path_model, - struct llama_context_params params); + struct llama_context_params params), + "please use llama_load_model_from_file combined with llama_new_context_with_model instead"); // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); @@ -158,8 +178,15 @@ extern "C" { // The model needs to be reloaded before applying a new adapter, otherwise the adapter // will be applied on top of the previous one // Returns 0 on success - LLAMA_API int llama_apply_lora_from_file( + LLAMA_API DEPRECATED(int llama_apply_lora_from_file( struct llama_context * ctx, + const char * path_lora, + const char * path_base_model, + int n_threads), + "please use llama_model_apply_lora_from_file instead"); + + LLAMA_API int llama_model_apply_lora_from_file( + const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads); @@ -310,7 +337,7 @@ extern "C" { #include struct ggml_tensor; -std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); +const std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); #endif diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index ab1538a0cf304..20abe710018ee 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -28,6 +28,7 @@ int main(int argc, char **argv) { fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + llama_model * model; llama_context * ctx; // load the vocab @@ -36,10 +37,18 @@ int main(int argc, char **argv) { lparams.vocab_only = true; - ctx = llama_init_from_file(fname.c_str(), lparams); + model = llama_load_model_from_file(fname.c_str(), lparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + ctx = llama_new_context_with_model(model, lparams); if (ctx == NULL) { fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_free_model(model); return 1; } } @@ -48,6 +57,8 @@ int main(int argc, char **argv) { if (n_vocab != 32000) { fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab); + llama_free_model(model); + llama_free(ctx); return 2; } @@ -77,10 +88,13 @@ int main(int argc, char **argv) { } fprintf(stderr, "\n"); + llama_free_model(model); + llama_free(ctx); return 3; } } + llama_free_model(model); llama_free(ctx); return 0; From b061ba9e2a7a2c335a200df8c11aed5e31e4ccbb Mon Sep 17 00:00:00 2001 From: Alex Renda Date: Sat, 24 Jun 2023 03:15:01 -0700 Subject: [PATCH 02/11] llama : fix top-p sampling to match the canonical definition (#1953) * Fix top-p sampling to match the standard definition (smallest set that has probability mass at least p, not largest set with probability mass less than p) * top-p: correct gt to gte * add test for correct top-p behavior --- llama.cpp | 7 ++++--- tests/test-sampling.cpp | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index a528eef4a9020..ac22a48f8ab97 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2015,9 +2015,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can for (size_t i = 0; i < candidates->size; ++i) { cum_sum += candidates->data[i].p; - // Check if the running sum is greater than p or if we have kept at least min_keep tokens - if (cum_sum > p && i >= min_keep) { - last_idx = i; + // Check if the running sum is at least p or if we have kept at least min_keep tokens + // we set the last index to i+1 to indicate that the current iterate should be included in the set + if (cum_sum >= p && i + 1 >= min_keep) { + last_idx = i + 1; break; } } diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 5d693f7b561a6..64f9455d72e54 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -181,6 +181,7 @@ int main(void) { test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f); + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f); From 235b610d650cbfed6dbd5d671f750d35fc18cd7d Mon Sep 17 00:00:00 2001 From: Alberto <57916483+albbus-stack@users.noreply.github.com> Date: Sat, 24 Jun 2023 12:32:13 +0200 Subject: [PATCH 03/11] readme : fixed termux instructions (#1973) --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b09498be64cd0..10462c6b0651f 100644 --- a/README.md +++ b/README.md @@ -680,12 +680,13 @@ Upon completion of the aforementioned steps, you will have successfully compiled ``` GGML_OPENCL_PLATFORM=0 GGML_OPENCL_DEVICE=0 -export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH -./main (...) +export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH ``` For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle. +Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script. + ### Docker #### Prerequisites From 11da1a85cd69af84b5861134738c7e9e20907470 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Jun 2023 13:38:18 +0300 Subject: [PATCH 04/11] readme : fix whitespaces --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 10462c6b0651f..6aa6ce319d93f 100644 --- a/README.md +++ b/README.md @@ -685,7 +685,7 @@ export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle. -Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script. +Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script. ### Docker From f2c754e1c38936fdde74e4848ac468a696eb73c6 Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 24 Jun 2023 12:57:18 +0200 Subject: [PATCH 05/11] ggml : improve ggml_graph_dump_dot, add ggml_format_name (#1978) * Improve ggml_graph_dump_dot, add ggml_format_name * add more automatic names to view ops * fix name of copies --- ggml.c | 137 ++++++++++++++++++++++++++++++++++++++++----------------- ggml.h | 1 + 2 files changed, 98 insertions(+), 40 deletions(-) diff --git a/ggml.c b/ggml.c index 4319683f5186e..ef9e8585d9deb 100644 --- a/ggml.c +++ b/ggml.c @@ -24,6 +24,7 @@ #include #include #include +#include #ifdef GGML_USE_METAL #include @@ -4734,10 +4735,19 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam return tensor; } +struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) { + va_list args; + va_start(args, fmt); + vsnprintf(tensor->name, sizeof(tensor->name), fmt, args); + va_end(args); + return tensor; +} + struct ggml_tensor * ggml_view_tensor( struct ggml_context * ctx, const struct ggml_tensor * src) { struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); + ggml_format_name(result, "%s (view)", src->name); result->nb[0] = src->nb[0]; result->nb[1] = src->nb[1]; @@ -5899,6 +5909,11 @@ struct ggml_tensor * ggml_cpy_impl( // make a view of the destination struct ggml_tensor * result = ggml_view_tensor(ctx, b); + if (strlen(b->name) > 0) { + ggml_format_name(result, "%s (copy of %s)", b->name, a->name); + } else { + ggml_format_name(result, "%s (copy)", a->name); + } result->op = GGML_OP_CPY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -5935,6 +5950,7 @@ struct ggml_tensor * ggml_cont_impl( } struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + ggml_format_name(result, "%s (cont)", a->name); result->op = GGML_OP_CONT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -5978,6 +5994,7 @@ struct ggml_tensor * ggml_reshape( } struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data); + ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6002,6 +6019,7 @@ struct ggml_tensor * ggml_reshape_1d( const int64_t ne[1] = { ne0 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data); + ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6027,6 +6045,7 @@ struct ggml_tensor * ggml_reshape_2d( const int64_t ne[2] = { ne0, ne1 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data); + ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6053,6 +6072,7 @@ struct ggml_tensor * ggml_reshape_3d( const int64_t ne[3] = { ne0, ne1, ne2 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data); + ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6081,6 +6101,7 @@ struct ggml_tensor * ggml_reshape_4d( const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data); + ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6105,10 +6126,12 @@ struct ggml_tensor * ggml_view_1d( } struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); + ggml_format_name(result, "%s (view)", a->name); ggml_scratch_save(ctx); struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + ggml_set_name(offs, "offset"); memcpy(offs->data, &offset, 2*sizeof(int32_t)); ggml_scratch_load(ctx); @@ -6141,10 +6164,12 @@ struct ggml_tensor * ggml_view_2d( const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); + ggml_format_name(result, "%s (view)", a->name); ggml_scratch_save(ctx); struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + ggml_set_name(offs, "offset"); memcpy(offs->data, &offset, 2*sizeof(int32_t)); ggml_scratch_load(ctx); @@ -6183,10 +6208,12 @@ struct ggml_tensor * ggml_view_3d( const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); + ggml_format_name(result, "%s (view)", a->name); ggml_scratch_save(ctx); struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + ggml_set_name(offs, "offset"); memcpy(offs->data, &offset, 2*sizeof(int32_t)); ggml_scratch_load(ctx); @@ -6227,10 +6254,12 @@ struct ggml_tensor * ggml_view_4d( const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset); + ggml_format_name(result, "%s (view)", a->name); ggml_scratch_save(ctx); struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + ggml_set_name(offs, "offset"); memcpy(offs->data, &offset, 2*sizeof(int32_t)); ggml_scratch_load(ctx); @@ -6276,6 +6305,7 @@ struct ggml_tensor * ggml_permute( } struct ggml_tensor * result = ggml_view_tensor(ctx, a); + ggml_format_name(result, "%s (permuted)", a->name); int ne[GGML_MAX_DIMS]; int nb[GGML_MAX_DIMS]; @@ -6335,6 +6365,7 @@ struct ggml_tensor * ggml_transpose( } struct ggml_tensor * result = ggml_view_tensor(ctx, a); + ggml_format_name(result, "%s (transposed)", a->name); result->ne[0] = a->ne[1]; result->ne[1] = a->ne[0]; @@ -16004,7 +16035,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES); if (strlen(node->name) == 0) { - snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs); + ggml_format_name(node, "leaf_%d", cgraph->n_leafs); } cgraph->leafs[cgraph->n_leafs] = node; @@ -16013,7 +16044,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES); if (strlen(node->name) == 0) { - snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes); + ggml_format_name(node, "node_%d", cgraph->n_nodes); } cgraph->nodes[cgraph->n_nodes] = node; @@ -17397,6 +17428,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr return NULL; } +static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) { + struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node); + struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent); + fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n", + gparent0 ? (void *) gparent0 : (void *) parent, + gparent0 ? "g" : "x", + gparent ? (void *) gparent : (void *) node, + gparent ? "g" : "x", + gparent ? "empty" : "vee", + gparent ? "dashed" : "solid", + label); +} + +static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) { + fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n", + (void *) parent, "x", + (void *) node, "x", + label); +} + void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) { char color[16]; @@ -17432,7 +17483,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph (void *) node, color); if (strlen(node->name) > 0) { - fprintf(fp, "%s |", node->name); + fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type)); + } else { + fprintf(fp, "(%s)|", ggml_type_name(node->type)); } if (node->n_dims == 2) { @@ -17441,7 +17494,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]); } - if (node->grad) { fprintf(fp, " | %s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]); } else { @@ -17460,18 +17512,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph (void *) node, color); if (strlen(node->name) > 0) { - fprintf(fp, "%s | ", node->name); + fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type)); + } else { + fprintf(fp, "(%s)|", ggml_type_name(node->type)); } - if (ggml_nelements(node) == 1) { - if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { - fprintf(fp, "%d", ggml_get_i32_1d(node, 0)); - } - else { - fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0)); + + fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]); + if (ggml_nelements(node) < 5) { + fprintf(fp, " | ("); + for (int j = 0; j < ggml_nelements(node); j++) { + if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { + fprintf(fp, "%d", ggml_get_i32_1d(node, j)); + } + else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) { + fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j)); + } + else { + fprintf(fp, "#"); + } + if (j < ggml_nelements(node) - 1) { + fprintf(fp, ", "); + } } - } - else { - fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]); + fprintf(fp, ")"); } fprintf(fp, "\"; ]\n"); } @@ -17479,30 +17542,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; - struct ggml_tensor * parent = ggml_graph_get_parent(gb, node); - if (node->src0) { - struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0); - - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n", - parent0 ? (void *) parent0 : (void *) node->src0, - parent0 ? "g" : "x", - parent ? (void *) parent : (void *) node, - parent ? "g" : "x", - parent ? "empty" : "vee", - parent ? "dashed" : "solid"); + ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x"); } if (node->src1) { - struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1); - - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n", - parent1 ? (void *) parent1 : (void *) node->src1, - parent1 ? "g" : "x", - parent ? (void *) parent : (void *) node, - parent ? "g" : "x", - parent ? "empty" : "vee", - parent ? "dashed" : "solid"); + ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y"); + } + + for (int j = 0; j < GGML_MAX_OPT; j++) { + if (node->opt[j]) { + char label[16]; + snprintf(label, sizeof(label), "opt %d", j); + ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label); + } } } @@ -17510,15 +17563,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph struct ggml_tensor * node = gb->leafs[i]; if (node->src0) { - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n", - (void *) node->src0, "x", - (void *) node, "x"); + ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x"); } if (node->src1) { - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n", - (void *) node->src1, "x", - (void *) node, "x"); + ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y"); + } + + for (int j = 0; j < GGML_MAX_OPT; j++) { + if (node->opt[j]) { + char label[16]; + snprintf(label, sizeof(label), "opt %d", j); + ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label); + } } } diff --git a/ggml.h b/ggml.h index 18c78551f3dcd..4b6b7284510f9 100644 --- a/ggml.h +++ b/ggml.h @@ -563,6 +563,7 @@ extern "C" { GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name); + GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...); // // operations on tensors with backpropagation From c943d823c14cef33092205ca3944de6fdf7abf99 Mon Sep 17 00:00:00 2001 From: AN Long Date: Sat, 24 Jun 2023 19:02:06 +0800 Subject: [PATCH 06/11] convert : fix invalid params in write_vocab_only (#1975) --- convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert.py b/convert.py index de6c39c67672b..e340d2273f378 100644 --- a/convert.py +++ b/convert.py @@ -998,9 +998,9 @@ def write_vocab(self, vocab: Vocab) -> None: def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: of = OutputFile(fname_out) params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, - n_head=1, n_layer=0, file_type=GGMLFileType.AllF32) + n_head=1, n_layer=0) of = OutputFile(fname_out) - of.write_file_header(params) + of.write_file_header(params, file_type=GGMLFileType.AllF32) of.write_vocab(vocab) of.fout.close() From fdd18609113862dc6eb34dfc44a093d54c59ff1f Mon Sep 17 00:00:00 2001 From: Rowan Hart Date: Sat, 24 Jun 2023 04:07:08 -0700 Subject: [PATCH 07/11] flake : fix ggml-metal.metal path and run nixfmt (#1974) --- flake.nix | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/flake.nix b/flake.nix index bba3d71f7437b..cebb47b94c92e 100644 --- a/flake.nix +++ b/flake.nix @@ -9,27 +9,33 @@ inherit (pkgs.stdenv) isAarch64 isDarwin; inherit (pkgs.lib) optionals; isM1 = isAarch64 && isDarwin; - osSpecific = - if isM1 then with pkgs.darwin.apple_sdk_11_0.frameworks; [ Accelerate MetalKit MetalPerformanceShaders MetalPerformanceShadersGraph ] - else if isDarwin then with pkgs.darwin.apple_sdk.frameworks; [ Accelerate CoreGraphics CoreVideo ] - else [ ]; - pkgs = import nixpkgs { - inherit system; - }; - llama-python = pkgs.python310.withPackages (ps: with ps; [ - numpy - sentencepiece - ]); - in - { + osSpecific = if isM1 then + with pkgs.darwin.apple_sdk_11_0.frameworks; [ + Accelerate + MetalKit + MetalPerformanceShaders + MetalPerformanceShadersGraph + ] + else if isDarwin then + with pkgs.darwin.apple_sdk.frameworks; [ + Accelerate + CoreGraphics + CoreVideo + ] + else + [ ]; + pkgs = import nixpkgs { inherit system; }; + llama-python = + pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]); + in { packages.default = pkgs.stdenv.mkDerivation { name = "llama.cpp"; src = ./.; - postPatch = - if isM1 then '' - substituteInPlace ./ggml-metal.m \ - --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/ggml-metal.metal\";" - '' else ""; + postPatch = if isM1 then '' + substituteInPlace ./ggml-metal.m \ + --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" + '' else + ""; nativeBuildInputs = with pkgs; [ cmake ]; buildInputs = osSpecific; cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [ @@ -62,11 +68,7 @@ }; apps.default = self.apps.${system}.llama; devShells.default = pkgs.mkShell { - packages = with pkgs; [ - cmake - llama-python - ] ++ osSpecific; + packages = with pkgs; [ cmake llama-python ] ++ osSpecific; }; - } - ); + }); } From 65bdd52a867539691007f85c5508146d507f72c1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Jun 2023 19:40:18 +0300 Subject: [PATCH 08/11] tests : sync test-grad0 from ggml --- tests/test-grad0.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index c8c2c0f717e32..b5a499c1db57e 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -1,3 +1,4 @@ +#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows #include "ggml.h" #include @@ -5,6 +6,10 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + #define MAX_NARGS 3 #undef MIN @@ -197,8 +202,23 @@ bool check_gradient( float max_error_abs, float max_error_rel) { + static int n_threads = -1; + if (n_threads < 0) { + n_threads = GGML_DEFAULT_N_THREADS; + + const char *env = getenv("GGML_N_THREADS"); + if (env) { + n_threads = atoi(env); + } + + printf("GGML_N_THREADS = %d\n", n_threads); + } + struct ggml_cgraph gf = ggml_build_forward (f); + gf.n_threads = n_threads; + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + gb.n_threads = n_threads; ggml_graph_compute(ctx0, &gf); ggml_graph_reset (&gf); From 5ec8dd5a3c6a9a109351d2257bb9d53869bd0a94 Mon Sep 17 00:00:00 2001 From: Robyn Date: Sun, 25 Jun 2023 04:10:29 +1000 Subject: [PATCH 09/11] #1869 Fix null reference errors when training from scratch with CUDA (#1907) * #1869 Fix null reference errors when training from scratch with CUDA build Calling ggml_compute_forward when node->src0 was null was causing train-text-from-scratch.exe to terminate unexpectedly. * ggml : do not dereference src0 if NULL --------- Co-authored-by: Georgi Gerganov --- ggml-cuda.cu | 2 +- ggml.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 36a251ecce973..010682edb703c 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2635,7 +2635,7 @@ void ggml_cuda_free_scratch() { bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ ggml_cuda_func_t func; const bool any_on_device = tensor->backend == GGML_BACKEND_GPU - || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT + || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU); switch (tensor->op) { diff --git a/ggml.c b/ggml.c index ef9e8585d9deb..7104be01b87c7 100644 --- a/ggml.c +++ b/ggml.c @@ -14911,7 +14911,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm if (skip_cpu) { return; } - GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU); GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); #endif // GGML_USE_CUBLAS From e65ca7e14ac76c4046091da39d41a9017abaa9b3 Mon Sep 17 00:00:00 2001 From: sjinzh Date: Sun, 25 Jun 2023 13:45:44 +0800 Subject: [PATCH 10/11] zig : upgrade build system support (#1981) * upgrade zig build system support * zig : add new line at the end of the file --------- Co-authored-by: Georgi Gerganov --- build.zig | 91 +++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 47 deletions(-) diff --git a/build.zig b/build.zig index 306127ffe2a73..49c159ebf1e10 100644 --- a/build.zig +++ b/build.zig @@ -1,61 +1,58 @@ const std = @import("std"); +// Zig Version: 0.11.0-dev.3379+629f0d23b pub fn build(b: *std.build.Builder) void { const target = b.standardTargetOptions(.{}); - const optimize = b.standardReleaseOptions(); - const want_lto = b.option(bool, "lto", "Want -fLTO"); - - const lib = b.addStaticLibrary("llama", null); - lib.want_lto = want_lto; - lib.setTarget(target); - lib.setBuildMode(optimize); + const optimize = b.standardOptimizeOption(.{}); + const lib = b.addStaticLibrary(.{ + .name = "llama", + .target = target, + .optimize = optimize, + }); + lib.linkLibC(); lib.linkLibCpp(); lib.addIncludePath("."); - lib.addIncludePath("examples"); + lib.addIncludePath("./examples"); lib.addCSourceFiles(&.{ "ggml.c", }, &.{"-std=c11"}); lib.addCSourceFiles(&.{ "llama.cpp", }, &.{"-std=c++11"}); - lib.install(); - - const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto }; - - const exe = build_example("main", build_args); - _ = build_example("quantize", build_args); - _ = build_example("perplexity", build_args); - _ = build_example("embedding", build_args); - - // create "zig build run" command for ./main - - const run_cmd = exe.run(); - run_cmd.step.dependOn(b.getInstallStep()); - if (b.args) |args| { - run_cmd.addArgs(args); + b.installArtifact(lib); + + const examples = .{ + "main", + "baby-llama", + "embedding", + // "metal", + "perplexity", + "quantize", + "quantize-stats", + "save-load-state", + // "server", + "simple", + "train-text-from-scratch", + }; + + inline for (examples) |example_name| { + const exe = b.addExecutable(.{ + .name = example_name, + .target = target, + .optimize = optimize, + }); + exe.addIncludePath("."); + exe.addIncludePath("./examples"); + exe.addCSourceFiles(&.{ + std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}), + "examples/common.cpp", + }, &.{"-std=c++11"}); + exe.linkLibrary(lib); + b.installArtifact(exe); + const run_cmd = b.addRunArtifact(exe); + run_cmd.step.dependOn(b.getInstallStep()); + if (b.args) |args| run_cmd.addArgs(args); + const run_step = b.step("run_" ++ example_name, "Run the app"); + run_step.dependOn(&run_cmd.step); } - - const run_step = b.step("run", "Run the app"); - run_step.dependOn(&run_cmd.step); -} - -fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep { - const b = args.b; - const lib = args.lib; - const want_lto = args.want_lto; - - const exe = b.addExecutable(name, null); - exe.want_lto = want_lto; - lib.setTarget(args.target); - lib.setBuildMode(args.optimize); - exe.addIncludePath("."); - exe.addIncludePath("examples"); - exe.addCSourceFiles(&.{ - std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}), - "examples/common.cpp", - }, &.{"-std=c++11"}); - exe.linkLibrary(lib); - exe.install(); - - return exe; } From 66a2555ba6cab954c56d653b29c27bfbbacfbfb1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Jun 2023 09:07:03 +0300 Subject: [PATCH 11/11] readme : add Azure CI discussion link --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 6aa6ce319d93f..3a71e16db1aa3 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** +- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985 - p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1 - Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729