From 5588beb6d8846d1e1da3fed3fb71902272eaf9ee Mon Sep 17 00:00:00 2001 From: icpp Date: Thu, 23 Jan 2025 11:06:14 -0500 Subject: [PATCH 01/25] Update to llama.cpp sha 615212 main_.cpp --- src/main_.cpp | 629 +++++++++++++++++++++++--------------------------- 1 file changed, 288 insertions(+), 341 deletions(-) diff --git a/src/main_.cpp b/src/main_.cpp index 29f781a..ae3b24d 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -5,15 +5,14 @@ #include "utils.h" #include "main_.h" // ICPP-PATCH-END - +#include "arg.h" #include "common.h" - #include "console.h" +#include "log.h" +#include "sampling.h" #include "llama.h" +#include "chat-template.hpp" -#include -#include -#include #include #include #include @@ -39,16 +38,28 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant"; + static llama_context ** g_ctx; // static llama_model ** g_model; // Make this a global variable, accessible from common.cpp llama_model ** g_model; -static gpt_params * g_params; +static common_sampler ** g_smpl; +static common_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; +static void print_usage(int argc, char ** argv) { + (void) argc; + + LOG("\nexample usage:\n"); + LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]); + LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); + LOG("\n"); +} + static bool file_exists(const std::string & path) { std::ifstream f(path.c_str()); return f.good(); @@ -61,61 +72,22 @@ static bool file_is_empty(const std::string & path) { return f.tellg() == 0; } -static void write_logfile( - const llama_context * ctx, const gpt_params & params, const llama_model * model, - const std::vector & input_tokens, const std::string & output, - const std::vector & output_tokens -) { - if (params.logdir.empty()) { - return; - } - - const std::string timestamp = string_get_sortable_timestamp(); - - const bool success = fs_create_directory_with_parents(params.logdir); - if (!success) { - fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", - __func__, params.logdir.c_str()); - return; - } - - const std::string logfile_path = params.logdir + timestamp + ".yml"; - FILE * logfile = fopen(logfile_path.c_str(), "w"); - - if (logfile == NULL) { - fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); - return; - } - - fprintf(logfile, "binary: main\n"); - char model_desc[128]; - llama_model_desc(model, model_desc, sizeof(model_desc)); - yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc); - - fprintf(logfile, "\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "# Generation Results #\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "\n"); - - yaml_dump_string_multiline(logfile, "output", output.c_str()); - yaml_dump_vector_int(logfile, "output_tokens", output_tokens); - - llama_dump_timing_info_yaml(logfile, ctx); - fclose(logfile); -} - //icpp-start NO CONSOLE // #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) // static void sigint_handler(int signo) { // if (signo == SIGINT) { // if (!is_interacting && g_params->interactive) { -// is_interacting = true; +// is_interacting = true; 
+// need_insert_eot = true; // } else { // console::cleanup(); -// printf("\n"); -// llama_print_timings(*g_ctx); -// write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); +// LOG("\n"); +// common_perf_print(*g_ctx, *g_smpl); +// +// // make sure all logs are flushed +// LOG("Interrupted by user\n"); +// common_log_pause(common_log_main()); +// // _exit(130); // } // } @@ -123,49 +95,26 @@ static void write_logfile( // #endif //icpp-end NO CONSOLE -static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) user_data; - LOG_TEE("%s", text); -} - -static std::string chat_add_and_format(struct llama_model * model, std::vector & chat_msgs, std::string role, std::string content) { - llama_chat_msg new_msg{role, content}; - auto formatted = llama_chat_format_single( - model, g_params->chat_template, chat_msgs, new_msg, role == "user"); - chat_msgs.push_back({role, content}); - return formatted; -} - int main_(int argc, char ** argv, std::string principal_id, bool load_model_only, std::string &icpp_error_msg, std::ostringstream &conversation_ss, std::ostringstream &output_ss, const uint64_t &max_tokens, std::string &prompt_remaining, bool &generated_eog) { std::cout << std::string(__func__) << " Called with following arguments: " << std::endl; std::cout << "- principal_id = " << principal_id << std::endl; std::cout << "- load_model_only = " << load_model_only << std::endl; std::cout << "- max_tokens = " << max_tokens << std::endl; - gpt_params params; + common_params params; g_params = ¶ms; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) { // ICPP-PATCH-START - icpp_error_msg = "Error in gpt_params_print_usage."; + icpp_error_msg = "Error in common_params_parse."; // ICPP-PATCH-END return 1; } - llama_sampling_params & sparams = params.sparams; + common_init(); -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("main", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); - llama_log_set(llama_log_callback_logTee, nullptr); -#endif // LOG_DISABLE_LOGS - - // TODO: Dump params ? 
- //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); + auto & sparams = params.sampling; // save choice to use color for later // (note for later: this is a slightly awkward choice) @@ -173,53 +122,42 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only //icpp-patch atexit([]() { console::cleanup(); }); if (params.logits_all) { - printf("\n************\n"); - printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("************\n"); + LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.embedding) { - printf("\n************\n"); - printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("************\n"); + LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.n_ctx != 0 && params.n_ctx < 8) { - LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__); params.n_ctx = 8; } if (params.rope_freq_base != 0.0) { - LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); } if (params.rope_freq_scale != 0.0) { - LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); - } - - LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); + LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - LOG_TEE("%s: seed = %u\n", __func__, params.seed); - - std::mt19937 rng(params.seed); + LOG_INF("%s: llama backend init\n", __func__); - LOG("%s: llama backend init\n", __func__); llama_backend_init(); llama_numa_init(params.numa); static llama_model * model; // ICPP-PATCH: use static to preserve accross calls static llama_context * ctx; // ICPP-PATCH: use static to preserve accross calls - llama_context * ctx_guidance = NULL; - std::vector chat_msgs; + common_sampler * smpl = nullptr; // ICPP-PATCH-START // Don't give error if embd_inp = session_tokens. 
All is OK to just keep going @@ -238,17 +176,19 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only g_model = &model; g_ctx = &ctx; + g_smpl = &smpl; + + std::vector chat_msgs; // load the model and apply lora adapter, if any - LOG("%s: load the model and apply lora adapter, if any\n", __func__); - std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (sparams.cfg_scale > 1.f) { - struct llama_context_params lparams = llama_context_params_from_gpt_params(params); - ctx_guidance = llama_new_context_with_model(model, lparams); - } + LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); + common_init_result llama_init = common_init_from_params(params); + + model = llama_init.model.get(); + ctx = llama_init.context.get(); if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n", __func__); + LOG_ERR("%s: error: unable to load model\n", __func__); // ICPP-PATCH-START icpp_error_msg = std::format("{}: error: unable to load model)", __func__); // ICPP-PATCH-END @@ -264,28 +204,81 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } // ICPP-PATCH-END - const int n_ctx_train = llama_n_ctx_train(model); + // ICPP-TODO-START: This section is completely new... + const llama_vocab * vocab = llama_model_get_vocab(model); + auto chat_templates = common_chat_templates_from_model(model, params.chat_template); + + LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); + + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free"); + + struct ggml_threadpool_params tpp_batch = + ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); + struct ggml_threadpool_params tpp = + ggml_threadpool_params_from_cpu_params(params.cpuparams); + + set_process_priority(params.cpuparams.priority); + + struct ggml_threadpool * threadpool_batch = NULL; + if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { + threadpool_batch = ggml_threadpool_new_fn(&tpp_batch); + if (!threadpool_batch) { + LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); + return 1; + } + + // Start the non-batch threadpool in the paused state + tpp.paused = true; + } + + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); + if (!threadpool) { + LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + return 1; + } + + llama_attach_threadpool(ctx, threadpool, threadpool_batch); + // ICPP-TODO-END + + const int n_ctx_train = llama_model_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); - LOG("n_ctx: %d\n", n_ctx); if (n_ctx > n_ctx_train) { - LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, n_ctx); + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); + } + + // auto enable conversation mode if chat template is available + const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.template_default; + if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) { + if (has_chat_template) { + LOG_INF("%s: chat template is available, enabling conversation mode (disable 
it with -no-cnv)\n", __func__); + params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; + } else { + params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; + } + } + + // in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning + if (params.conversation_mode && !has_chat_template) { + LOG_WRN("%s: chat template is not available or is not supported. This may cause the model to output suboptimal responses\n", __func__); } // print chat template example in conversation mode - if (params.conversation) { + if (params.conversation_mode) { if (params.enable_chat_template) { - LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); + LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(*chat_templates.template_default, params.use_jinja).c_str()); } else { - LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); + LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); } } // print system information { - LOG_TEE("\n"); - LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + LOG_INF("\n"); } std::string path_session = params.path_prompt_cache; @@ -299,45 +292,56 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // ICPP-PATCH-END std::vector session_tokens; - if (!path_session.empty()) { - LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + if (!path_session.empty()) { + LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); if (!file_exists(path_session)) { - LOG_TEE("%s: session file does not exist, will create.\n", __func__); + LOG_INF("%s: session file does not exist, will create.\n", __func__); } else if (file_is_empty(path_session)) { - LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__); + LOG_INF("%s: The session file is empty. 
A new session will be initialized.\n", __func__); } else { // The file exists and is not empty session_tokens.resize(n_ctx); size_t n_token_count_out = 0; if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { - LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str()); // ICPP-PATCH-START icpp_error_msg = std::format("{}: error: failed to load session file '{}')", __func__, path_session.c_str()); // ICPP-PATCH-END return 1; } session_tokens.resize(n_token_count_out); - LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); + LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); } } - const bool add_bos = llama_should_add_bos_token(model); + const bool add_bos = llama_vocab_get_add_bos(vocab); if (!llama_model_has_encoder(model)) { - GGML_ASSERT(llama_add_eos_token(model) != 1); + GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); } - LOG("add_bos: %d\n", add_bos); + + LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos); std::vector embd_inp; + auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) { + common_chat_msg new_msg{role, content}; + auto formatted = common_chat_format_single(*chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja); + chat_msgs.push_back({role, content}); + LOG_DBG("formatted: '%s'\n", formatted.c_str()); + return formatted; + }; + { - auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty()) - ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode + auto prompt = (params.conversation_mode && params.enable_chat_template) + // format the system prompt in conversation mode (fallback to default if empty) + ? chat_add_and_format("system", params.prompt.empty() ? 
DEFAULT_SYSTEM_MESSAGE : params.prompt) + // otherwise use the prompt as is : params.prompt; if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { - LOG_TEE("tokenize the prompt\n"); - embd_inp = ::llama_tokenize(ctx, prompt, true, true); + LOG_DBG("tokenize the prompt\n"); + embd_inp = common_tokenize(ctx, prompt, true, true); } else { - LOG_TEE("use session tokens\n"); + LOG_DBG("use session tokens\n"); embd_inp = session_tokens; // ICPP-PATCH-START embd_inp_is_session_tokens = true; @@ -350,9 +354,8 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } // ICPP-PATCH-END - LOG_TEE("prompt: \"%s\"\n", log_tostr(prompt)); - LOG_TEE("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); - LOG_TEE("# tokens: %s\n", std::to_string(embd_inp.size()).c_str()); + LOG_DBG("prompt: \"%s\"\n", prompt.c_str()); + LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str()); } // Should not run without any tokens @@ -367,34 +370,16 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } // Tokenize negative prompt - std::vector guidance_inp; - int guidance_offset = 0; - int original_prompt_len = 0; - if (ctx_guidance) { - LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); - - guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true); - LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); - - std::vector original_inp = ::llama_tokenize(ctx, params.prompt, true, true); - LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); - - original_prompt_len = original_inp.size(); - guidance_offset = (int)guidance_inp.size() - original_prompt_len; - LOG("original_prompt_len: %s", log_tostr(original_prompt_len)); - LOG("guidance_offset: %s", log_tostr(guidance_offset)); - } - // ICPP-PATCH-START // when the prompt is empty, then embd_inp = session_tokens, and all is OK to just keep going. 
if (!embd_inp_is_session_tokens) { // ICPP-PATCH-END if ((int) embd_inp.size() > n_ctx - 4) { - LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); // ICPP-PATCH-START icpp_error_msg = std::format("{}: error: prompt is too long ({} tokens, max {})", __func__, (int) embd_inp.size(), n_ctx - 4); // ICPP-PATCH-END - return 1; + return 1; } // ICPP-PATCH-START } @@ -411,29 +396,28 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only n_matching_session_tokens++; } if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { - LOG_TEE("%s: using full prompt from session file\n", __func__); + LOG_INF("%s: using full prompt from session file\n", __func__); } else if (n_matching_session_tokens >= embd_inp.size()) { - LOG_TEE("%s: session file has exact match for prompt!\n", __func__); + LOG_INF("%s: session file has exact match for prompt!\n", __func__); } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { - LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_matching_session_tokens, embd_inp.size()); } else { - LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n", + __func__, n_matching_session_tokens, embd_inp.size()); } // remove any "future" tokens that we might have inherited from the previous session llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); } - LOGLN( - "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu", - log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size()); + LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", + embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size()); // if we will use the cache for the full prompt without reaching the end of the cache, force // reevaluation of the last token to recalculate the cached logits if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { - LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); + LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1); session_tokens.resize(embd_inp.size() - 1); } @@ -445,7 +429,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only params.n_keep += add_bos; // always keep the BOS token } - if (params.conversation) { + if (params.conversation_mode) { params.interactive_first = true; } @@ -455,30 +439,20 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } if (params.verbose_prompt) { - LOG_TEE("\n"); - LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + 
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); - } - - if (ctx_guidance) { - LOG_TEE("\n"); - LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str()); - LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); - for (int i = 0; i < (int) guidance_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); - } + LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str()); } if (params.n_keep > add_bos) { - LOG_TEE("%s: static prompt based on n_keep: '", __func__); + LOG_INF("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str()); } - LOG_TEE("'\n"); + LOG_CNT("'\n"); } - LOG_TEE("\n"); + LOG_INF("\n"); } // ctrl+C handling @@ -500,47 +474,56 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only //icpp-patch-end if (params.interactive) { - LOG_TEE("%s: interactive mode on.\n", __func__); + LOG_INF("%s: interactive mode on.\n", __func__); if (!params.antiprompt.empty()) { for (const auto & antiprompt : params.antiprompt) { - LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); + LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str()); if (params.verbose_prompt) { - auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); + auto tmp = common_tokenize(ctx, antiprompt, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); } } } } if (params.input_prefix_bos) { - LOG_TEE("Input prefix with BOS\n"); + LOG_INF("Input prefix with BOS\n"); } if (!params.input_prefix.empty()) { - LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str()); if (params.verbose_prompt) { - auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); + auto tmp = common_tokenize(ctx, params.input_prefix, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); } } } if (!params.input_suffix.empty()) { - LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); + LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); if (params.verbose_prompt) { - auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); + auto tmp = common_tokenize(ctx, params.input_suffix, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); } } } } - LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); - LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str()); - LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + + smpl = common_sampler_init(model, sparams); + if (!smpl) { + LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); + return 1; + } 
+ + LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl)); + LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); + LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str()); + + LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); // group-attention state // number of grouped KV tokens so far (used only if params.grp_attn_n > 1) @@ -554,9 +537,9 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT - LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); + LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); } - LOG_TEE("\n\n"); + LOG_INF("\n"); if (params.interactive) { const char * control_message; @@ -568,11 +551,15 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } - LOG_TEE("== Running in interactive mode. ==\n"); + LOG_INF("== Running in interactive mode. ==\n"); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); + LOG_INF( " - Press Ctrl+C to interject at any time.\n"); #endif - LOG_TEE( "%s\n", control_message); + LOG_INF( "%s", control_message); + if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) { + LOG_INF( " - Using default system message. 
To change it, set a different value via -p PROMPT or -f FILE argument.\n"); + } + LOG_INF("\n"); is_interacting = params.interactive_first; } @@ -586,7 +573,6 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only int n_remain = params.n_predict; int n_consumed = 0; int n_session_consumed = 0; - int n_past_guidance = 0; // ICPP-PATCH-START // We can only handle max_tokens evaluations per call @@ -612,28 +598,21 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only display = params.display_prompt; std::vector embd; - std::vector embd_guidance; // tokenized antiprompts std::vector> antiprompt_ids; antiprompt_ids.reserve(params.antiprompt.size()); for (const std::string & antiprompt : params.antiprompt) { - antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true)); - } - - struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); - if (!ctx_sampling) { - fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); - exit(1); + antiprompt_ids.emplace_back(::common_tokenize(ctx, antiprompt, false, true)); } if (llama_model_has_encoder(model)) { int enc_input_size = embd_inp.size(); llama_token * enc_input_buf = embd_inp.data(); - if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); + if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) { + LOG_ERR("%s : failed to eval\n", __func__); // ICPP-PATCH-START icpp_error_msg = std::format("{}: error: failed to eval (-1-)", __func__); // ICPP-PATCH-END @@ -641,8 +620,8 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } llama_token decoder_start_token_id = llama_model_decoder_start_token(model); - if (decoder_start_token_id == -1) { - decoder_start_token_id = llama_token_bos(model); + if (decoder_start_token_id == LLAMA_TOKEN_NULL) { + decoder_start_token_id = llama_vocab_bos(vocab); } embd_inp.clear(); @@ -662,9 +641,8 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only embd.resize(max_embd_size); // console::set_display(console::error); - printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + LOG_WRN("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); // console::set_display(console::reset); - fflush(stdout); } if (ga_n == 1) { @@ -672,16 +650,22 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches - if (n_past + (int) embd.size() + std::max(0, guidance_offset) >= n_ctx) { + + if (n_past + (int) embd.size() >= n_ctx) { + if (!params.ctx_shift){ + LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__); + break; + } + if (params.n_predict == -2) { - LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; } const int n_left = n_past - params.n_keep; const int n_discard = n_left/2; - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); @@ -689,15 +673,11 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only n_past -= n_discard; - if (ctx_guidance) { - n_past_guidance -= n_discard; - } - - LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); + LOG_DBG("after swap: n_past = %d\n", n_past); - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); - LOG("clear session path\n"); + LOG_DBG("clear session path\n"); path_session.clear(); } } else { @@ -707,10 +687,10 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only const int bd = (ga_w/ga_n)*(ga_n - 1); const int dd = (ga_w/ga_n) - ib*bd - ga_w; - LOG("\n"); - LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); - LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); - LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); + LOG_DBG("\n"); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); + LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); @@ -720,7 +700,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only ga_i += ga_w/ga_n; - LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); + LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); } } @@ -757,49 +737,6 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } } - // evaluate tokens in batches - // embd is typically prepared beforehand to fit within a batch, but not always - if (ctx_guidance) { - int input_size = 0; - llama_token * input_buf = NULL; - - if (n_past_guidance < (int) 
guidance_inp.size()) { - // Guidance context should have the same data with these modifications: - // - // * Replace the initial prompt - // * Shift everything by guidance_offset - embd_guidance = guidance_inp; - if (embd.begin() + original_prompt_len < embd.end()) { - embd_guidance.insert( - embd_guidance.end(), - embd.begin() + original_prompt_len, - embd.end() - ); - } - - input_buf = embd_guidance.data(); - input_size = embd_guidance.size(); - - LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); - } else { - input_buf = embd.data(); - input_size = embd.size(); - } - - for (int i = 0; i < input_size; i += params.n_batch) { - int n_eval = std::min(input_size - i, params.n_batch); - if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); - // ICPP-PATCH-START - icpp_error_msg = std::format("{}: error: failed to eval (-2-)", __func__); - // ICPP-PATCH-END - return 1; - } - - n_past_guidance += n_eval; - } - } - for (int i = 0; i < (int) embd.size(); i += params.n_batch) { int n_eval = (int) embd.size() - i; if (n_eval > params.n_batch) { @@ -813,10 +750,10 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } // ICPP-PATCH-END - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); - if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); + if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) { + LOG_ERR("%s : failed to eval\n", __func__); // ICPP-PATCH-START icpp_error_msg = std::format("{}: error: failed to eval (-3-)", __func__); // ICPP-PATCH-END @@ -825,17 +762,17 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only n_past += n_eval; - LOG("n_past = %d\n", n_past); + LOG_DBG("n_past = %d\n", n_past); // Display total tokens alongside total time if (params.n_print > 0 && n_past % params.n_print == 0) { - LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); + LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); } // ICPP-PATCH-START // Keep track of the processed conversation tokens and the remaining prompt for (int j=0; jprev).c_str()); + // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); embd.push_back(id); @@ -904,16 +840,16 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // decrement remaining sampling budget --n_remain; - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { // some user input remains from prompt or interaction, forward it to processing - LOG_TEE("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false); + common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -938,7 +874,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only int n_prompt_tokens_remaining = 0; size_t iii = 0; for 
(auto id : embd_inp) { - const std::string token_str = llama_token_to_piece(ctx, id, true); // include special tokens + const std::string token_str = common_token_to_piece(ctx, id, true); // include special tokens if (iii < n_consumed) { prompt_consumed += token_str; } else { @@ -954,10 +890,10 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // display text if (input_echo && display) { for (auto id : embd) { - const std::string token_str = llama_token_to_piece(ctx, id, params.special); + const std::string token_str = common_token_to_piece(ctx, id, params.special); // Console/Stream Output - fprintf(stdout, "%s", token_str.c_str()); + LOG("%s", token_str.c_str()); // Record Displayed Tokens To Log // Note: Generated tokens are created one by one hence this check @@ -969,8 +905,6 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only output_tokens.push_back(id); output_ss << token_str; } - - fflush(stdout); } } @@ -985,7 +919,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // check for reverse prompt in the last n_prev tokens if (!params.antiprompt.empty()) { const int n_prev = 32; - const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev); + const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev); is_antiprompt = false; // Check if each of the reverse prompts appears at the end of the output. @@ -1007,7 +941,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } // check for reverse prompt using special tokens - llama_token last_token = llama_sampling_last(ctx_sampling); + llama_token last_token = common_sampler_last(smpl); for (std::vector ids : antiprompt_ids) { if (ids.size() == 1 && last_token == ids[0]) { if (params.interactive) { @@ -1019,52 +953,52 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } if (is_antiprompt) { - LOG("found antiprompt: %s\n", last_output.c_str()); + LOG_DBG("found antiprompt: %s\n", last_output.c_str()); } } // deal with end of generation tokens in interactive mode - if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) { - LOG("found an EOG token\n"); + if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) { + LOG_DBG("found an EOG token\n"); if (params.interactive) { if (!params.antiprompt.empty()) { // tokenize and inject first reverse prompt - const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true); + const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true); embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); is_antiprompt = true; } if (params.enable_chat_template) { - chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); + chat_add_and_format("assistant", assistant_ss.str()); } is_interacting = true; - printf("\n"); + LOG("\n"); } } // if current token is not EOG, we add it to current assistant message - if (params.conversation) { - auto id = llama_sampling_last(ctx_sampling); - assistant_ss << llama_token_to_piece(ctx, id, false); + if (params.conversation_mode) { + const auto id = common_sampler_last(smpl); + assistant_ss << common_token_to_piece(ctx, id, false); } if (n_past > 0 && is_interacting) { - LOG("waiting for user input\n"); + LOG_DBG("waiting for user input\n"); - if (params.conversation) { - printf("\n> "); + if (params.conversation_mode) { + LOG("\n> "); } if (params.input_prefix_bos) { - 
LOG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_token_bos(model)); + LOG_DBG("adding input prefix BOS token\n"); + embd_inp.push_back(llama_vocab_bos(vocab)); } std::string buffer; - if (!params.input_prefix.empty() && !params.conversation) { - LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); - printf("%s", params.input_prefix.c_str()); + if (!params.input_prefix.empty() && !params.conversation_mode) { + LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + LOG("%s", params.input_prefix.c_str()); } // color user input only @@ -1086,12 +1020,12 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // Entering a empty line lets the user pass control back if (buffer.length() > 1) { // append input suffix if any - if (!params.input_suffix.empty() && !params.conversation) { - LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); - printf("%s", params.input_suffix.c_str()); + if (!params.input_suffix.empty() && !params.conversation_mode) { + LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + LOG("%s", params.input_suffix.c_str()); } - LOG("buffer: '%s'\n", buffer.c_str()); + LOG_DBG("buffer: '%s'\n", buffer.c_str()); const size_t original_size = embd_inp.size(); @@ -1099,21 +1033,21 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only string_process_escapes(buffer); } - bool format_chat = params.conversation && params.enable_chat_template; + bool format_chat = params.conversation_mode && params.enable_chat_template; std::string user_inp = format_chat - ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer)) + ? chat_add_and_format("user", std::move(buffer)) : std::move(buffer); // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) - const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); - const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); + const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true); + const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat); + const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); // if user stop generation mid-way, we must add EOT to finish model's last response if (need_insert_eot && format_chat) { - llama_token eot = llama_token_eot(model); - embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot); + llama_token eot = llama_vocab_eot(vocab); + embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? 
llama_vocab_eos(vocab) : eot); need_insert_eot = false; } @@ -1124,16 +1058,16 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only for (size_t i = original_size; i < embd_inp.size(); ++i) { const llama_token token = embd_inp[i]; output_tokens.push_back(token); - output_ss << llama_token_to_piece(ctx, token); + output_ss << common_token_to_piece(ctx, token); } // reset assistant message assistant_ss.str(""); n_remain -= line_inp.size(); - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { - LOG("empty line, passing control back\n"); + LOG_DBG("empty line, passing control back\n"); } input_echo = false; // do not echo this again @@ -1141,7 +1075,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only if (n_past > 0) { if (is_interacting) { - llama_sampling_reset(ctx_sampling); + common_sampler_reset(smpl); } is_interacting = false; } @@ -1154,8 +1088,8 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // ICPP-PATCH-END // end of generation - if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) { - LOG_TEE(" [end of text]\n"); + if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) { + LOG(" [end of text]\n"); // break; // we do not break the loop here, but we do it above // once the eog token has been decoded and added to conversation_ss & session_tokens // ICPP-PATCH-START @@ -1179,17 +1113,19 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // ICPP-PATCH-START std::cout << "\nSaving " << std::to_string(session_tokens.size()) << " tokens to session file " << path_session << std::endl; // ICPP-PATCH-END - LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } - llama_print_timings(ctx); - write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); + LOG("\n\n"); + common_perf_print(ctx, smpl); - if (ctx_guidance) { llama_free(ctx_guidance); } + common_sampler_free(smpl); // ICPP-PATCH-START + // TODO-615212 -- This is old code that we had outcommented + // REMOVE // Do NOT free ctx & model storage // -> we made `ctx` & `model` data static, so they are maintained across calls to the LLM // -> we do NOT reset g_ctx & g_model @@ -1197,22 +1133,28 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // llama_free(ctx); // llama_free_model(model); + // TODO-615212 -- Make sure this is correct + // LEAVE IT IN // Do reset all other static memory reset_static_memory(); // ICPP-PATCH-END - llama_sampling_free(ctx_sampling); + // TODO-615212 -- Make sure this is now handled in common_sampler_free + // REMOVE + // llama_sampling_free(ctx_sampling); llama_backend_free(); -#ifndef LOG_DISABLE_LOGS - LOG_TEE("Log end\n"); -#endif // LOG_DISABLE_LOGS + ggml_threadpool_free_fn(threadpool); + ggml_threadpool_free_fn(threadpool_batch); return 0; } // ICPP-PATCH-START: // functions added for running on IC + +// TODO-615212 -- Make sure this is now handled in common_sampler_free +// REMOVE void free_ctx() { if (g_ctx && *g_ctx) { llama_free(*g_ctx); @@ -1220,6 +1162,9 @@ void free_ctx() { g_ctx = nullptr; } } + +// TODO-615212 -- Make sure this is correct +// LEAVE IT IN void free_model() { if (g_model && *g_model) 
{ llama_free_model(*g_model); @@ -1227,6 +1172,8 @@ void free_model() { g_model = nullptr; } } +// TODO-615212 -- Make sure this is correct +// LEAVE IT IN void reset_static_memory() { // Tip: to find what must be reset, use a native debug build and stop here // in vscode. Then check the static memory section in VARIABLES. From 3c82921abdbcc0a03e723075d6120d3dfbddf062 Mon Sep 17 00:00:00 2001 From: icpp Date: Thu, 23 Jan 2025 21:01:08 -0500 Subject: [PATCH 02/25] build-native : files compile --- .gitignore | 2 +- README-contributors-guide.md | 282 ++++++++++++++++++++++++++++------- README.md | 3 +- icpp.toml | 6 +- src/main_.cpp | 11 +- src/run.cpp | 15 +- 6 files changed, 249 insertions(+), 70 deletions(-) diff --git a/.gitignore b/.gitignore index 6a6ae91..da96b82 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ # Misc -llama_cpp_onicai_fork +llama_cpp_onicai_fork* *.code-workspace x y diff --git a/README-contributors-guide.md b/README-contributors-guide.md index ef460ba..0c5f74e 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -103,13 +103,28 @@ git push origin --tags Take following steps locally: - git fetch -- Copy `src/llama_cpp_onicai_fork` to `/llama_cpp_onica_fork_` - - This is just as a reference. We will remove this folder once all done. +- This is the git-sha of the llama.cpp versions we branched from: + - `615212` (git-sha-new) , with release-tag `b4532` + - `b841d0` (git-sha-old) , no release-tag + - `5cdb37` (git-sha-older), no release-tag + +- Start with a fresh clone of llama_cpp_onicai_fork: + ```bash + # From folder: llama_cpp_canister\src + + # Copy old version, as a reference to use with meld + # This is just as a reference. You can remove this folder once all done. + # (-) Make sure the current `onicai` branch is checked out. + # The one that branched off from `git-sha-old` + cp llama_cpp_onicai_fork llama_cpp_onicai_fork_ + + # Clone the new version in place + git clone git@github.com:onicai/llama_cpp_onicai_fork.git + ``` -- from master, create a new branch: `onicai-` +- In llama_cpp_onicai_fork, from master, create a new branch: `onicai-` - For `git-sha`, use the short commit sha from which we're branching. + For `git-sha-new`, use the short commit sha from which we're branching. ## Update all files @@ -118,63 +133,84 @@ listed in [icpp.toml](https://github.com/onicai/llama_cpp_canister/blob/main/icp header files. As you do your upgrade, modify the descriptions below, to help with the next upgrade: -We use `meld` for comparing the files. 
+We use `meld` for comparing the files: + +```bash +brew install --cask dehesselle-meld +``` ### cpp_paths #### main_.cpp -`meld main_.cpp llama_cpp_onicai_fork/examples/main/main.cpp` + +```bash +# from folder: llama_cpp_canister/src + +# To do the actual changes +meld main_.cpp llama_cpp_onicai_fork/examples/main/main.cpp + +# To check what has changed between and +meld llama_cpp_onicai_fork/examples/main/main.cpp llama_cpp_onicai_fork_/examples/main/main.cpp +``` - use `main_` instead of `main` -- A few items related to console & ctrl+C need to be outcommented +- A few items related to console, ctrl+C & threading need to be outcommented +- Added logic for running in a canister with multiple update calls #### llama_cpp_onicai_fork/src/llama.cpp +```bash +# from folder: llama_cpp_canister/src +# To do the actual changes +meld llama_cpp_onicai_fork/src/llama.cpp llama_cpp_onicai_fork_/src/llama.cpp +``` - add `#include "ic_api.h"` -- replace `throw std::runtime_error(format` with `IC_API::trap(std::string("RUNTIME ERROR: ") + format` -- replace `throw` with `IC_API::trap` +- replace `throw std::runtime_error` with `IC_API::trap` - outcomment `try - catch`. The program will abrupt in case of thrown exceptions. -- outcomment threading related items: - - `#include ` - - `#include ` - - `#include ` +- outcomment threading related items - outcomment these functions completely: - `llama_tensor_quantize_internal` - `llama_model_quantize_internal` #### llama_cpp_onicai_fork/src/llama-vocab.cpp +```bash +# from folder: llama_cpp_canister/src +meld llama_cpp_onicai_fork/src/llama-vocab.cpp llama_cpp_onicai_fork_/src/llama-vocab.cpp +``` - add `#include "ic_api.h"` -- replace `throw std::runtime_error(format` with `IC_API::trap(std::string("RUNTIME ERROR: ") + format` +- replace `throw std::runtime_error` with `IC_API::trap` - outcomment `try - catch`. The program will abrupt in case of thrown exceptions. -- add a check on `llama_token_bos(model)`, else the llama2.c models never stop generating: - ``` - bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { - return token != -1 && ( - token == llama_token_eos_impl(vocab) || - token == llama_token_eot_impl(vocab) || - token == llama_token_bos_impl(vocab) // ICPP-PATCH: the llama2.c model predicts bos without first predicting an eos - ); - } - ``` #### llama_cpp_onicai_fork/src/llama-grammar.cpp -No changes needed +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. #### llama_cpp_onicai_fork/src/llama-sampling.cpp -No changes needed +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` #### llama_cpp_onicai_fork/src/unicode-data.cpp - no modifications needed for the IC #### llama_cpp_onicai_fork/src/unicode.cpp - add `#include "ic_api.h"` -- replace `throw` with `IC_API::trap` +- replace `throw std::runtime_error` with `IC_API::trap` +- replace `throw std::invalid_argument` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. -#### llama_cpp_onicai_fork/common/json-schema-to-grammar.cpp +#### llama_cpp_onicai_fork/common/arg.cpp - add `#include "ic_api.h"` -- replace `throw` with `IC_API::trap` +- replace `throw std::runtime_error` with `IC_API::trap` +- replace `throw std::invalid_argument` with `IC_API::trap` +- return dummy values (unreachable) after each IC_API::trap - outcomment `try - catch`. 
The program will abrupt in case of thrown exceptions. +#### llama_cpp_onicai_fork/common/json-schema-to-grammar.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- replace `throw std::out_of_range` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. #### llama_cpp_onicai_fork/common/build-info.cpp - run this command to create it: @@ -182,60 +218,194 @@ No changes needed make build-info-cpp-wasm ``` -#### llama_cpp_onicai_fork/common/grammar-parser.cpp -- add `#include "ic_api.h"` -- replace `throw` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - #### llama_cpp_onicai_fork/common/sampling.cpp - add `#include "ic_api.h"` -- replace `throw` with `IC_API::trap` +- replace `throw std::runtime_error` with `IC_API::trap` #### llama_cpp_onicai_fork/common/common.cpp -- add `#include "ic_api.h"` -- replace `throw` with `IC_API::trap` -- outcomment all code related to `` +- add right below `#include llama.h`: +```C++ +// ICPP-PATCH-START +#include "ic_api.h" +extern llama_model ** g_model; // The global variable from main_.cpp +// ICPP-PATCH-END +``` +- replace `throw std::runtime_error` with `IC_API::trap` +- replace `throw std::invalid_argument` with `IC_API::trap` - outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - outcomment `std::getenv` + Compare to changes made last time (!) + +- outcomment all code related to ``: + Compare to changes made last time (!) + - cpu_get_num_physical_cores + +- outcomment #ifdef LLAMA_USE_CURL + Compare to changes made last time (!) +#### llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp +No updates needed for icpp-pro --- ### c_paths #### llama_cpp_onicai_fork/ggml/src/ggml.c -- outcomment all code related to signals +- outcomment all code related to signals & threading + - `#include "ggml-threading.h"` - `#include ` -- Many threading outcomments. -#### llama_cpp_onicai_fork/ggml/src/ggml-alloc.c -No updates needed for icpp-pro -#### llama_cpp_onicai_fork/ggml/src/ggml-backend.c +#### llama_cpp_onicai_fork/ggml/src/ggml-alloc.c No updates needed for icpp-pro #### llama_cpp_onicai_fork/ggml/src/ggml-quants.c No updates needed for icpp-pro -#### llama_cpp_onicai_fork/ggml/src/ggml-aarch64.c -No updates needed for icpp-pro +#### llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp +- outcomment all code related to threading --- ### headers to modify -#### llama_cpp_onicai_fork/common/log.h -- `#include ` -- Some other threading code - -#### llama_cpp_onicai_fork/common/common.h -- `#include ` +#### llama_cpp_onicai_fork/common/chat-template.hpp +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. ## llama_cpp_onicai_fork: replace `onicai` branch +TODO: RETHINK THIS LOGIC... 
+(-) Perhaps it is better to keep all the `onicai-` branches
+(-) And just change the default branch to `onicai-`
+
+That way:
+(-) when someone clones, they are at the correct branch
+(-) from the name, it is immediately clear what llama.cpp version was used
+(-) we preserve the full history
+
+---
 Do NOT merge the `onicai-` branch into the `onicai` branch, but replace it:
 ```
-git branch -m onicai onicai-
-git branch -m onicai- onicai
+git branch -m onicai onicai-
+git branch -m onicai- onicai
 git push origin onicai:onicai
-git push origin onicai-:onicai-
-```
\ No newline at end of file
+git push origin onicai-:onicai-
+```
+
+
+------------
+TODO: search in code files for: TODO-615212
+
+(-) main_.cpp includes a new file: `llama_cpp_onicai_fork/common/chat-template.hpp`
+    This is from Google, and a general chat_template, with tool calling !!!
+
+(-) main_.cpp has a new static global `g_smpl`:
+    static common_sampler ** g_smpl;
+
+    Q: Does this need to become a global variable, accessible from common.cpp ?
+       Like we did for g_model ?
+
+    In `common/common.cpp` we added:
+    ```
+    // ICPP-PATCH-START
+    #include "ic_api.h"
+    extern llama_model ** g_model; // The global variable from main_.cpp
+    // ICPP-PATCH-END
+    ```
+
+(-) main_.cpp renamed type for `g_params`:
+    from: static gpt_params * g_params;
+    to : static common_params * g_params;
+
+    Q: Does this need to become a global variable, accessible from common.cpp ?
+       Like we did for g_model ?
+
+(-) main_.cpp line 142: common_sampler * smpl = nullptr;
+
+    Q: Does `smpl` need to become a static variable, like `model` & `ctx` ?
+
+(-) main_.cpp line 147: // Don't give error if embd_inp = session_tokens. All is OK to just keep going
+
+    Q: Is this logic for prompt_remaining still valid?
+
+(-) main_.cpp line 208: // ICPP-TODO-START: This section is completely new...
+    COMPLETELY NEW SECTION FOR THREADPOOLs...
+
+(-) LOG & LOG_TEE have been replaced by LOG, LOG_ERR, LOG_WRN, LOG_INF, LOG_CNT
+    -> LOG is used just for Console/Stream Output
+    -> LOG_xxx is used for ERR, WRN, INF, CNT --> Not sure yet where this goes...
+
+    Q1: Did we change anything to LOG & LOG_TEE to get it to work ?
+    Q2: Are we still using LOG & LOG_TEE ourselves? If so, replace it.
+    Q3: Can we remove the LOG & LOG_TEE ?
+    Q4: Do we need to update the README about downloading different LOG files?
+
+(-) main_.cpp calls common_token_to_piece instead of llama_token_to_piece
+
+    Q: Is this a new file: common_token_to_piece
+    A: No, it is in common.cpp
+
+(-) main_.cpp calls common_tokenize instead of llama_tokenize
+
+    Q: Is this a new file: common_tokenize
+    A: No, it is in common.cpp
+
+(-) main_.cpp line 516, 826: New sampling subsystem !
+
+    Q: Are these new files:
+       - common_sampler_init
+       - common_sampler_sample
+       - common_sampler_accept
+    A: No, it is in sampling.cpp
+
+(-) main_.cpp line 1123: common_sampler_free(smpl)
+
+    We had outcommented code to NOT free the ctx & model storage:
+    // Do NOT free ctx & model storage
+    // -> we made `ctx` & `model` data static, so they are maintained across calls to the LLM
+    // -> we do NOT reset g_ctx & g_model
+    // -> we moved this into a free_model function, which can be called by canister's load_model
+    // llama_free(ctx);
+    // llama_free_model(model);
+
+    // TODO-615212 -- Make sure this is correct
+    // Do reset all other static memory
+    reset_static_memory();
+
+    Q1: Has this all moved into common_sampler_free ?
+
+    Q2: Update usage of the free_model function?
+
+    Q3: is reset_static_memory still correct ?
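+
+    Note (a sketch only, not existing code): IF `smpl` is kept as a static pointer and
+    exposed via `g_smpl` the same way `g_model` is, a cleanup helper analogous to the
+    existing free_ctx() / free_model() could look like the snippet below. The name
+    free_sampler and the non-static g_smpl are assumptions.
+    ```C++
+    // Hypothetical helper in main_.cpp, mirroring free_ctx() / free_model()
+    void free_sampler() {
+      if (g_smpl && *g_smpl) {
+        common_sampler_free(*g_smpl); // releases the sampler created by common_sampler_init
+        *g_smpl = nullptr;
+        g_smpl = nullptr;
+      }
+    }
+    ```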
+ + Q4: Is llama_sampling_free(ctx_sampling) now handled by common_sampler_free(smpl) ? + + +(-) llama-vocab.cpp --- This function is no longer there. Is tinystories still working? + + We had added a check on `llama_token_bos(model)`, else the llama2.c models never stop generating: + ``` + bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { + return token != -1 && ( + token == llama_token_eos_impl(vocab) || + token == llama_token_eot_impl(vocab) || + token == llama_token_bos_impl(vocab) // ICPP-PATCH: the llama2.c model predicts bos without first predicting an eos + ); + } + ``` +(-) NOTE: `common/grammar-parser.cpp` is no longer there. + It appears to be fully included in `src/llama-grammar.cpp` + +(-) NOTE: `llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp` used to be `llama_cpp_onicai_fork/ggml/src/ggml-backend.c` + +(-) NOTE: `llama_cpp_onicai_fork/ggml/src/ggml-aarch64.c` no longer exists + Previous update: No updates needed for icpp-pro + +(-) NOTE: `llama_cpp_onicai_fork/common/log.h` no update was needed this time: + Previous update: + - `#include ` + - Some other threading code + +(-) NOTE: `llama_cpp_onicai_fork/common/common.h` no update was needed this time: + Previous update: + - `#include ` \ No newline at end of file diff --git a/README.md b/README.md index 16a33a5..f373336 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ WARNING: Currently, the canister can only be build on a `Mac` ! cd src git clone git@github.com:onicai/llama_cpp_onicai_fork.git +TODO - DO WE STILL NEED THIS??? # Initialize the submodules of the llama_cpp_onicai_fork repo cd llama_cpp_onicai_fork git submodule init @@ -203,7 +204,7 @@ WARNING: Currently, the canister can only be build on a `Mac` ! # Remove the prompt cache when done - this keeps stable memory usage at a minimum dfx canister call llama_cpp remove_prompt_cache '(record { args = vec {"--prompt-cache"; "prompt.cache"} })' - + ``` Note: The sequence of update calls to the canister is required because the Internet Computer has a limitation diff --git a/icpp.toml b/icpp.toml index 09b11e7..2e42b58 100644 --- a/icpp.toml +++ b/icpp.toml @@ -10,9 +10,10 @@ cpp_paths = [ "src/llama_cpp_onicai_fork/src/unicode.cpp", "src/llama_cpp_onicai_fork/common/json-schema-to-grammar.cpp", "src/llama_cpp_onicai_fork/common/build-info.cpp", - "src/llama_cpp_onicai_fork/common/grammar-parser.cpp", "src/llama_cpp_onicai_fork/common/sampling.cpp", "src/llama_cpp_onicai_fork/common/common.cpp", + "src/llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp", + "src/llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp", "src/*.cpp", ] cpp_include_dirs = [ @@ -20,6 +21,7 @@ cpp_include_dirs = [ "src/llama_cpp_onicai_fork/include", "src/llama_cpp_onicai_fork/src", "src/llama_cpp_onicai_fork/ggml/include", + "src/llama_cpp_onicai_fork/ggml/src", "src/llama_cpp_onicai_fork/common", ] # NOTE: Adding compile flag "-msimd128" might be too much. 
It will compile everything with simd @@ -33,9 +35,7 @@ cpp_link_flags = [] c_paths = [ "src/llama_cpp_onicai_fork/ggml/src/ggml.c", "src/llama_cpp_onicai_fork/ggml/src/ggml-alloc.c", - "src/llama_cpp_onicai_fork/ggml/src/ggml-backend.c", "src/llama_cpp_onicai_fork/ggml/src/ggml-quants.c", - "src/llama_cpp_onicai_fork/ggml/src/ggml-aarch64.c", ] c_include_dirs = [ "src/llama_cpp_onicai_fork", diff --git a/src/main_.cpp b/src/main_.cpp index ae3b24d..f7e7f6a 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -361,10 +361,10 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // Should not run without any tokens if (embd_inp.empty()) { if (add_bos) { - embd_inp.push_back(llama_token_bos(model)); - LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + embd_inp.push_back(llama_vocab_bos(vocab)); + LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); } else { - LOG_TEE("error: input is empty\n"); + LOG_ERR("input is empty\n"); return -1; } } @@ -719,7 +719,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // ICPP-PATCH-START // Keep track of the processed conversation tokens and the remaining prompt int id = embd[i]; - const std::string token_str = llama_token_to_piece(ctx, id, params.special); + const std::string token_str = common_token_to_piece(ctx, id, params.special); conversation_ss << token_str; // if (prompt_remaining.find(token_str) == 0) { @@ -1164,10 +1164,11 @@ void free_ctx() { } // TODO-615212 -- Make sure this is correct +// llama_model_free is a replacement for llama_free_model // LEAVE IT IN void free_model() { if (g_model && *g_model) { - llama_free_model(*g_model); + llama_model_free(*g_model); *g_model = nullptr; g_model = nullptr; } diff --git a/src/run.cpp b/src/run.cpp index dda1c0e..36e4c4a 100644 --- a/src/run.cpp +++ b/src/run.cpp @@ -7,6 +7,8 @@ #include "max_tokens.h" #include "utils.h" +#include "arg.h" + #include #include #include @@ -26,6 +28,11 @@ (-) run_update */ + +static void print_usage(int argc, char ** argv) { + // do nothing function +} + void new_chat() { IC_API ic_api(CanisterUpdate{std::string(__func__)}, false); std::string error_msg; @@ -43,8 +50,8 @@ void new_chat() { // Create/reset a prompt-cache file to zero length, will reset the LLM state for that conversation // Get the cache filename from --prompt-cache in args - gpt_params params; - if (!gpt_params_parse(argc, argv.data(), params)) { + common_params params; + if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, print_usage)) { error_msg = "Cannot parse args."; send_output_record_result_error_to_wire( ic_api, Http::StatusCode::InternalServerError, error_msg); @@ -131,8 +138,8 @@ void remove_prompt_cache() { auto [argc, argv, args] = get_args_for_main(ic_api); // Get the cache filename from --prompt-cache in args - gpt_params params; - if (!gpt_params_parse(argc, argv.data(), params)) { + common_params params; + if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, print_usage)) { error_msg = "Cannot parse args."; send_output_record_result_error_to_wire( ic_api, Http::StatusCode::InternalServerError, error_msg); From 5cd89f3da15f73ce5d54b8f21c15f936d098cc58 Mon Sep 17 00:00:00 2001 From: icpp Date: Thu, 23 Jan 2025 21:01:27 -0500 Subject: [PATCH 03/25] format --- src/run.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/run.cpp b/src/run.cpp index 
36e4c4a..a30991d 100644 --- a/src/run.cpp +++ b/src/run.cpp @@ -28,8 +28,7 @@ (-) run_update */ - -static void print_usage(int argc, char ** argv) { +static void print_usage(int argc, char **argv) { // do nothing function } @@ -51,7 +50,8 @@ void new_chat() { // Create/reset a prompt-cache file to zero length, will reset the LLM state for that conversation // Get the cache filename from --prompt-cache in args common_params params; - if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, print_usage)) { + if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, + print_usage)) { error_msg = "Cannot parse args."; send_output_record_result_error_to_wire( ic_api, Http::StatusCode::InternalServerError, error_msg); @@ -139,7 +139,8 @@ void remove_prompt_cache() { // Get the cache filename from --prompt-cache in args common_params params; - if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, print_usage)) { + if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, + print_usage)) { error_msg = "Cannot parse args."; send_output_record_result_error_to_wire( ic_api, Http::StatusCode::InternalServerError, error_msg); From 5ea6d68d651b6a7e8f0fbe47de7a2f4eede27a93 Mon Sep 17 00:00:00 2001 From: icpp Date: Fri, 24 Jan 2025 12:45:13 -0500 Subject: [PATCH 04/25] mockic.exe builds !! --- README-contributors-guide.md | 69 +++++++++++++++++++++++++++++++++--- icpp.toml | 15 ++++++++ 2 files changed, 80 insertions(+), 4 deletions(-) diff --git a/README-contributors-guide.md b/README-contributors-guide.md index 0c5f74e..0cf06d1 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -190,6 +190,16 @@ meld llama_cpp_onicai_fork/src/llama-vocab.cpp llama_cpp_onicai_fork_ + #include + #### llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp No updates needed for icpp-pro @@ -264,6 +308,12 @@ No updates needed for icpp-pro #### llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp - outcomment all code related to threading +#### llama_cpp_onicai_fork/ggml/src/ggml-backend-reg.cpp +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/gguf.cpp +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + --- ### headers to modify @@ -296,9 +346,6 @@ git push origin onicai-:onicai- ------------ TODO: search in code files for: TODO-615212 -(-) main_.cpp includes a new file: `llama_cpp_onicai_fork/common/chat-template.hpp` - This is from Google, and a general chat_template, with tool calling !!! - (-) main_.cpp has a new static `global g_smpl`: static common_sampler ** g_smpl; @@ -393,6 +440,20 @@ TODO: search in code files for: TODO-615212 ); } ``` + +(-) DEBUG: `llama_cpp_onicai_fork/common/log.cpp` step through the logic + - verify the outcommented logic makes sense, or if we should just + completely remove the pause() & resume() functions. + +---------------------------------------------------------- +NOTES: + +(-) main_.cpp includes a new file: `llama_cpp_onicai_fork/common/chat-template.hpp` + This is from Google, and a general chat_template, with tool calling !!! + +(-) All the LLM architectures supported by llama_cpp_canister are listed in + `src/llama_cpp_onicai_fork/src/llama-arch.cpp` + (-) NOTE: `common/grammar-parser.cpp` is no longer there. 
It appears to be fully included in `src/llama-grammar.cpp` diff --git a/icpp.toml b/icpp.toml index 2e42b58..b52633a 100644 --- a/icpp.toml +++ b/icpp.toml @@ -6,14 +6,29 @@ cpp_paths = [ "src/llama_cpp_onicai_fork/src/llama-vocab.cpp", "src/llama_cpp_onicai_fork/src/llama-grammar.cpp", "src/llama_cpp_onicai_fork/src/llama-sampling.cpp", + "src/llama_cpp_onicai_fork/src/llama-impl.cpp", + "src/llama_cpp_onicai_fork/src/llama-context.cpp", + "src/llama_cpp_onicai_fork/src/llama-arch.cpp", + "src/llama_cpp_onicai_fork/src/llama-kv-cache.cpp", + "src/llama_cpp_onicai_fork/src/llama-chat.cpp", + "src/llama_cpp_onicai_fork/src/llama-mmap.cpp", + "src/llama_cpp_onicai_fork/src/llama-model.cpp", + "src/llama_cpp_onicai_fork/src/llama-batch.cpp", + "src/llama_cpp_onicai_fork/src/llama-adapter.cpp", + "src/llama_cpp_onicai_fork/src/llama-model-loader.cpp", + "src/llama_cpp_onicai_fork/src/llama-hparams.cpp", "src/llama_cpp_onicai_fork/src/unicode-data.cpp", "src/llama_cpp_onicai_fork/src/unicode.cpp", + "src/llama_cpp_onicai_fork/common/arg.cpp", "src/llama_cpp_onicai_fork/common/json-schema-to-grammar.cpp", "src/llama_cpp_onicai_fork/common/build-info.cpp", "src/llama_cpp_onicai_fork/common/sampling.cpp", "src/llama_cpp_onicai_fork/common/common.cpp", + "src/llama_cpp_onicai_fork/common/log.cpp", "src/llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp", "src/llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp", + "src/llama_cpp_onicai_fork/ggml/src/ggml-backend-reg.cpp", + "src/llama_cpp_onicai_fork/ggml/src/gguf.cpp", "src/*.cpp", ] cpp_include_dirs = [ From cc8123f66f14f095348c30aaf70742f30a9c21bb Mon Sep 17 00:00:00 2001 From: icpp Date: Sat, 25 Jan 2025 06:30:15 -0500 Subject: [PATCH 05/25] register CPU backend --- README-contributors-guide.md | 31 +++++++++++++++++++++++-------- icpp.toml | 18 ++++++++++++------ 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/README-contributors-guide.md b/README-contributors-guide.md index 0cf06d1..d573c19 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -290,6 +290,21 @@ extern llama_model ** g_model; // The global variable from main_.cpp #### llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp No updates needed for icpp-pro +#### llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp +- outcomment all code related to threading + +#### llama_cpp_onicai_fork/ggml/src/ggml-backend-reg.cpp +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/gguf.cpp +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.cpp +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +No updates needed for icpp-pro + --- ### c_paths @@ -305,14 +320,11 @@ No updates needed for icpp-pro #### llama_cpp_onicai_fork/ggml/src/ggml-quants.c No updates needed for icpp-pro -#### llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp -- outcomment all code related to threading - -#### llama_cpp_onicai_fork/ggml/src/ggml-backend-reg.cpp +#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.c No updates needed for icpp-pro -#### llama_cpp_onicai_fork/ggml/src/gguf.cpp -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. 
+#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-quants.c +No updates needed for icpp-pro --- ### headers to modify @@ -442,9 +454,12 @@ TODO: search in code files for: TODO-615212 ``` (-) DEBUG: `llama_cpp_onicai_fork/common/log.cpp` step through the logic - - verify the outcommented logic makes sense, or if we should just - completely remove the pause() & resume() functions. + - Remove the pause() function + - Remove the cur.is_end function ? +(-) Monitor memory, and make sure that ctx is freed up... + See free_ctx() method that has been outcommented in main_.cpp + ---------------------------------------------------------- NOTES: diff --git a/icpp.toml b/icpp.toml index b52633a..6daeacc 100644 --- a/icpp.toml +++ b/icpp.toml @@ -29,6 +29,8 @@ cpp_paths = [ "src/llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp", "src/llama_cpp_onicai_fork/ggml/src/ggml-backend-reg.cpp", "src/llama_cpp_onicai_fork/ggml/src/gguf.cpp", + "src/llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.cpp", + "src/llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-traits.cpp", "src/*.cpp", ] cpp_include_dirs = [ @@ -38,6 +40,7 @@ cpp_include_dirs = [ "src/llama_cpp_onicai_fork/ggml/include", "src/llama_cpp_onicai_fork/ggml/src", "src/llama_cpp_onicai_fork/common", + "src/llama_cpp_onicai_fork/ggml/src/ggml-cpu", ] # NOTE: Adding compile flag "-msimd128" might be too much. It will compile everything with simd # Alternative is to add it at granular level in the code, like: @@ -45,21 +48,24 @@ cpp_include_dirs = [ # void __attribute__((target("simd128"))) simd_function() { # // SIMD-specific code here # } -cpp_compile_flags = ["-DNDEBUG"] +cpp_compile_flags = ["-DNDEBUG", "-DGGML_USE_CPU"] cpp_link_flags = [] c_paths = [ "src/llama_cpp_onicai_fork/ggml/src/ggml.c", "src/llama_cpp_onicai_fork/ggml/src/ggml-alloc.c", "src/llama_cpp_onicai_fork/ggml/src/ggml-quants.c", + "src/llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.c", + "src/llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-quants.c", ] c_include_dirs = [ "src/llama_cpp_onicai_fork", "src/llama_cpp_onicai_fork/include", - "src/llama_cpp_onicai_fork/ggml/src", - "src/llama_cpp_onicai_fork/ggml/include", "src/llama_cpp_onicai_fork/common", + "src/llama_cpp_onicai_fork/ggml/include", + "src/llama_cpp_onicai_fork/ggml/src", + "src/llama_cpp_onicai_fork/ggml/src/ggml-cpu", ] -c_compile_flags = ["-DNDEBUG", "-msimd128"] +c_compile_flags = ["-DNDEBUG", "-msimd128", "-DGGML_USE_CPU"] post_wasm_function = "scripts.optimize_wasm.main" [build-native] @@ -72,8 +78,8 @@ cpp_paths = [ # "src/llama_cpp_onicai_fork/common/console.cpp", ] cpp_include_dirs = [] -cpp_compile_flags = ["-DNDEBUG"] +cpp_compile_flags = ["-DNDEBUG", "-DGGML_USE_CPU"] cpp_link_flags = [] c_paths = [] c_include_dirs = [] -c_compile_flags = ["-DNDEBUG"] +c_compile_flags = ["-DNDEBUG", "-DGGML_USE_CPU"] From 47d0a555653fd355c1bcfc9d14b96c931a6909ab Mon Sep 17 00:00:00 2001 From: icpp Date: Mon, 27 Jan 2025 11:28:38 -0500 Subject: [PATCH 06/25] Update memory management for model into Orthogonal Persistence --- README-contributors-guide.md | 50 +++++++++++++++++++--- src/main_.cpp | 81 ++++++++++++++---------------------- src/main_.h | 4 +- src/model.cpp | 45 ++++++++++++++------ src/run.cpp | 14 +++++++ 5 files changed, 124 insertions(+), 70 deletions(-) diff --git a/README-contributors-guide.md b/README-contributors-guide.md index d573c19..171eaf9 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -263,12 +263,40 @@ make build-info-cpp-wasm #### 
llama_cpp_onicai_fork/common/common.cpp - add right below `#include llama.h`: -```C++ -// ICPP-PATCH-START -#include "ic_api.h" -extern llama_model ** g_model; // The global variable from main_.cpp -// ICPP-PATCH-END -``` + ```C++ + // ICPP-PATCH-START + #include "ic_api.h" + extern llama_model ** g_model; // The global variable from main_.cpp + // ICPP-PATCH-END + ``` +- In common_init_result, skip loading the model if the --model parameter is not provided: + ```C++ + // ICPP-PATCH-START + // Skip loading the model if the --model parameter is not provided + if (!params.model.empty()) { + // ICPP-PATCH-END + + ... + model = ... + ... + + // ICPP-PATCH-START + // Skip loading the model if the --model parameter is not provided + } else { + // Access the model through g_model and assign it to the local variable + model = *g_model; + } + // ICPP-PATCH-END + ``` +- In common_init_result, do NOT transfer ownership of the model pointer: + ```C++ + // ICPP-PATCH-START: + // 'reset' transfers ownership of the model pointer to the std::unique_ptr iparams.model + // We do NOT want the model to be freed when the unique_ptr goes out of scope + // iparams.model.reset(model); + // ICPP-PATCH-END + ``` + - replace `throw std::runtime_error` with `IC_API::trap` - replace `throw std::invalid_argument` with `IC_API::trap` - outcomment `try - catch`. The program will abrupt in case of thrown exceptions. @@ -329,6 +357,16 @@ No updates needed for icpp-pro --- ### headers to modify +#### llama_cpp_onicai_fork/common/common.h +- Modify this: +``` +// ICPP-PATCH-START +// We do NOT load a default model into the canister +// #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" +#define DEFAULT_MODEL_PATH "" +// ICPP-PATCH-END +``` + #### llama_cpp_onicai_fork/common/chat-template.hpp - replace `throw std::runtime_error` with `IC_API::trap` - outcomment `try - catch`. The program will abrupt in case of thrown exceptions. diff --git a/src/main_.cpp b/src/main_.cpp index f7e7f6a..1b87dca 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -156,7 +156,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only llama_numa_init(params.numa); static llama_model * model; // ICPP-PATCH: use static to preserve accross calls - static llama_context * ctx; // ICPP-PATCH: use static to preserve accross calls + llama_context * ctx; common_sampler * smpl = nullptr; // ICPP-PATCH-START @@ -166,14 +166,6 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // Keep track of the prompt portion not yet processed prompt_remaining.clear(); - // Skip loading the model if the --model parameter is not provided - // if (!params.model.empty()) { // TODO: REMOVE THIS: WE MOVED THIS CHECK INTO llama_init_from_gpt_params - free_ctx(); - if (!params.model.empty()) { - free_model(); - } - // ICPP-PATCH-END - g_model = &model; g_ctx = &ctx; g_smpl = &smpl; @@ -194,17 +186,23 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // ICPP-PATCH-END return 1; } + // ICPP-PATCH-START - // Skip loading the model if the --model parameter is not provided - // } // TODO: REMOVE THIS: WE MOVED THIS CHECK INTO llama_init_from_gpt_params - // And return if we are asked to ONLY load the model + // Transfer the ownership of the model pointer. so it persists across calls in Orthogonal Persistence. + // We manually take control over the memory management of the model pointer, using icpp_free_model() to free it. 
+ // NOTE: The release() method of std::unique_ptr relinquishes ownership of the managed + // object and returns the raw pointer to it. + // After the call to release(), the std::unique_ptr becomes empty + // (i.e., it no longer manages any object). + model = llama_init.model.release(); + + // Return if we are asked to ONLY load the model if (load_model_only) { return 0; } // ICPP-PATCH-END - // ICPP-TODO-START: This section is completely new... const llama_vocab * vocab = llama_model_get_vocab(model); auto chat_templates = common_chat_templates_from_model(model, params.chat_template); @@ -240,7 +238,6 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } llama_attach_threadpool(ctx, threadpool, threadpool_batch); - // ICPP-TODO-END const int n_ctx_train = llama_model_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); @@ -1124,24 +1121,12 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // ICPP-PATCH-START - // TODO-615212 -- This is old code that we had outcommented - // REMOVE - // Do NOT free ctx & model storage - // -> we made `ctx` & `model` data static, so they are maintained across calls to the LLM - // -> we do NOT reset g_ctx & g_model - // -> we moved this into a free_model function, which can be called by canister's load_model - // llama_free(ctx); - // llama_free_model(model); - // TODO-615212 -- Make sure this is correct // LEAVE IT IN // Do reset all other static memory reset_static_memory(); // ICPP-PATCH-END - // TODO-615212 -- Make sure this is now handled in common_sampler_free - // REMOVE - // llama_sampling_free(ctx_sampling); llama_backend_free(); ggml_threadpool_free_fn(threadpool); @@ -1153,42 +1138,40 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // ICPP-PATCH-START: // functions added for running on IC -// TODO-615212 -- Make sure this is now handled in common_sampler_free -// REMOVE -void free_ctx() { - if (g_ctx && *g_ctx) { - llama_free(*g_ctx); - *g_ctx = nullptr; - g_ctx = nullptr; - } -} - -// TODO-615212 -- Make sure this is correct -// llama_model_free is a replacement for llama_free_model -// LEAVE IT IN -void free_model() { +// Function to be called by the canister to free the model which is persisted in Orthogonal Persisted memory +void icpp_free_model() { if (g_model && *g_model) { llama_model_free(*g_model); *g_model = nullptr; g_model = nullptr; } } -// TODO-615212 -- Make sure this is correct -// LEAVE IT IN + void reset_static_memory() { - // Tip: to find what must be reset, use a native debug build and stop here - // in vscode. Then check the static memory section in VARIABLES. 
+ /* Tip: to find what must be reset, use a native debug build and stop here + in lldb: + + lldb ./build-native/mockic.exe + (lldb) breakpoint set --name reset_static_memory + (lldb) run + (lldb) target variable + */ // Avoid dangling pointers in static memory // -> The data pointed to is re-created each call - // -> The data pointed to is cleared automatic, because it is non-static - g_output_tokens = nullptr; + // -> The data pointed to is cleared automatic, because: + // (-) it is a smart pointer (std::unique_ptr) + // (-) it is non-static + + g_ctx = nullptr; + g_smpl = nullptr; g_params = nullptr; - g_input_tokens = nullptr; g_output_ss = nullptr; - + g_output_tokens = nullptr; + g_input_tokens = nullptr; + // Do not carry over any other values in static memory - is_interacting = false; need_insert_eot = false; + is_interacting = false; } // ICPP-PATCH-END diff --git a/src/main_.h b/src/main_.h index bb36c28..03e4494 100644 --- a/src/main_.h +++ b/src/main_.h @@ -6,6 +6,6 @@ int main_(int argc, char **argv, std::string principal_id, bool load_model_only, std::string &icpp_error_msg, std::ostringstream &conversation_ss, std::ostringstream &output_ss, const uint64_t &max_tokens, std::string &prompt_remaining, bool &generated_eog); -void free_ctx(); -void free_model(); + +void icpp_free_model(); void reset_static_memory(); \ No newline at end of file diff --git a/src/model.cpp b/src/model.cpp index 88bf04c..db259ee 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -9,11 +9,18 @@ #include "upload.h" #include "utils.h" +#include "common.h" +#include "arg.h" + #include #include #include "ic_api.h" +static void print_usage(int argc, char **argv) { + // do nothing function +} + void load_model() { IC_API ic_api(CanisterUpdate{std::string(__func__)}, false); if (!is_caller_a_controller(ic_api)) return; @@ -21,14 +28,33 @@ void load_model() { CandidTypePrincipal caller = ic_api.get_caller(); std::string principal_id = caller.get_text(); + std::string error_msg; + // Get the data from the wire and prepare arguments for main_ auto [argc, argv, args] = get_args_for_main(ic_api); - // Lets go. - ready_for_inference = true; + common_params params; + if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, + print_usage)) { + error_msg = "Cannot parse args."; + send_output_record_result_error_to_wire( + ic_api, Http::StatusCode::InternalServerError, error_msg); + return; + } + + if (!params.model.empty()) { + // We're going to load a new model, first free the Orthogonally Persisted memory of a previously loaded model + icpp_free_model(); + } else { + error_msg = "--model not provided in args. 
Do not know what model to load."; + send_output_record_result_error_to_wire( + ic_api, Http::StatusCode::InternalServerError, error_msg); + return; + } + - // First free the OP memory of a previously loaded model - free_model(); + // First free the Orthogonally Persisted memory of a previously loaded model + icpp_free_model(); // Call main_, just like it is called in the llama-cli app std::string icpp_error_msg; @@ -43,15 +69,8 @@ void load_model() { // Exit if there was an error if (result != 0) { - CandidTypeRecord r_out; - r_out.append("status_code", - CandidTypeNat16{Http::StatusCode::InternalServerError}); // 500 - r_out.append("conversation", CandidTypeText{""}); - r_out.append("output", CandidTypeText{""}); - r_out.append("error", CandidTypeText{icpp_error_msg}); - r_out.append("prompt_remaining", CandidTypeText{""}); - r_out.append("generated_eog", CandidTypeBool{generated_eog}); - ic_api.to_wire(CandidTypeVariant{"Err", r_out}); + send_output_record_result_error_to_wire( + ic_api, Http::StatusCode::InternalServerError, icpp_error_msg); return; } diff --git a/src/run.cpp b/src/run.cpp index a30991d..25e0e14 100644 --- a/src/run.cpp +++ b/src/run.cpp @@ -222,6 +222,20 @@ void run(IC_API &ic_api, const uint64_t &max_tokens) { // Get the data from the wire and prepare arguments for main_ auto [argc, argv, args] = get_args_for_main(ic_api); + common_params params; + if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, + print_usage)) { + error_msg = "Cannot parse args."; + send_output_record_result_error_to_wire( + ic_api, Http::StatusCode::InternalServerError, error_msg); + return; + } + + // If we're going to load a new model, first free the Orthogonally Persisted memory of a previously loaded model + if (!params.model.empty()) { + icpp_free_model(); + } + // Call main_, just like it is called in the llama-cli app std::string icpp_error_msg; std::ostringstream From 23bbf40bcf1fa9de85a2580cf8738c8921220261 Mon Sep 17 00:00:00 2001 From: icpp Date: Mon, 27 Jan 2025 11:28:55 -0500 Subject: [PATCH 07/25] Format --- src/model.cpp | 3 +-- src/run.cpp | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/model.cpp b/src/model.cpp index db259ee..540c047 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -9,8 +9,8 @@ #include "upload.h" #include "utils.h" -#include "common.h" #include "arg.h" +#include "common.h" #include #include @@ -52,7 +52,6 @@ void load_model() { return; } - // First free the Orthogonally Persisted memory of a previously loaded model icpp_free_model(); diff --git a/src/run.cpp b/src/run.cpp index 25e0e14..79fa8d8 100644 --- a/src/run.cpp +++ b/src/run.cpp @@ -235,7 +235,7 @@ void run(IC_API &ic_api, const uint64_t &max_tokens) { if (!params.model.empty()) { icpp_free_model(); } - + // Call main_, just like it is called in the llama-cli app std::string icpp_error_msg; std::ostringstream From 7b79e6f0179e4911f163c2cb4fd0165f5c95b6eb Mon Sep 17 00:00:00 2001 From: icpp Date: Mon, 27 Jan 2025 12:48:49 -0500 Subject: [PATCH 08/25] Default behavior: -no-cnv --- src/main_.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/main_.cpp b/src/main_.cpp index 1b87dca..e7c97ba 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -250,8 +250,13 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.template_default; if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) { if (has_chat_template) { - 
LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__); - params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; + // ICPP-PATCH-START + // conversation mode is not supported in a canister. Do not turn it on by default. + // LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__); + // params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; + LOG_INF("%s: chat template is available, but since canisters do not support conversation mode, we use -no-cnv by default.)\n", __func__); + params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; + // ICPP-PATCH-END } else { params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; } From 528b5eda86d9c2cc8e5553e408762d96cfbff5b3 Mon Sep 17 00:00:00 2001 From: icpp Date: Mon, 27 Jan 2025 12:49:53 -0500 Subject: [PATCH 09/25] Upgrade to icpp-pro 5.0.2 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4af6c53..cf4595d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,6 @@ -r scripts/requirements.txt -r src/llama_cpp_onicai_fork/requirements.txt -icpp-pro>=5.0.1 +icpp-pro>=5.0.2 ic-py==1.0.1 binaryen.py \ No newline at end of file From 2be9f53249cf9d90efc8afe328a2e79f5931e138 Mon Sep 17 00:00:00 2001 From: icpp Date: Tue, 28 Jan 2025 06:36:24 -0500 Subject: [PATCH 10/25] wasm now builds --- README-contributors-guide.md | 76 ++++++++++++++++++++++++++++++++++-- README.md | 6 --- src/main_.cpp | 5 ++- 3 files changed, 76 insertions(+), 11 deletions(-) diff --git a/README-contributors-guide.md b/README-contributors-guide.md index 171eaf9..48d98af 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -235,6 +235,14 @@ meld llama_cpp_onicai_fork/src/llama-vocab.cpp llama_cpp_onicai_fork_>> validation_result; + // ICPP-PATCH-END + ... several other references to validation_result + ``` #### llama_cpp_onicai_fork/src/llama-hparams.cpp - no modifications needed for the IC @@ -244,6 +252,11 @@ meld llama_cpp_onicai_fork/src/llama-vocab.cpp llama_cpp_onicai_fork_ @@ -322,13 +336,30 @@ No updates needed for icpp-pro - outcomment all code related to threading #### llama_cpp_onicai_fork/ggml/src/ggml-backend-reg.cpp -No updates needed for icpp-pro +- Update dl_handle_deleter, to avoid a call to dlclose that should never happen + The linker ends up with undefined if we don't outcomment it + ```C++ + #include "ic_api.h" + struct dl_handle_deleter { + void operator()(void * handle) { + // ICPP-PATCH-START + // We are NOT dynamically loading any backend + // SO WE SHOULD NEVER GET HERE + // Avoid linker error by outcommenting this, but inserting a runtime trap + // dlclose(handle); + IC_API::trap("THIS SHOULD NEVER HAPPEN - dl_handle_deleter::operator() called"); + // ICPP-PATCH-END + } + }; + ``` #### llama_cpp_onicai_fork/ggml/src/gguf.cpp - outcomment `try - catch`. The program will abrupt in case of thrown exceptions. 
#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.cpp -No updates needed for icpp-pro +- outcomment all code related to signals & threading: + - `#include "ggml-threading.h"` + - `#include ` #### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-traits.cpp No updates needed for icpp-pro @@ -357,6 +388,40 @@ No updates needed for icpp-pro --- ### headers to modify +#### llama_cpp_onicai_fork/src/llama-model-loader.h +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` + +#### llama_cpp_onicai_fork/src/minja.hpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- re-define two functions: + ```C++ + // ICPP-PATCH-START + // throw not supported, using IC_API::trap instead, which expects a string + // std::runtime_error unexpected(const TemplateToken & token) const { + // return std::runtime_error("Unexpected " + TemplateToken::typeToString(token.type) + // + error_location_suffix(*template_str, token.location.pos)); + // } + // std::runtime_error unterminated(const TemplateToken & token) const { + // return std::runtime_error("Unterminated " + TemplateToken::typeToString(token.type) + // + error_location_suffix(*template_str, token.location.pos)); + // } + std::string unexpected(const TemplateToken & token) const { + return ("Unexpected " + TemplateToken::typeToString(token.type) + + error_location_suffix(*template_str, token.location.pos)); + } + std::string unterminated(const TemplateToken & token) const { + return ("Unterminated " + TemplateToken::typeToString(token.type) + + error_location_suffix(*template_str, token.location.pos)); + } + // ICPP-PATCH-END + ``` +- replace `throw unterminated(**start)` with `IC_API::trap(unterminated(**start))` +- replace `throw unexpected(**(it-1))` with `IC_API::trap(unexpected(**(it-1)))` +- replace `throw unexpected(**(it))` with `IC_API::trap(unexpected(**(it)))` +- outcomment try-catch + #### llama_cpp_onicai_fork/common/common.h - Modify this: ``` @@ -371,6 +436,9 @@ No updates needed for icpp-pro - replace `throw std::runtime_error` with `IC_API::trap` - outcomment `try - catch`. The program will abrupt in case of thrown exceptions. +#### llama_cpp_onicai_fork/ggml/include/ggml.h +- #define GGML_DEFAULT_N_THREADS 1 + ## llama_cpp_onicai_fork: replace `onicai` branch TODO: RETHINK THIS LOGIC... diff --git a/README.md b/README.md index f373336..bcd0397 100644 --- a/README.md +++ b/README.md @@ -46,12 +46,6 @@ WARNING: Currently, the canister can only be build on a `Mac` ! # Into the ./src folder cd src git clone git@github.com:onicai/llama_cpp_onicai_fork.git - -TODO - DO WE STILL NEED THIS??? 
- # Initialize the submodules of the llama_cpp_onicai_fork repo - cd llama_cpp_onicai_fork - git submodule init - git submodule update ``` - Create the file src/llama_cpp_onicai_fork/common/build-info.cpp diff --git a/src/main_.cpp b/src/main_.cpp index e7c97ba..45815d7 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -217,7 +217,10 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams); - set_process_priority(params.cpuparams.priority); + // ICPP-PATCH-START + // This is not supported in a canister + // set_process_priority(params.cpuparams.priority); + // ICPP-PATCH-END struct ggml_threadpool * threadpool_batch = NULL; if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { From d5a48a697b1ff891d15c58d3eee615805e13d914 Mon Sep 17 00:00:00 2001 From: icpp Date: Tue, 28 Jan 2025 06:42:33 -0500 Subject: [PATCH 11/25] free model only once --- src/model.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/model.cpp b/src/model.cpp index 540c047..ddc9ec1 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -52,9 +52,6 @@ void load_model() { return; } - // First free the Orthogonally Persisted memory of a previously loaded model - icpp_free_model(); - // Call main_, just like it is called in the llama-cli app std::string icpp_error_msg; std::ostringstream conversation_ss; From 063b605b32198e22937c6c82d6f1af21e515de8e Mon Sep 17 00:00:00 2001 From: icpp Date: Tue, 28 Jan 2025 09:51:20 -0500 Subject: [PATCH 12/25] Scripts to build & deploy --- dfx.json | 10 +++++ scripts/0-all.sh | 7 +++ scripts/1-build.sh | 8 ++++ scripts/2-deploy-reinstall.sh | 77 ++++++++++++++++++++++++++++++++ scripts/2-deploy-upgrade.sh | 77 ++++++++++++++++++++++++++++++++ scripts/3-upload-model.sh | 82 ++++++++++++++++++++++++++++++++++ scripts/4-load-model.sh | 78 ++++++++++++++++++++++++++++++++ scripts/5-set-max-tokens.sh | 84 +++++++++++++++++++++++++++++++++++ 8 files changed, 423 insertions(+) create mode 100755 scripts/0-all.sh create mode 100755 scripts/1-build.sh create mode 100755 scripts/2-deploy-reinstall.sh create mode 100755 scripts/2-deploy-upgrade.sh create mode 100755 scripts/3-upload-model.sh create mode 100755 scripts/4-load-model.sh create mode 100755 scripts/5-set-max-tokens.sh diff --git a/dfx.json b/dfx.json index 03999f0..dddeeb0 100644 --- a/dfx.json +++ b/dfx.json @@ -5,6 +5,16 @@ "type": "custom", "candid": "src/llama_cpp.did", "wasm": "build/llama_cpp.wasm" + }, + "llm_0": { + "type": "custom", + "candid": "src/llama_cpp.did", + "wasm": "build/llama_cpp.wasm" + }, + "llm_1": { + "type": "custom", + "candid": "src/llama_cpp.did", + "wasm": "build/llama_cpp.wasm" } }, "defaults": { diff --git a/scripts/0-all.sh b/scripts/0-all.sh new file mode 100755 index 0000000..76cb641 --- /dev/null +++ b/scripts/0-all.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +scripts/1-build.sh +scripts/2-deploy-reinstall.sh +scripts/3-upload-model.sh +scripts/4-load-model.sh +scripts/5-set-max-tokens.sh \ No newline at end of file diff --git a/scripts/1-build.sh b/scripts/1-build.sh new file mode 100755 index 0000000..00b377c --- /dev/null +++ b/scripts/1-build.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +echo " " +echo "--------------------------------------------------" +echo "Building the wasm for llama_cpp_canister" +make build-info-cpp-wasm +# icpp build-wasm +icpp build-wasm --to-compile mine-no-lib \ No newline at end of file diff --git a/scripts/2-deploy-reinstall.sh b/scripts/2-deploy-reinstall.sh new file 
mode 100755 index 0000000..926062f --- /dev/null +++ b/scripts/2-deploy-reinstall.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/deploy-reinstall.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +NUM_LLMS_DEPLOYED=1 + +# When deploying to IC, we deploy to a specific subnet +# none will not use subnet parameter in deploy to ic +SUBNET="none" +# SUBNET="-------" + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." + exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +####################################################################### +echo " " +echo "===================================================" +echo "Deploying $NUM_LLMS_DEPLOYED llms to subnet $SUBNET" +llm_id_start=0 +llm_id_end=$((NUM_LLMS_DEPLOYED - 1)) + +for i in $(seq $llm_id_start $llm_id_end) +do + echo "--------------------------------------------------" + echo "Deploying the wasm to llm_$i" + if [ "$NETWORK_TYPE" = "ic" ]; then + if [ "$SUBNET" = "none" ]; then + yes | dfx deploy llm_$i -m reinstall --yes --network $NETWORK_TYPE + else + yes | dfx deploy llm_$i -m reinstall --yes --network $NETWORK_TYPE --subnet $SUBNET + fi + else + yes | dfx deploy llm_$i -m reinstall --yes --network $NETWORK_TYPE + fi + + echo " " + echo "--------------------------------------------------" + echo "Checking health endpoint for llm_$i" + output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) + + if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then + echo "llm_$i health check failed. Exiting." + echo $output + echo "****************************************************************" + echo "llm_$i health check failed. Exiting." + echo "****************************************************************" + exit 1 + else + echo "llm_$i health check succeeded." + echo πŸŽ‰ + fi +done \ No newline at end of file diff --git a/scripts/2-deploy-upgrade.sh b/scripts/2-deploy-upgrade.sh new file mode 100755 index 0000000..9c3aec0 --- /dev/null +++ b/scripts/2-deploy-upgrade.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/deploy-upgrade.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +NUM_LLMS_DEPLOYED=1 + +# When deploying to IC, we deploy to a specific subnet +# none will not use subnet parameter in deploy to ic +SUBNET="none" +# SUBNET="-------" + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." 
+ exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +####################################################################### +echo " " +echo "===================================================" +echo "Deploying $NUM_LLMS_DEPLOYED llms to subnet $SUBNET" +llm_id_start=0 +llm_id_end=$((NUM_LLMS_DEPLOYED - 1)) + +for i in $(seq $llm_id_start $llm_id_end) +do + echo "--------------------------------------------------" + echo "Deploying the wasm to llm_$i" + if [ "$NETWORK_TYPE" = "ic" ]; then + if [ "$SUBNET" = "none" ]; then + yes | dfx deploy llm_$i --yes --network $NETWORK_TYPE + else + yes | dfx deploy llm_$i --yes --network $NETWORK_TYPE --subnet $SUBNET + fi + else + yes | dfx deploy llm_$i --yes --network $NETWORK_TYPE + fi + + echo " " + echo "--------------------------------------------------" + echo "Checking health endpoint for llm_$i" + output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) + + if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then + echo "llm_$i health check failed. Exiting." + echo $output + echo "****************************************************************" + echo "llm_$i health check failed. Exiting." + echo "****************************************************************" + exit 1 + else + echo "llm_$i health check succeeded." + echo πŸŽ‰ + fi +done \ No newline at end of file diff --git a/scripts/3-upload-model.sh b/scripts/3-upload-model.sh new file mode 100755 index 0000000..1a46c0a --- /dev/null +++ b/scripts/3-upload-model.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +LLAMA_CPP_CANISTER_PATH="./" +export PYTHONPATH="${PYTHONPATH}:$(realpath $LLAMA_CPP_CANISTER_PATH)" + +####################################################################### +# run from parent folder as: +# scripts/upload-model.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +NUM_LLMS_DEPLOYED=1 + +# The gguf model file to upload (Relative to llama_cpp_canister folder) +MODEL="models/stories260Ktok512.gguf" +# MODEL="models/stories15Mtok4096.gguf" +# MODEL="models/tensorblock/SmolLM2-135M-Instruct-GGUF/SmolLM2-135M-Instruct-Q4_K_M.gguf" +# MODEL="models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf" + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." + exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +####################################################################### +echo " " +echo "===================================================" +echo "Uploading model for $NUM_LLMS_DEPLOYED llms" +llm_id_start=0 +llm_id_end=$((NUM_LLMS_DEPLOYED - 1)) + +for i in $(seq $llm_id_start $llm_id_end) +do + echo " " + echo "--------------------------------------------------" + echo "Checking health endpoint for llm_$i" + output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) + + if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then + echo "llm_$i health check failed. Exiting." 
+ echo $output + echo "****************************************************************" + echo "llm_$i health check failed. Exiting." + echo "****************************************************************" + exit 1 + else + echo "llm_$i health check succeeded." + fi + + echo " " + echo "--------------------------------------------------" + echo "Upload the model ($MODEL) to llm_$i" + python -m scripts.upload --network $NETWORK_TYPE --canister llm_$i --canister-filename models/model.gguf $MODEL + + if [ $? -ne 0 ]; then + echo "scripts.upload for llm_$i exited with an error. Exiting the bash script." + echo $? + echo "****************************************************************" + echo "scripts.upload for llm_$i exited with an error. Exiting the bash script." + echo "****************************************************************" + exit 1 + fi +done \ No newline at end of file diff --git a/scripts/4-load-model.sh b/scripts/4-load-model.sh new file mode 100755 index 0000000..3b65267 --- /dev/null +++ b/scripts/4-load-model.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/load-model.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +NUM_LLMS_DEPLOYED=1 + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." + exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +####################################################################### +echo " " +echo "===================================================" +echo "Loading model for $NUM_LLMS_DEPLOYED llms" +llm_id_start=0 +llm_id_end=$((NUM_LLMS_DEPLOYED - 1)) + +for i in $(seq $llm_id_start $llm_id_end) +do + echo " " + echo "--------------------------------------------------" + echo "Checking health endpoint for llm_$i" + output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) + + if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then + echo "llm_$i health check failed. Exiting." + echo $output + echo "****************************************************************" + echo "llm_$i health check failed. Exiting." + echo "****************************************************************" + exit 1 + else + echo "llm_$i health check succeeded." + fi + + echo " " + echo "--------------------------------------------------" + echo "Calling load_model for llm_$i" + output=$(dfx canister call llm_$i load_model \ + '(record { args = vec {"--model"; "models/model.gguf";} })' \ + --network "$NETWORK_TYPE") + + if ! echo "$output" | grep -q " Ok "; then + echo "llm_$i load_model failed. Exiting." + echo $output + echo "****************************************************************" + echo "llm_$i load_model failed. Exiting." + echo "****************************************************************" + exit 1 + else + echo "llm_$i load_model succeeded." 
+ echo πŸŽ‰ + fi +done \ No newline at end of file diff --git a/scripts/5-set-max-tokens.sh b/scripts/5-set-max-tokens.sh new file mode 100755 index 0000000..d85659c --- /dev/null +++ b/scripts/5-set-max-tokens.sh @@ -0,0 +1,84 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/load-model.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +NUM_LLMS_DEPLOYED=1 + +MAX_TOKENS=128 # stories260Ktok512.gguf +# MAX_TOKENS=60 # stories15Mtok4096.gguf +# MAX_TOKENS=25 # SmolLM2-135M-Instruct-Q4_K_M.gguf +# MAX_TOKENS=10 # qwen2.5-0.5b-instruct-q8_0.gguf + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." + exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +####################################################################### +echo " " +echo "===================================================" +echo "set_max_tokens to $MAX_TOKENS for $NUM_LLMS_DEPLOYED llms" +llm_id_start=0 +llm_id_end=$((NUM_LLMS_DEPLOYED - 1)) + +for i in $(seq $llm_id_start $llm_id_end) +do + echo " " + echo "--------------------------------------------------" + echo "Checking health endpoint for llm_$i" + output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) + + if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then + echo "llm_$i health check failed. Exiting." + echo $output + echo "****************************************************************" + echo "llm_$i health check failed. Exiting." + echo "****************************************************************" + exit 1 + else + echo "llm_$i health check succeeded." + fi + + echo " " + echo "--------------------------------------------------" + echo "Setting max tokens to ($MAX_TOKENS) for llm_$i" + output=$(dfx canister call llm_$i set_max_tokens \ + '(record { max_tokens_query = '"$MAX_TOKENS"' : nat64; max_tokens_update = '"$MAX_TOKENS"' : nat64 })' \ + --network "$NETWORK_TYPE") + + + if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then + echo "llm_$i set_max_tokens failed. Exiting." + echo $output + echo "****************************************************************" + echo "llm_$i set_max_tokens to $MAX_TOKENS failed. Exiting." + echo "****************************************************************" + exit 1 + else + echo "llm_$i set_max_tokens to $MAX_TOKENS succeeded." + echo πŸŽ‰ + fi +done \ No newline at end of file From a663315ed8c94e68f4630309b98f54b2386e1f51 Mon Sep 17 00:00:00 2001 From: icpp Date: Tue, 28 Jan 2025 13:29:36 -0500 Subject: [PATCH 13/25] tinystories is working in canister! 
--- README-contributors-guide.md | 28 ++++++++++++++----- scripts/2-deploy-reinstall.sh | 7 ++--- scripts/2-deploy-upgrade.sh | 5 +--- scripts/3-upload-model.sh | 15 ++++------ scripts/4-load-model.sh | 10 ++----- scripts/5-set-max-tokens.sh | 16 ++++------- scripts/6-a-test-new-chat.sh | 51 ++++++++++++++++++++++++++++++++++ scripts/6-b-test-run-update.sh | 38 +++++++++++++++++++++++++ scripts/6-c-test-run-update.sh | 38 +++++++++++++++++++++++++ 9 files changed, 163 insertions(+), 45 deletions(-) create mode 100755 scripts/6-a-test-new-chat.sh create mode 100755 scripts/6-b-test-run-update.sh create mode 100755 scripts/6-c-test-run-update.sh diff --git a/README-contributors-guide.md b/README-contributors-guide.md index 48d98af..eec4edc 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -243,6 +243,7 @@ meld llama_cpp_onicai_fork/src/llama-vocab.cpp llama_cpp_onicai_fork_ #### llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp -No updates needed for icpp-pro +- outcomment all uses of `getenv`: + ```C++ + // ICPP-PATCH-START + // const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG"); + // sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0; + sched->debug = 0; + // ICPP-PATCH-END + ``` #### llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp - outcomment all code related to threading @@ -423,13 +432,18 @@ No updates needed for icpp-pro - outcomment try-catch #### llama_cpp_onicai_fork/common/common.h -- Modify this: +- Modify these: ``` -// ICPP-PATCH-START -// We do NOT load a default model into the canister -// #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" -#define DEFAULT_MODEL_PATH "" -// ICPP-PATCH-END + // ICPP-PATCH-START + // bool use_mmap = true; // use mmap for faster loads + bool use_mmap = false; // not in a canister... + // ICPP-PATCH-END + + // ICPP-PATCH-START + // We do NOT load a default model into the canister + // #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" + #define DEFAULT_MODEL_PATH "" + // ICPP-PATCH-END ``` #### llama_cpp_onicai_fork/common/chat-template.hpp diff --git a/scripts/2-deploy-reinstall.sh b/scripts/2-deploy-reinstall.sh index 926062f..fe8105d 100755 --- a/scripts/2-deploy-reinstall.sh +++ b/scripts/2-deploy-reinstall.sh @@ -64,11 +64,8 @@ do output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then - echo "llm_$i health check failed. Exiting." - echo $output - echo "****************************************************************" - echo "llm_$i health check failed. Exiting." - echo "****************************************************************" + echo "llm_$i health check failed." + echo $output exit 1 else echo "llm_$i health check succeeded." diff --git a/scripts/2-deploy-upgrade.sh b/scripts/2-deploy-upgrade.sh index 9c3aec0..04e5297 100755 --- a/scripts/2-deploy-upgrade.sh +++ b/scripts/2-deploy-upgrade.sh @@ -64,11 +64,8 @@ do output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then - echo "llm_$i health check failed. Exiting." + echo "llm_$i health check failed." echo $output - echo "****************************************************************" - echo "llm_$i health check failed. Exiting." - echo "****************************************************************" exit 1 else echo "llm_$i health check succeeded." 
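Taken together, the numbered scripts touched above (plus the `6-*` test scripts added later in this patch) give a scripted round-trip. A typical local sequence, assuming a local replica is already running (e.g. `dfx start --clean --background`) and the chosen GGUF file exists under `./models`, would be:
```
scripts/1-build.sh
scripts/2-deploy-reinstall.sh --network local
scripts/3-upload-model.sh --network local
scripts/4-load-model.sh --network local
scripts/5-set-max-tokens.sh --network local
scripts/6-a-test-new-chat.sh --network local
```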
diff --git a/scripts/3-upload-model.sh b/scripts/3-upload-model.sh index 1a46c0a..5787ab7 100755 --- a/scripts/3-upload-model.sh +++ b/scripts/3-upload-model.sh @@ -13,10 +13,11 @@ NETWORK_TYPE="local" NUM_LLMS_DEPLOYED=1 # The gguf model file to upload (Relative to llama_cpp_canister folder) -MODEL="models/stories260Ktok512.gguf" +# MODEL="models/stories260Ktok512.gguf" # MODEL="models/stories15Mtok4096.gguf" # MODEL="models/tensorblock/SmolLM2-135M-Instruct-GGUF/SmolLM2-135M-Instruct-Q4_K_M.gguf" -# MODEL="models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf" +MODEL="models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf" +# MODEL="models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q2_K.gguf" # Parse command line arguments for network type while [ $# -gt 0 ]; do @@ -56,11 +57,8 @@ do output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then - echo "llm_$i health check failed. Exiting." + echo "llm_$i health check failed." echo $output - echo "****************************************************************" - echo "llm_$i health check failed. Exiting." - echo "****************************************************************" exit 1 else echo "llm_$i health check succeeded." @@ -72,11 +70,8 @@ do python -m scripts.upload --network $NETWORK_TYPE --canister llm_$i --canister-filename models/model.gguf $MODEL if [ $? -ne 0 ]; then - echo "scripts.upload for llm_$i exited with an error. Exiting the bash script." + echo "scripts.upload for llm_$i exited with an error." echo $? - echo "****************************************************************" - echo "scripts.upload for llm_$i exited with an error. Exiting the bash script." - echo "****************************************************************" exit 1 fi done \ No newline at end of file diff --git a/scripts/4-load-model.sh b/scripts/4-load-model.sh index 3b65267..67cd9d0 100755 --- a/scripts/4-load-model.sh +++ b/scripts/4-load-model.sh @@ -47,11 +47,8 @@ do output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then - echo "llm_$i health check failed. Exiting." + echo "llm_$i health check failed." echo $output - echo "****************************************************************" - echo "llm_$i health check failed. Exiting." - echo "****************************************************************" exit 1 else echo "llm_$i health check succeeded." @@ -65,11 +62,8 @@ do --network "$NETWORK_TYPE") if ! echo "$output" | grep -q " Ok "; then - echo "llm_$i load_model failed. Exiting." + echo "llm_$i load_model failed." echo $output - echo "****************************************************************" - echo "llm_$i load_model failed. Exiting." - echo "****************************************************************" exit 1 else echo "llm_$i load_model succeeded." 
diff --git a/scripts/5-set-max-tokens.sh b/scripts/5-set-max-tokens.sh index d85659c..f3ee2e1 100755 --- a/scripts/5-set-max-tokens.sh +++ b/scripts/5-set-max-tokens.sh @@ -9,10 +9,10 @@ NETWORK_TYPE="local" NUM_LLMS_DEPLOYED=1 -MAX_TOKENS=128 # stories260Ktok512.gguf +# MAX_TOKENS=128 # stories260Ktok512.gguf # MAX_TOKENS=60 # stories15Mtok4096.gguf -# MAX_TOKENS=25 # SmolLM2-135M-Instruct-Q4_K_M.gguf -# MAX_TOKENS=10 # qwen2.5-0.5b-instruct-q8_0.gguf +# MAX_TOKENS=20 # SmolLM2-135M-Instruct-Q4_K_M.gguf +MAX_TOKENS=10 # qwen2.5-0.5b-instruct-q8_0.gguf # Parse command line arguments for network type while [ $# -gt 0 ]; do @@ -52,11 +52,8 @@ do output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then - echo "llm_$i health check failed. Exiting." + echo "llm_$i health check failed" echo $output - echo "****************************************************************" - echo "llm_$i health check failed. Exiting." - echo "****************************************************************" exit 1 else echo "llm_$i health check succeeded." @@ -71,11 +68,8 @@ do if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then - echo "llm_$i set_max_tokens failed. Exiting." + echo "llm_$i set_max_tokens failed." echo $output - echo "****************************************************************" - echo "llm_$i set_max_tokens to $MAX_TOKENS failed. Exiting." - echo "****************************************************************" exit 1 else echo "llm_$i set_max_tokens to $MAX_TOKENS succeeded." diff --git a/scripts/6-a-test-new-chat.sh b/scripts/6-a-test-new-chat.sh new file mode 100755 index 0000000..0be92dc --- /dev/null +++ b/scripts/6-a-test-new-chat.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/test.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +i=0 # llm_$i will be tested + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." + exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +echo " " +echo "--------------------------------------------------" +echo "Checking health endpoint for llm_$i" +output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) + +if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then + echo "llm_$i health check failed." + echo $output + exit 1 +else + echo "llm_$i health check succeeded." 
+fi + +echo " " +echo "--------------------------------------------------" +echo "Calling new_chat for llm_$i" +dfx canister call llm_$i new_chat '(record { args = vec {"--prompt-cache"; "prompt.cache"} })' --network $NETWORK_TYPE \ No newline at end of file diff --git a/scripts/6-b-test-run-update.sh b/scripts/6-b-test-run-update.sh new file mode 100755 index 0000000..a0ca4a5 --- /dev/null +++ b/scripts/6-b-test-run-update.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/test.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +i=0 # llm_$i will be tested + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." + exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +echo " " +echo "--------------------------------------------------" +echo "Calling run_update for llm_$i" +dfx canister call llm_$i run_update '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nWhat is the difference between a chicken and a turkey.<|im_end|>\n<|im_start|>assistant\n"; "-n"; "512" } })' diff --git a/scripts/6-c-test-run-update.sh b/scripts/6-c-test-run-update.sh new file mode 100755 index 0000000..d6a1cd2 --- /dev/null +++ b/scripts/6-c-test-run-update.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/test.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +i=0 # llm_$i will be tested + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." 
+ exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +echo " " +echo "--------------------------------------------------" +echo "Calling run_update for llm_$i" +dfx canister call llm_$i run_update '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; ""; "-n"; "512" } })' From cc5a3266c0fb759dc80a08e88bf8cd7476751be8 Mon Sep 17 00:00:00 2001 From: icpp Date: Wed, 29 Jan 2025 15:58:58 -0500 Subject: [PATCH 14/25] Some small updates --- README-0001-b841d0.md | 110 ++++ README-0002-615212.md | 401 +++++++++++++ README-contributors-guide.md | 542 +----------------- README.md | 76 +-- dfx.json | 10 - dfx.multiple-llms.json | 21 + scripts/3-upload-model.sh | 5 +- scripts/4-load-model.sh | 2 +- scripts/5-set-max-tokens.sh | 3 +- .../{6-a-test-new-chat.sh => 6-new-chat.sh} | 0 scripts/7-deepseek-run-update-a.sh | 39 ++ scripts/7-deepseek-run-update-b.sh | 38 ++ ...t-run-update.sh => 7-qwen-run-update-a.sh} | 0 ...t-run-update.sh => 7-qwen-run-update-b.sh} | 0 scripts/prompt-design.ipynb | 21 +- test/test_canister_functions.py | 26 +- 16 files changed, 686 insertions(+), 608 deletions(-) create mode 100644 README-0001-b841d0.md create mode 100644 README-0002-615212.md create mode 100644 dfx.multiple-llms.json rename scripts/{6-a-test-new-chat.sh => 6-new-chat.sh} (100%) create mode 100755 scripts/7-deepseek-run-update-a.sh create mode 100755 scripts/7-deepseek-run-update-b.sh rename scripts/{6-b-test-run-update.sh => 7-qwen-run-update-a.sh} (100%) rename scripts/{6-c-test-run-update.sh => 7-qwen-run-update-b.sh} (100%) diff --git a/README-0001-b841d0.md b/README-0001-b841d0.md new file mode 100644 index 0000000..4174002 --- /dev/null +++ b/README-0001-b841d0.md @@ -0,0 +1,110 @@ +# DETAILS FOR UPGRADE from llama.cpp sha `615212` to `b841d0` + +### cpp_paths + +#### main_.cpp +`meld main_.cpp llama_cpp_onicai_fork/examples/main/main.cpp` +- use `main_` instead of `main` +- A few items related to console & ctrl+C need to be outcommented + + +#### llama_cpp_onicai_fork/src/llama.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error(format` with `IC_API::trap(std::string("RUNTIME ERROR: ") + format` +- replace `throw` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. +- outcomment threading related items: + - `#include ` + - `#include ` + - `#include ` +- outcomment these functions completely: + - `llama_tensor_quantize_internal` + - `llama_model_quantize_internal` + + +#### llama_cpp_onicai_fork/src/llama-vocab.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error(format` with `IC_API::trap(std::string("RUNTIME ERROR: ") + format` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. 
+- add a check on `llama_token_bos(model)`, else the llama2.c models never stop generating: + ``` + bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { + return token != -1 && ( + token == llama_token_eos_impl(vocab) || + token == llama_token_eot_impl(vocab) || + token == llama_token_bos_impl(vocab) // ICPP-PATCH: the llama2.c model predicts bos without first predicting an eos + ); + } + ``` + +#### llama_cpp_onicai_fork/src/llama-grammar.cpp +No changes needed + +#### llama_cpp_onicai_fork/src/llama-sampling.cpp +No changes needed + +#### llama_cpp_onicai_fork/src/unicode-data.cpp +- no modifications needed for the IC + +#### llama_cpp_onicai_fork/src/unicode.cpp +- add `#include "ic_api.h"` +- replace `throw` with `IC_API::trap` + +#### llama_cpp_onicai_fork/common/json-schema-to-grammar.cpp +- add `#include "ic_api.h"` +- replace `throw` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + + +#### llama_cpp_onicai_fork/common/build-info.cpp +- run this command to create it: +``` +make build-info-cpp-wasm +``` + +#### llama_cpp_onicai_fork/common/grammar-parser.cpp +- add `#include "ic_api.h"` +- replace `throw` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/common/sampling.cpp +- add `#include "ic_api.h"` +- replace `throw` with `IC_API::trap` + +#### llama_cpp_onicai_fork/common/common.cpp +- add `#include "ic_api.h"` +- replace `throw` with `IC_API::trap` +- outcomment all code related to `` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. +- outcomment `std::getenv` + + +--- +### c_paths + +#### llama_cpp_onicai_fork/ggml/src/ggml.c +- outcomment all code related to signals + - `#include ` +- Many threading outcomments. + +#### llama_cpp_onicai_fork/ggml/src/ggml-alloc.c +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/ggml-backend.c +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/ggml-quants.c +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/ggml-aarch64.c +No updates needed for icpp-pro + +--- +### headers to modify + +#### llama_cpp_onicai_fork/common/log.h +- `#include ` +- Some other threading code + +#### llama_cpp_onicai_fork/common/common.h +- `#include ` \ No newline at end of file diff --git a/README-0002-615212.md b/README-0002-615212.md new file mode 100644 index 0000000..ba98c79 --- /dev/null +++ b/README-0002-615212.md @@ -0,0 +1,401 @@ +# DETAILS FOR UPGRADE from llama.cpp sha `b841d0` to `615212` + +### cpp_paths + +#### main_.cpp + +```bash +# from folder: llama_cpp_canister/src + +# To do the actual changes +meld main_.cpp llama_cpp_onicai_fork/examples/main/main.cpp + +# To check what has changed between and +meld llama_cpp_onicai_fork/examples/main/main.cpp llama_cpp_onicai_fork_/examples/main/main.cpp +``` +- use `main_` instead of `main` +- A few items related to console, ctrl+C & threading need to be outcommented +- Added logic for running in a canister with multiple update calls + + +#### llama_cpp_onicai_fork/src/llama.cpp +```bash +# from folder: llama_cpp_canister/src +# To do the actual changes +meld llama_cpp_onicai_fork/src/llama.cpp llama_cpp_onicai_fork_/src/llama.cpp +``` +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. 
+- outcomment threading related items +- outcomment these functions completely: + - `llama_tensor_quantize_internal` + - `llama_model_quantize_internal` + + +#### llama_cpp_onicai_fork/src/llama-vocab.cpp +```bash +# from folder: llama_cpp_canister/src +meld llama_cpp_onicai_fork/src/llama-vocab.cpp llama_cpp_onicai_fork_/src/llama-vocab.cpp +``` +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/src/llama-grammar.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/src/llama-sampling.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` + +#### llama_cpp_onicai_fork/src/llama-impl.cpp +- no modifications needed for the IC + +#### src/llama_cpp_onicai_fork/src/llama-context.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` + +#### src/llama_cpp_onicai_fork/src/llama-arch.cpp +- no modifications needed for the IC + +#### llama_cpp_onicai_fork/src/unicode-data.cpp +- no modifications needed for the IC + +#### llama_cpp_onicai_fork/src/unicode.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- replace `throw std::invalid_argument` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/src/llama-kv-cache.cpp +- no modifications needed for the IC + +#### llama_cpp_onicai_fork/src/llama-chat.cpp +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/src/llama-mmap.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` + +#### llama_cpp_onicai_fork/src/llama-model.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/src/llama-batch.cpp +- no modifications needed for the IC + +#### llama_cpp_onicai_fork/src/llama-adapter.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/src/llama-model-loader.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment all uses of `validation_result`: + ```C++ + // ICPP-PATCH-START + // we do not support check_tensors. It requires threading. + // std::vector>> validation_result; + // ICPP-PATCH-END + ... several other references to validation_result + ``` +- outcomment all uses of `getenv` + +#### llama_cpp_onicai_fork/src/llama-hparams.cpp +- no modifications needed for the IC + +#### llama_cpp_onicai_fork/common/arg.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- replace `throw std::invalid_argument` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. +- outcomment args that require `std::thread` +- outcomment call to `ggml_backend_load_all();` + We are not loading the dynamic backends, because it is calling dlopen which results in + undefined symbols during linking. 
+ We can skip it, because we already registered the CPU backend as a compile flag. +- outcomment all calls to std::getenv + +#### llama_cpp_onicai_fork/common/json-schema-to-grammar.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- replace `throw std::out_of_range` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/common/build-info.cpp +- run this command to create it: +``` +make build-info-cpp-wasm +``` + +#### llama_cpp_onicai_fork/common/sampling.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` + +#### llama_cpp_onicai_fork/common/common.cpp +- add right below `#include llama.h`: + ```C++ + // ICPP-PATCH-START + #include "ic_api.h" + extern llama_model ** g_model; // The global variable from main_.cpp + // ICPP-PATCH-END + ``` +- In common_init_result, skip loading the model if the --model parameter is not provided: + ```C++ + // ICPP-PATCH-START + // Skip loading the model if the --model parameter is not provided + if (!params.model.empty()) { + // ICPP-PATCH-END + + ... + model = ... + ... + + // ICPP-PATCH-START + // Skip loading the model if the --model parameter is not provided + } else { + // Access the model through g_model and assign it to the local variable + model = *g_model; + } + // ICPP-PATCH-END + ``` +- In common_init_result, do NOT transfer ownership of the model pointer: + ```C++ + // ICPP-PATCH-START: + // 'reset' transfers ownership of the model pointer to the std::unique_ptr iparams.model + // We do NOT want the model to be freed when the unique_ptr goes out of scope + // iparams.model.reset(model); + // ICPP-PATCH-END + ``` +- replace `throw std::runtime_error` with `IC_API::trap` +- replace `throw std::invalid_argument` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. +- outcomment `std::getenv` + Compare to changes made last time (!) + +- outcomment all code related to ``: + Compare to changes made last time (!) + - cpu_get_num_physical_cores + +- outcomment #ifdef LLAMA_USE_CURL + Compare to changes made last time (!) + +- outcomment `set_process_priority` function + +#### llama_cpp_onicai_fork/common/log.cpp +- Remove all threading logic + #include + #include + +#### llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp +- outcomment all uses of `getenv`: + ```C++ + // ICPP-PATCH-START + // const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG"); + // sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0; + sched->debug = 0; + // ICPP-PATCH-END + ``` + +#### llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp +- outcomment all code related to threading + +#### llama_cpp_onicai_fork/ggml/src/ggml-backend-reg.cpp +- Update dl_handle_deleter, to avoid a call to dlclose that should never happen + The linker ends up with undefined if we don't outcomment it + ```C++ + #include "ic_api.h" + struct dl_handle_deleter { + void operator()(void * handle) { + // ICPP-PATCH-START + // We are NOT dynamically loading any backend + // SO WE SHOULD NEVER GET HERE + // Avoid linker error by outcommenting this, but inserting a runtime trap + // dlclose(handle); + IC_API::trap("THIS SHOULD NEVER HAPPEN - dl_handle_deleter::operator() called"); + // ICPP-PATCH-END + } + }; + ``` + +#### llama_cpp_onicai_fork/ggml/src/gguf.cpp +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. 
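The per-file notes above keep coming back to the same patterns: no active `throw`, no `try`/`catch`, no `std::thread`, and no `getenv` left in the patched translation units. An optional sanity check after a meld session is to grep the fork for leftovers; this is only an illustrative helper (assumed to be run from `llama_cpp_canister/src`), and matches inside `ICPP-PATCH` comment blocks are expected.

```bash
# Illustrative sanity check after patching; run from llama_cpp_canister/src.
# Hits inside ICPP-PATCH comment blocks are expected and can be ignored.
for pattern in "throw std::" "std::thread" "getenv(" "try {"; do
  echo "=== $pattern ==="
  grep -rn --include="*.cpp" --include="*.h" --include="*.hpp" "$pattern" \
    llama_cpp_onicai_fork/src llama_cpp_onicai_fork/common llama_cpp_onicai_fork/ggml/src \
    || echo "no matches"
done
```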
+ +#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.cpp +- outcomment all code related to signals & threading: + - `#include "ggml-threading.h"` + - `#include ` + +#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +No updates needed for icpp-pro + +--- +### c_paths + +#### llama_cpp_onicai_fork/ggml/src/ggml.c +- outcomment all code related to signals & threading + - `#include "ggml-threading.h"` + - `#include ` + + +#### llama_cpp_onicai_fork/ggml/src/ggml-alloc.c +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/ggml-quants.c +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.c +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-quants.c +No updates needed for icpp-pro + +--- +### headers to modify + +#### llama_cpp_onicai_fork/src/llama-model-loader.h +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` + +#### llama_cpp_onicai_fork/src/minja.hpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- re-define two functions: + ```C++ + // ICPP-PATCH-START + // throw not supported, using IC_API::trap instead, which expects a string + // std::runtime_error unexpected(const TemplateToken & token) const { + // return std::runtime_error("Unexpected " + TemplateToken::typeToString(token.type) + // + error_location_suffix(*template_str, token.location.pos)); + // } + // std::runtime_error unterminated(const TemplateToken & token) const { + // return std::runtime_error("Unterminated " + TemplateToken::typeToString(token.type) + // + error_location_suffix(*template_str, token.location.pos)); + // } + std::string unexpected(const TemplateToken & token) const { + return ("Unexpected " + TemplateToken::typeToString(token.type) + + error_location_suffix(*template_str, token.location.pos)); + } + std::string unterminated(const TemplateToken & token) const { + return ("Unterminated " + TemplateToken::typeToString(token.type) + + error_location_suffix(*template_str, token.location.pos)); + } + // ICPP-PATCH-END + ``` +- replace `throw unterminated(**start)` with `IC_API::trap(unterminated(**start))` +- replace `throw unexpected(**(it-1))` with `IC_API::trap(unexpected(**(it-1)))` +- replace `throw unexpected(**(it))` with `IC_API::trap(unexpected(**(it)))` +- outcomment try-catch + +#### llama_cpp_onicai_fork/common/common.h +- Modify these: +``` + // ICPP-PATCH-START + // bool use_mmap = true; // use mmap for faster loads + bool use_mmap = false; // not in a canister... + // ICPP-PATCH-END + + // ICPP-PATCH-START + // We do NOT load a default model into the canister + // #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" + #define DEFAULT_MODEL_PATH "" + // ICPP-PATCH-END +``` + +#### llama_cpp_onicai_fork/common/chat-template.hpp +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/ggml/include/ggml.h +- #define GGML_DEFAULT_N_THREADS 1 + +------------ +TODO: search in code files for: TODO-615212 + +(-) main_.cpp has a new static `global g_smpl`: + static common_sampler ** g_smpl; + + Q: Does this need to become a global variable, accessible from common.cpp ? + Like we did for g_model ? 
+ + In `common/common.cpp` we added: + ``` + // ICPP-PATCH-START + #include "ic_api.h" + extern llama_model ** g_model; // The global variable from main_.cpp + // ICPP-PATCH-END + ``` + +(-) main_.cpp renamed type for `g_params`: + from: static gpt_params * g_params; + to : static common_params * g_params; + + Q: Does this need to become a global variable, accessible from common.cpp ? + Like we did for g_model ? + +(-) main_.cpp line 142: common_sampler * smpl = nullptr; + + Q: Does `smpl` need to become a static variable, like `model` & `ctx` ? + +(-) main_.cpp line 147: // Don't give error if embd_inp = session_tokens. All is OK to just keep going + + Q: Is this logic for prompt_remaining still valid? + +(-) LOG & LOG_TEE have been replaced by LOG, LOG_ERR, LOG_WRN, LOG_INF, LOG_CNT + -> LOG is used just for Console/Stream Output + -> LOG_xxx is used for ERR, WRN, INF, CNT --> Not sure yet where this goes... + + Q1: Did we change anything to LOG & LOG_TEE to get it to work ? + Q2: Are we still using LOG & LOG_TEE ourselvs? If so, replace it. + Q3: Can we remove the LOG & LOG_TEE + Q4: Do we need to update the README about downloading different LOG files? + +(-) llama-vocab.cpp --- This function is no longer there. Is tinystories still working? + + We had added a check on `llama_token_bos(model)`, else the llama2.c models never stop generating: + ``` + bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { + return token != -1 && ( + token == llama_token_eos_impl(vocab) || + token == llama_token_eot_impl(vocab) || + token == llama_token_bos_impl(vocab) // ICPP-PATCH: the llama2.c model predicts bos without first predicting an eos + ); + } + ``` + +(-) TODO: `llama_cpp_onicai_fork/common/log.cpp` step through the logic + - Remove the pause() function + - Remove the cur.is_end function ? + +(-) TODO: Monitor memory, and make sure that ctx is freed up... + See free_ctx() method that has been outcommented in main_.cpp + +---------------------------------------------------------- +NOTES: + +(-) main_.cpp includes a new file: `llama_cpp_onicai_fork/common/chat-template.hpp` + +(-) All the LLM architectures supported by llama_cpp_canister are listed in + `src/llama_cpp_onicai_fork/src/llama-arch.cpp` + +(-) NOTE: `common/grammar-parser.cpp` is no longer there. + It appears to be fully included in `src/llama-grammar.cpp` + +(-) NOTE: `llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp` used to be `llama_cpp_onicai_fork/ggml/src/ggml-backend.c` + +(-) NOTE: `llama_cpp_onicai_fork/ggml/src/ggml-aarch64.c` no longer exists + Previous update: No updates needed for icpp-pro + +(-) NOTE: `llama_cpp_onicai_fork/common/log.h` no update was needed this time: + Previous update: + - `#include ` + - Some other threading code + +(-) NOTE: `llama_cpp_onicai_fork/common/common.h` no update was needed this time: + Previous update: + - `#include ` \ No newline at end of file diff --git a/README-contributors-guide.md b/README-contributors-guide.md index eec4edc..16415b0 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -4,75 +4,6 @@ Follow steps of [llama_cpp_canister/README/Getting Started](https://github.com/onicai/llama_cpp_canister/blob/main/README.md#getting-started) -# VS Code debugger - -## lldb-mi hangs - -On the Mac, there is an issue with lldb-mi: https://github.com/microsoft/vscode-cpptools/issues/7240 - -Upon stopping at a breakpoint in a new module, lldb-mi will try to load all local variables, and it goes into an endless loop. 
- -The solution is to hide the VARIABLES section in the debug window, and rely on the WATCH section instead. - -# How to run & debug original llama.cpp - -- Clone ggerganov/llama.cpp (Do NOT initialize submodules...) - ``` - # Clone it as a sibling repo of llama_cpp_canister - git clone https://github.com/ggerganov/llama.cpp.git - ``` -- Checkout the proper commit used as root of the onicai branch in llama_cpp_onicai_fork - ``` - git checkout b841d0 - ``` -- Build with these commands: - ``` - make clean - make LLAMA_DEBUG=1 llama-cli - ``` -- Run with Notebook - - File: scripts/prompt-design.ipynb - -- Run with this command: - ``` - ./llama-cli -m ../llama_cpp_canister/models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf --prompt-cache prompt.cache --prompt-cache-all -sp -p "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n" -n 512 -fa -ngl 80 - ``` - In above command, the `-fa -ngl 80` arguments are useful only on GPU. We do not use them when calling the IC, because - the canister has a CPU only. - -- Debug using this `.vscode/launch.json` - ```json - { - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "type": "lldb", - "request": "launch", - "name": "llama-cli", - "program": "${workspaceFolder}/llama-cli", - "cwd": "${workspaceFolder}", - "args": [ - "-m", - "/llama_cpp_canister_models/stories260Ktok512.gguf", - "--samplers", - "top_p", - "--temp", - "0.1", - "--top-p", - "0.9", - "-n", - "600", - "-p", - "Joe loves writing stories" - ] - } - ] - } - ``` # How to upgrade llama.cpp ## Sync fork @@ -95,18 +26,20 @@ git fetch upstream --tags # after this, the tags will appear in GitHub git push origin --tags - - ``` ## llama_cpp_onicai_fork: setup a local branch Take following steps locally: - git fetch -- This is the git-sha of the llama.cpp versions we branched from: - - `615212` (git-sha-new) , with release-tag `b4532` - - `b841d0` (git-sha-old) , no release-tag - - `5cdb37` (git-sha-older), no release-tag +- These are the git-sha values of the llama.cpp versions we branched from: + + | upgrade # | llama.cpp sha | llama.cpp release-tag | + | --------- | ------------- | --------------------- | + | 0000 | 5cdb37 | - | + | 0001 | b841d0 | - | + | 0002 | 615212 | b4532 | + - Start with a fresh clone of llama_cpp_onicai_fork: ```bash @@ -139,332 +72,15 @@ We use `meld` for comparing the files: brew install --cask dehesselle-meld ``` -### cpp_paths - -#### main_.cpp - -```bash -# from folder: llama_cpp_canister/src - -# To do the actual changes -meld main_.cpp llama_cpp_onicai_fork/examples/main/main.cpp - -# To check what has changed between and -meld llama_cpp_onicai_fork/examples/main/main.cpp llama_cpp_onicai_fork_/examples/main/main.cpp -``` -- use `main_` instead of `main` -- A few items related to console, ctrl+C & threading need to be outcommented -- Added logic for running in a canister with multiple update calls - - -#### llama_cpp_onicai_fork/src/llama.cpp -```bash -# from folder: llama_cpp_canister/src -# To do the actual changes -meld llama_cpp_onicai_fork/src/llama.cpp llama_cpp_onicai_fork_/src/llama.cpp -``` -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- outcomment `try - catch`. 
The program will abrupt in case of thrown exceptions. -- outcomment threading related items -- outcomment these functions completely: - - `llama_tensor_quantize_internal` - - `llama_model_quantize_internal` - - -#### llama_cpp_onicai_fork/src/llama-vocab.cpp -```bash -# from folder: llama_cpp_canister/src -meld llama_cpp_onicai_fork/src/llama-vocab.cpp llama_cpp_onicai_fork_/src/llama-vocab.cpp -``` -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - -#### llama_cpp_onicai_fork/src/llama-grammar.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - -#### llama_cpp_onicai_fork/src/llama-sampling.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` - -#### llama_cpp_onicai_fork/src/llama-impl.cpp -- no modifications needed for the IC - -#### src/llama_cpp_onicai_fork/src/llama-context.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` - -#### src/llama_cpp_onicai_fork/src/llama-arch.cpp -- no modifications needed for the IC - -#### llama_cpp_onicai_fork/src/unicode-data.cpp -- no modifications needed for the IC - -#### llama_cpp_onicai_fork/src/unicode.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- replace `throw std::invalid_argument` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - -#### llama_cpp_onicai_fork/src/llama-kv-cache.cpp -- no modifications needed for the IC - -#### llama_cpp_onicai_fork/src/llama-chat.cpp -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - -#### llama_cpp_onicai_fork/src/llama-mmap.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` - -#### llama_cpp_onicai_fork/src/llama-model.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - -#### llama_cpp_onicai_fork/src/llama-batch.cpp -- no modifications needed for the IC - -#### llama_cpp_onicai_fork/src/llama-adapter.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - -#### llama_cpp_onicai_fork/src/llama-model-loader.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- outcomment all uses of `validation_result`: - ```C++ - // ICPP-PATCH-START - // we do not support check_tensors. It requires threading. - // std::vector>> validation_result; - // ICPP-PATCH-END - ... several other references to validation_result - ``` -- outcomment all uses of `getenv` - -#### llama_cpp_onicai_fork/src/llama-hparams.cpp -- no modifications needed for the IC - -#### llama_cpp_onicai_fork/common/arg.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- replace `throw std::invalid_argument` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. -- outcomment args that require `std::thread` -- outcomment call to `ggml_backend_load_all();` - We are not loading the dynamic backends, because it is calling dlopen which results in - undefined symbols during linking. 
- We can skip it, because we already registered the CPU backend as a compile flag. -- outcomment all calls to std::getenv - -#### llama_cpp_onicai_fork/common/json-schema-to-grammar.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- replace `throw std::out_of_range` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - -#### llama_cpp_onicai_fork/common/build-info.cpp -- run this command to create it: -``` -make build-info-cpp-wasm -``` - -#### llama_cpp_onicai_fork/common/sampling.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` - -#### llama_cpp_onicai_fork/common/common.cpp -- add right below `#include llama.h`: - ```C++ - // ICPP-PATCH-START - #include "ic_api.h" - extern llama_model ** g_model; // The global variable from main_.cpp - // ICPP-PATCH-END - ``` -- In common_init_result, skip loading the model if the --model parameter is not provided: - ```C++ - // ICPP-PATCH-START - // Skip loading the model if the --model parameter is not provided - if (!params.model.empty()) { - // ICPP-PATCH-END - - ... - model = ... - ... - - // ICPP-PATCH-START - // Skip loading the model if the --model parameter is not provided - } else { - // Access the model through g_model and assign it to the local variable - model = *g_model; - } - // ICPP-PATCH-END - ``` -- In common_init_result, do NOT transfer ownership of the model pointer: - ```C++ - // ICPP-PATCH-START: - // 'reset' transfers ownership of the model pointer to the std::unique_ptr iparams.model - // We do NOT want the model to be freed when the unique_ptr goes out of scope - // iparams.model.reset(model); - // ICPP-PATCH-END - ``` -- replace `throw std::runtime_error` with `IC_API::trap` -- replace `throw std::invalid_argument` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. -- outcomment `std::getenv` - Compare to changes made last time (!) - -- outcomment all code related to ``: - Compare to changes made last time (!) - - cpu_get_num_physical_cores - -- outcomment #ifdef LLAMA_USE_CURL - Compare to changes made last time (!) - -- outcomment `set_process_priority` function - -#### llama_cpp_onicai_fork/common/log.cpp -- Remove all threading logic - #include - #include - -#### llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp -- outcomment all uses of `getenv`: - ```C++ - // ICPP-PATCH-START - // const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG"); - // sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0; - sched->debug = 0; - // ICPP-PATCH-END - ``` - -#### llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp -- outcomment all code related to threading - -#### llama_cpp_onicai_fork/ggml/src/ggml-backend-reg.cpp -- Update dl_handle_deleter, to avoid a call to dlclose that should never happen - The linker ends up with undefined if we don't outcomment it - ```C++ - #include "ic_api.h" - struct dl_handle_deleter { - void operator()(void * handle) { - // ICPP-PATCH-START - // We are NOT dynamically loading any backend - // SO WE SHOULD NEVER GET HERE - // Avoid linker error by outcommenting this, but inserting a runtime trap - // dlclose(handle); - IC_API::trap("THIS SHOULD NEVER HAPPEN - dl_handle_deleter::operator() called"); - // ICPP-PATCH-END - } - }; - ``` - -#### llama_cpp_onicai_fork/ggml/src/gguf.cpp -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. 
- -#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.cpp -- outcomment all code related to signals & threading: - - `#include "ggml-threading.h"` - - `#include ` - -#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-traits.cpp -No updates needed for icpp-pro - ---- -### c_paths - -#### llama_cpp_onicai_fork/ggml/src/ggml.c -- outcomment all code related to signals & threading - - `#include "ggml-threading.h"` - - `#include ` - - -#### llama_cpp_onicai_fork/ggml/src/ggml-alloc.c -No updates needed for icpp-pro - -#### llama_cpp_onicai_fork/ggml/src/ggml-quants.c -No updates needed for icpp-pro - -#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.c -No updates needed for icpp-pro - -#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-quants.c -No updates needed for icpp-pro - ---- -### headers to modify - -#### llama_cpp_onicai_fork/src/llama-model-loader.h -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` - -#### llama_cpp_onicai_fork/src/minja.hpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- re-define two functions: - ```C++ - // ICPP-PATCH-START - // throw not supported, using IC_API::trap instead, which expects a string - // std::runtime_error unexpected(const TemplateToken & token) const { - // return std::runtime_error("Unexpected " + TemplateToken::typeToString(token.type) - // + error_location_suffix(*template_str, token.location.pos)); - // } - // std::runtime_error unterminated(const TemplateToken & token) const { - // return std::runtime_error("Unterminated " + TemplateToken::typeToString(token.type) - // + error_location_suffix(*template_str, token.location.pos)); - // } - std::string unexpected(const TemplateToken & token) const { - return ("Unexpected " + TemplateToken::typeToString(token.type) - + error_location_suffix(*template_str, token.location.pos)); - } - std::string unterminated(const TemplateToken & token) const { - return ("Unterminated " + TemplateToken::typeToString(token.type) - + error_location_suffix(*template_str, token.location.pos)); - } - // ICPP-PATCH-END - ``` -- replace `throw unterminated(**start)` with `IC_API::trap(unterminated(**start))` -- replace `throw unexpected(**(it-1))` with `IC_API::trap(unexpected(**(it-1)))` -- replace `throw unexpected(**(it))` with `IC_API::trap(unexpected(**(it)))` -- outcomment try-catch - -#### llama_cpp_onicai_fork/common/common.h -- Modify these: -``` - // ICPP-PATCH-START - // bool use_mmap = true; // use mmap for faster loads - bool use_mmap = false; // not in a canister... - // ICPP-PATCH-END - - // ICPP-PATCH-START - // We do NOT load a default model into the canister - // #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" - #define DEFAULT_MODEL_PATH "" - // ICPP-PATCH-END -``` - -#### llama_cpp_onicai_fork/common/chat-template.hpp -- replace `throw std::runtime_error` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. +## Details for each upgrade -#### llama_cpp_onicai_fork/ggml/include/ggml.h -- #define GGML_DEFAULT_N_THREADS 1 +See the files: README--.md -## llama_cpp_onicai_fork: replace `onicai` branch +## Branch management -TODO: RETHINK THIS LOGIC... -(-) Perhaps it is better to keep all the `onicai-` branches -(-) And just change the default branch to `onicai-` +We need to rethink this logic, but for now it is ok... 
-That way: -(-) when someone clones, the are at the correct branch -(-) from the name, it is immediately clear what llama.cpp version was used -(-) we preserve the full history - ---- +### llama_cpp_onicai_fork Do NOT merge the `onicai-` branch into the `onicai` branch, but replace it: ``` @@ -474,134 +90,6 @@ git push origin onicai:onicai git push origin onicai-:onicai- ``` +## llama_cpp_canister ------------- -TODO: search in code files for: TODO-615212 - -(-) main_.cpp has a new static `global g_smpl`: - static common_sampler ** g_smpl; - - Q: Does this need to become a global variable, accessible from common.cpp ? - Like we did for g_model ? - - In `common/common.cpp` we added: - ``` - // ICPP-PATCH-START - #include "ic_api.h" - extern llama_model ** g_model; // The global variable from main_.cpp - // ICPP-PATCH-END - ``` - -(-) main_.cpp renamed type for `g_params`: - from: static gpt_params * g_params; - to : static common_params * g_params; - - Q: Does this need to become a global variable, accessible from common.cpp ? - Like we did for g_model ? - -(-) main_.cpp line 142: common_sampler * smpl = nullptr; - - Q: Does `smpl` need to become a static variable, like `model` & `ctx` ? - -(-) main_.cpp line 147: // Don't give error if embd_inp = session_tokens. All is OK to just keep going - - Q: Is this logic for prompt_remaining still valid? - -(-) main_.cpp line 208: // ICPP-TODO-START: This section is completely new... - COMPLETELY NEW SECTION FOR THREADPOOLs... - -(-) LOG & LOG_TEE have been replaced by LOG, LOG_ERR, LOG_WRN, LOG_INF, LOG_CNT - -> LOG is used just for Console/Stream Output - -> LOG_xxx is used for ERR, WRN, INF, CNT --> Not sure yet where this goes... - - Q1: Did we change anything to LOG & LOG_TEE to get it to work ? - Q2: Are we still using LOG & LOG_TEE ourselvs? If so, replace it. - Q3: Can we remove the LOG & LOG_TEE - Q4: Do we need to update the README about downloading different LOG files? - -(-) main_.cpp calls common_token_to_piece instead of llama_token_to_piece - - Q: Is this a new file: common_token_to_piece - A: No, it is in common.cpp - -(-) main_.cpp calls common_tokenize instead of llama_tokenize - - Q: Is this a new file: common_tokenize - A: No, it is in common.cpp - -(-) main_.cpp line 516, 826: New sampling subsystem ! - - Q: Are these new files: - - common_sampler_init - - common_sampler_sample - - common_sampler_accept - A: No, it is in sampling.cpp - -(-) main_.cpp line 1123: common_sampler_free(smpl) - - We had outcommented code to NOT free the ctx & model storage: - // Do NOT free ctx & model storage - // -> we made `ctx` & `model` data static, so they are maintained across calls to the LLM - // -> we do NOT reset g_ctx & g_model - // -> we moved this into a free_model function, which can be called by canister's load_model - // llama_free(ctx); - // llama_free_model(model); - - // TODO-615212 -- Make sure this is correct - // Do reset all other static memory - reset_static_memory(); - - Q1: Has this all moved into common_sampler_free ? - - Q2: Update usage of the free_model function? - - Q3: is reset_static_memory still correct ? - - Q4: Is llama_sampling_free(ctx_sampling) now handled by common_sampler_free(smpl) ? - - -(-) llama-vocab.cpp --- This function is no longer there. Is tinystories still working? 
- - We had added a check on `llama_token_bos(model)`, else the llama2.c models never stop generating: - ``` - bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { - return token != -1 && ( - token == llama_token_eos_impl(vocab) || - token == llama_token_eot_impl(vocab) || - token == llama_token_bos_impl(vocab) // ICPP-PATCH: the llama2.c model predicts bos without first predicting an eos - ); - } - ``` - -(-) DEBUG: `llama_cpp_onicai_fork/common/log.cpp` step through the logic - - Remove the pause() function - - Remove the cur.is_end function ? - -(-) Monitor memory, and make sure that ctx is freed up... - See free_ctx() method that has been outcommented in main_.cpp - ----------------------------------------------------------- -NOTES: - -(-) main_.cpp includes a new file: `llama_cpp_onicai_fork/common/chat-template.hpp` - This is from Google, and a general chat_template, with tool calling !!! - -(-) All the LLM architectures supported by llama_cpp_canister are listed in - `src/llama_cpp_onicai_fork/src/llama-arch.cpp` - -(-) NOTE: `common/grammar-parser.cpp` is no longer there. - It appears to be fully included in `src/llama-grammar.cpp` - -(-) NOTE: `llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp` used to be `llama_cpp_onicai_fork/ggml/src/ggml-backend.c` - -(-) NOTE: `llama_cpp_onicai_fork/ggml/src/ggml-aarch64.c` no longer exists - Previous update: No updates needed for icpp-pro - -(-) NOTE: `llama_cpp_onicai_fork/common/log.h` no update was needed this time: - Previous update: - - `#include ` - - Some other threading code - -(-) NOTE: `llama_cpp_onicai_fork/common/common.h` no update was needed this time: - Previous update: - - `#include ` \ No newline at end of file +Merge the `onicai-` branch into the `onicai` branch \ No newline at end of file diff --git a/README.md b/README.md index bcd0397..5a3935f 100644 --- a/README.md +++ b/README.md @@ -28,13 +28,21 @@ Please join our [OpenChat C++ community](https://oc.app/community/cklkv-3aaaa-aa # Capabilities πŸ”₯ -- You can deploy LLMs up to ~0.5B parameters. -- The full context window of the LLM is used. (128K tokens for the Qwen2.5 example below.) - +- Deploy any LLM available as a gguf file. +- Our largest so far is DeepSeek-R1 1.5B (See [X](https://x.com/onicaiHQ/status/1884339580851151089)). # Set up -WARNING: Currently, the canister can only be build on a `Mac` ! +The build of the wasm must be done on a `Mac` ! + +- Install dfx: + + ```bash + sh -ci "$(curl -fsSL https://internetcomputer.org/install.sh)" + + # Configure your shell + source "$HOME/.local/share/dfx/env" + ``` - Clone the repo and it's children: @@ -48,12 +56,6 @@ WARNING: Currently, the canister can only be build on a `Mac` ! git clone git@github.com:onicai/llama_cpp_onicai_fork.git ``` -- Create the file src/llama_cpp_onicai_fork/common/build-info.cpp - ``` - # from ./llama_cpp_canister folder - make build-info-cpp-wasm - ``` - - Create a Python environment with dependencies installed ❗❗❗ Use Python 3.11 ❗❗❗ @@ -70,26 +72,14 @@ WARNING: Currently, the canister can only be build on a `Mac` ! 
pip install -r requirements.txt ``` -- Install dfx: - - ```bash - sh -ci "$(curl -fsSL https://internetcomputer.org/install.sh)" - - # Configure your shell - source "$HOME/.local/share/dfx/env" - ``` - - Build & Deploy the canister `llama_cpp`: - Compile & link to WebAssembly (wasm): ```bash + make build-info-cpp-wasm icpp build-wasm ``` - Note: - - The first time you run this command, the tool-chain will be installed in ~/.icpp - - This can take a few minutes, depending on your internet speed and computer. + Note: The first time you run this command, the tool-chain will be installed in ~/.icpp - Start the local network: ```bash @@ -124,24 +114,22 @@ WARNING: Currently, the canister can only be build on a `Mac` ! ```bash python -m scripts.upload --network local --canister llama_cpp --canister-filename models/model.gguf models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf ``` - - - Only needed after a canister upgrade (`dfx deploy -m upgrade`), re-load the gguf file into Orthogonal Persisted (OP) working memory - - This step is already done by scripts.upload above, so you can skip it if you just ran that. - After a canister upgrade, the gguf file in the canister is still there, because it is persisted in - stable memory, but you need to load it into Orthogonal Persisted (working) memory, which is erased during a canister upgrade. + NOTE: In C++, files are stored in stable memory of the canister. + They will survive a code upgrade. + +- Load the gguf file into Orthogonal Persisted (OP) working memory - ```bash - dfx canister call llama_cpp load_model '(record { args = vec {"--model"; "models/model.gguf";} })' - ``` + ```bash + dfx canister call llama_cpp load_model '(record { args = vec {"--model"; "models/model.gguf";} })' + ``` - - Set the max_tokens for this model, to avoid it hits the IC's instruction limit - ``` - dfx canister call llama_cpp set_max_tokens '(record { max_tokens_query = 10 : nat64; max_tokens_update = 10 : nat64 })' +- Set the max_tokens for this model, to avoid it hits the IC's instruction limit + ``` + dfx canister call llama_cpp set_max_tokens '(record { max_tokens_query = 10 : nat64; max_tokens_update = 10 : nat64 })' - dfx canister call llama_cpp get_max_tokens - ``` + dfx canister call llama_cpp get_max_tokens + ``` - Chat with the LLM @@ -202,14 +190,13 @@ WARNING: Currently, the canister can only be build on a `Mac` ! ``` Note: The sequence of update calls to the canister is required because the Internet Computer has a limitation - on the number of computations it allows per call. At the moment, only 10 tokens can be generated per call. + on the number of instructions it allows per call. For this model, 10 tokens can be generated per update call. 
+ This sequence of update calls is equivalent to using the [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp) repo directly and running the `llama-cli` locally, with the command: ``` - ./llama-cli -m /models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf --prompt-cache prompt.cache --prompt-cache-all -sp -p "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n" -n 512 -fa -ngl 80 + /llama-cli -m /models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf --prompt-cache prompt.cache --prompt-cache-all -sp -p "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n" -n 512 ``` - In above command, the `-fa -ngl 80` arguments are useful only on GPU. We do not use them when calling the IC, because - the canister has a CPU only. - Retrieving saved chats @@ -220,9 +207,8 @@ WARNING: Currently, the canister can only be build on a `Mac` ! dfx canister call llama_cpp get_chats ``` - - -- You can download the `main.log` file from the canister with: +TODO-615212: there is no longer a main.log file? +- For debug purposes, you can download the `main.log` file from the canister with: ``` python -m scripts.download --network local --canister llama_cpp --local-filename main.log main.log ``` diff --git a/dfx.json b/dfx.json index dddeeb0..03999f0 100644 --- a/dfx.json +++ b/dfx.json @@ -5,16 +5,6 @@ "type": "custom", "candid": "src/llama_cpp.did", "wasm": "build/llama_cpp.wasm" - }, - "llm_0": { - "type": "custom", - "candid": "src/llama_cpp.did", - "wasm": "build/llama_cpp.wasm" - }, - "llm_1": { - "type": "custom", - "candid": "src/llama_cpp.did", - "wasm": "build/llama_cpp.wasm" } }, "defaults": { diff --git a/dfx.multiple-llms.json b/dfx.multiple-llms.json new file mode 100644 index 0000000..aab85df --- /dev/null +++ b/dfx.multiple-llms.json @@ -0,0 +1,21 @@ +{ + "version": 1, + "canisters": { + "llm_0": { + "type": "custom", + "candid": "src/llama_cpp.did", + "wasm": "build/llama_cpp.wasm" + }, + "llm_1": { + "type": "custom", + "candid": "src/llama_cpp.did", + "wasm": "build/llama_cpp.wasm" + } + }, + "defaults": { + "build": { + "args": "", + "packtool": "" + } + } +} \ No newline at end of file diff --git a/scripts/3-upload-model.sh b/scripts/3-upload-model.sh index 5787ab7..6a6dc25 100755 --- a/scripts/3-upload-model.sh +++ b/scripts/3-upload-model.sh @@ -16,8 +16,9 @@ NUM_LLMS_DEPLOYED=1 # MODEL="models/stories260Ktok512.gguf" # MODEL="models/stories15Mtok4096.gguf" # MODEL="models/tensorblock/SmolLM2-135M-Instruct-GGUF/SmolLM2-135M-Instruct-Q4_K_M.gguf" -MODEL="models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf" -# MODEL="models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q2_K.gguf" +# MODEL="models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf" +MODEL="models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q2_K.gguf" +# MODEL="models/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF/DeepSeek-R1-Distill-Qwen-7B-Q2_K.gguf" # Parse command line arguments for network type while [ $# -gt 0 ]; do diff --git a/scripts/4-load-model.sh b/scripts/4-load-model.sh index 67cd9d0..0514dde 100755 --- a/scripts/4-load-model.sh +++ b/scripts/4-load-model.sh @@ -58,7 +58,7 @@ do echo "--------------------------------------------------" echo "Calling load_model for llm_$i" output=$(dfx canister call 
llm_$i load_model \ - '(record { args = vec {"--model"; "models/model.gguf";} })' \ + '(record { args = vec {"--model"; "models/model.gguf"; "--no-warmup";} })' \ --network "$NETWORK_TYPE") if ! echo "$output" | grep -q " Ok "; then diff --git a/scripts/5-set-max-tokens.sh b/scripts/5-set-max-tokens.sh index f3ee2e1..b5fcc94 100755 --- a/scripts/5-set-max-tokens.sh +++ b/scripts/5-set-max-tokens.sh @@ -12,7 +12,8 @@ NUM_LLMS_DEPLOYED=1 # MAX_TOKENS=128 # stories260Ktok512.gguf # MAX_TOKENS=60 # stories15Mtok4096.gguf # MAX_TOKENS=20 # SmolLM2-135M-Instruct-Q4_K_M.gguf -MAX_TOKENS=10 # qwen2.5-0.5b-instruct-q8_0.gguf +# MAX_TOKENS=10 # qwen2.5-0.5b-instruct-q8_0.gguf +MAX_TOKENS=2 # DeepSeek-R1-Distill-Qwen-1.5B-Q2_K.gguf # Parse command line arguments for network type while [ $# -gt 0 ]; do diff --git a/scripts/6-a-test-new-chat.sh b/scripts/6-new-chat.sh similarity index 100% rename from scripts/6-a-test-new-chat.sh rename to scripts/6-new-chat.sh diff --git a/scripts/7-deepseek-run-update-a.sh b/scripts/7-deepseek-run-update-a.sh new file mode 100755 index 0000000..aee1a5b --- /dev/null +++ b/scripts/7-deepseek-run-update-a.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/test.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +i=0 # llm_$i will be tested + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." + exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +echo " " +echo "--------------------------------------------------" +echo "Calling run_update for llm_$i" +# See model card at : https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF +dfx canister call llm_$i run_update '(record { args = vec {"--cache-type-k"; "q8_0"; "--no-warmup"; "--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; "<|User|>What is 1+1?<|Assistant|>";} })' diff --git a/scripts/7-deepseek-run-update-b.sh b/scripts/7-deepseek-run-update-b.sh new file mode 100755 index 0000000..d50f47e --- /dev/null +++ b/scripts/7-deepseek-run-update-b.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/test.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +i=0 # llm_$i will be tested + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." 
+ exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +echo " " +echo "--------------------------------------------------" +echo "Calling run_update for llm_$i" +dfx canister call llm_$i run_update '(record { args = vec {"--cache-type-k"; "q8_0"; "--no-warmup"; "--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; "";} })' diff --git a/scripts/6-b-test-run-update.sh b/scripts/7-qwen-run-update-a.sh similarity index 100% rename from scripts/6-b-test-run-update.sh rename to scripts/7-qwen-run-update-a.sh diff --git a/scripts/6-c-test-run-update.sh b/scripts/7-qwen-run-update-b.sh similarity index 100% rename from scripts/6-c-test-run-update.sh rename to scripts/7-qwen-run-update-b.sh diff --git a/scripts/prompt-design.ipynb b/scripts/prompt-design.ipynb index 44adc41..35c3e2d 100644 --- a/scripts/prompt-design.ipynb +++ b/scripts/prompt-design.ipynb @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -70,18 +70,19 @@ "outputs": [], "source": [ "# Define where the llama-cli is located, relative to this notebook\n", - "LLAMA_CLI_PATH = \"../../ggerganov_llama_b841d0.cpp/llama-cli\" # Current llama_cpp_canister version\n", - "# LLAMA_CLI_PATH = \"../../ggerganov_llama_latest.cpp/build/bin/llama-cli\"\n", + "# LLAMA_CLI_PATH = \"../../ggerganov_llama_b841d0.cpp/llama-cli\" # Current llama_cpp_canister version\n", + "LLAMA_CLI_PATH = \"../../ggerganov_llama_latest.cpp/build/bin/llama-cli\"\n", "\n", "# Select a model to use\n", - "MODEL = \"../models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf\"\n", + "# MODEL = \"../models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf\"\n", "# MODEL = \"../models/tensorblock/SmolLM2-135M-Instruct-GGUF/SmolLM2-135M-Instruct-Q8_0.gguf\"\n", "# MODEL = (\n", "# \"../models/tensorblock/SmolLM2-135M-Instruct-GGUF/SmolLM2-135M-Instruct-Q4_K_M.gguf\"\n", "# )\n", - "# MODEL = \"../models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q2_K.gguf\"\n", + "MODEL = \"../models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q2_K.gguf\"\n", "# MODEL = \"../models/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF/DeepSeek-R1-Distill-Qwen-7B-Q2_K.gguf\"\n", "\n", + "\n", "def run_llama_cpp(\n", " prompt,\n", " num_tokens,\n", @@ -101,9 +102,11 @@ " LLAMA_CLI_PATH,\n", " \"-m\",\n", " MODEL,\n", + " \"--no-warmup\",\n", + " \"-no-cnv\",\n", " # \"--simple-io\",\n", - " \"--no-display-prompt\", # only return the generated text, without special characters\n", - " # \"-sp\", # output special tokens\n", + " # \"--no-display-prompt\", # only return the generated text, without special characters\n", + " \"-sp\", # output special tokens\n", " \"-n\",\n", " f\"{num_tokens}\",\n", " \"--seed\",\n", @@ -133,7 +136,7 @@ " # Print the command on a single line for terminal use, preserving \\n\n", " print(\n", " \"\\nCommand:\\n\",\n", - " f\"{LLAMA_CLI_PATH} -m {MODEL} --no-display-prompt -n {num_tokens} --seed {seed} --temp {temp} -p '{prompt}'\".replace(\n", + " f\"{LLAMA_CLI_PATH} -m {MODEL} --no-warmup -no-cnv -sp -n {num_tokens} --seed {seed} --temp {temp} -p '{prompt}'\".replace(\n", " \"\\n\", \"\\\\n\"\n", " ),\n", " )\n", @@ -158,7 +161,7 @@ "# mirostat_lr = 0.1\n", "# mirostat_ent = 5.0\n", "\n", - "prompt = f\"<|im_start|>system\\nYou are a helpful 
assistant.<|im_end|>\\n<|im_start|>user\\ngive me a short introduction to LLMs.<|im_end|>\\n<|im_start|>assistant\\n\"\n", + "prompt = f\"<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n<|im_start|>user\\nWhat is the Proof-of-AI-Work Protocol?<|im_end|>\\n<|im_start|>assistant\\n\"\n", "response = run_llama_cpp(\n", " prompt,\n", " num_tokens,\n", diff --git a/test/test_canister_functions.py b/test/test_canister_functions.py index 23c6eee..009e7e7 100644 --- a/test/test_canister_functions.py +++ b/test/test_canister_functions.py @@ -60,46 +60,46 @@ def test__get_access_err(identity_anonymous: Dict[str, str], network: str) -> No expected_response = '(variant { Err = variant { Other = "Access Denied" } })' assert response == expected_response -def test__set_access_0(network: str) -> None: +def test__set_access_1(network: str) -> None: response = call_canister_api( dfx_json_path=DFX_JSON_PATH, canister_name=CANISTER_NAME, canister_method="set_access", - canister_argument='(record { level = 0 : nat16 })', + canister_argument='(record { level = 1 : nat16 })', network=network, ) - expected_response = '(variant { Ok = record { explanation = "Only controllers"; level = 0 : nat16;} })' + expected_response = '(variant { Ok = record { explanation = "All except anonymous"; level = 1 : nat16;} })' assert response == expected_response -def test__get_access_0(network: str) -> None: +def test__get_access_1(network: str) -> None: response = call_canister_api( dfx_json_path=DFX_JSON_PATH, canister_name=CANISTER_NAME, canister_method="get_access", - canister_argument='(record { level = 0 : nat16 })', + canister_argument='(record { level = 1 : nat16 })', network=network, ) - expected_response = '(variant { Ok = record { explanation = "Only controllers"; level = 0 : nat16;} })' + expected_response = '(variant { Ok = record { explanation = "All except anonymous"; level = 1 : nat16;} })' assert response == expected_response -def test__set_access_1(network: str) -> None: +def test__set_access_0(network: str) -> None: response = call_canister_api( dfx_json_path=DFX_JSON_PATH, canister_name=CANISTER_NAME, canister_method="set_access", - canister_argument='(record { level = 1 : nat16 })', + canister_argument='(record { level = 0 : nat16 })', network=network, ) - expected_response = '(variant { Ok = record { explanation = "All except anonymous"; level = 1 : nat16;} })' + expected_response = '(variant { Ok = record { explanation = "Only controllers"; level = 0 : nat16;} })' assert response == expected_response -def test__get_access_1(network: str) -> None: +def test__get_access_0(network: str) -> None: response = call_canister_api( dfx_json_path=DFX_JSON_PATH, canister_name=CANISTER_NAME, canister_method="get_access", - canister_argument='(record { level = 1 : nat16 })', + canister_argument='(record { level = 0 : nat16 })', network=network, ) - expected_response = '(variant { Ok = record { explanation = "All except anonymous"; level = 1 : nat16;} })' - assert response == expected_response \ No newline at end of file + expected_response = '(variant { Ok = record { explanation = "Only controllers"; level = 0 : nat16;} })' + assert response == expected_response From ba58d3bfcda1a133695badaf97c4a6e1c5f1104f Mon Sep 17 00:00:00 2001 From: icpp Date: Wed, 29 Jan 2025 16:01:30 -0500 Subject: [PATCH 15/25] Do not call load_model from upload.py It is better to separate this out into another step. 
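For reference, a minimal sketch of that separate step, run after the upload completes and mirroring the two calls removed from upload.py (assuming the canister is deployed as `llama_cpp` and the gguf was uploaded as `models/model.gguf`; adjust both names to your deployment):

```bash
# Load the uploaded gguf into Orthogonal Persisted memory
dfx canister call llama_cpp load_model '(record { args = vec {"--model"; "models/model.gguf"; "--no-warmup";} })'

# Check readiness for inference
dfx canister call llama_cpp ready
```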
--- scripts/upload.py | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/scripts/upload.py b/scripts/upload.py index de4e492..4bf1deb 100644 --- a/scripts/upload.py +++ b/scripts/upload.py @@ -65,8 +65,6 @@ def main() -> int: dfx_json_path = ROOT_PATH / "dfx.json" - uploading_gguf = local_filename_path.suffix.lower() == ".gguf" - print( f"Summary:" f"\n - canister_filename = {canister_filename}" @@ -77,7 +75,6 @@ def main() -> int: f"\n - canister_id = {canister_id}" f"\n - dfx_json_path = {dfx_json_path}" f"\n - candid_path = {candid_path}" - f"\n - uploading_gguf = {uploading_gguf}" ) # --------------------------------------------------------------------------- @@ -158,39 +155,7 @@ def main() -> int: offset += len(chunk) - # --------------------------------------------------------------------------- - # Do something special if we're uploading a llama_cpp_canister model (gguf) - if uploading_gguf: - # load the model inside the canister into Orthogonal Persisted memory - print( - "--\nInstruct canister to load the model, getting it ready for inference." - ) - response = canister_instance.load_model( - {"args": ["--model", canister_filename]} - ) - if "Ok" in response[0].keys(): - if DEBUG_VERBOSE >= 2: - print("OK!") - else: - print("Something went wrong:") - print(response) - sys.exit(1) - - # --------------------------------------------------------------------------- - # check readiness for inference - print("--\nChecking if the canister is ready for inference.") - response = canister_instance.ready() - if "Ok" in response[0].keys(): - if DEBUG_VERBOSE >= 2: - print("OK!") - else: - print("Something went wrong:") - print(response) - sys.exit(1) - - print(f"--\nCongratulations, canister {canister_name} is ready for inference!") - else: - print(f"--\nCongratulations, the file {local_filename_path} was uploaded!") + print(f"--\nCongratulations, the file {local_filename_path} was uploaded!") try: print("πŸ’― πŸŽ‰ 🏁") From b7bd4cb59075347dce0d7df577064e16014d8b67 Mon Sep 17 00:00:00 2001 From: icpp Date: Wed, 29 Jan 2025 16:02:56 -0500 Subject: [PATCH 16/25] Update comment --- src/main_.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/main_.cpp b/src/main_.cpp index 45815d7..21495d0 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -1128,10 +1128,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only common_sampler_free(smpl); // ICPP-PATCH-START - - // TODO-615212 -- Make sure this is correct - // LEAVE IT IN - // Do reset all other static memory + // Reset all static memory we do not want to carry over to the next update call reset_static_memory(); // ICPP-PATCH-END From bf6141d14da6088b6eb8a87d2c21e1892f2699a4 Mon Sep 17 00:00:00 2001 From: icpp Date: Fri, 31 Jan 2025 15:56:05 -0500 Subject: [PATCH 17/25] For clarity, dfx.json uses the .did file in 'build' folder --- dfx.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dfx.json b/dfx.json index 03999f0..a6a11ef 100644 --- a/dfx.json +++ b/dfx.json @@ -3,7 +3,7 @@ "canisters": { "llama_cpp": { "type": "custom", - "candid": "src/llama_cpp.did", + "candid": "build/llama_cpp.did", "wasm": "build/llama_cpp.wasm" } }, From 7ff8fa65c4c77105ef3880ae94a2a0ef33816965 Mon Sep 17 00:00:00 2001 From: icpp Date: Sat, 1 Feb 2025 09:48:10 -0500 Subject: [PATCH 18/25] remove_log_file Logging changed in this version, and we need to provide mechanism to use remove log files. 
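A short usage sketch of the new endpoint (assuming a canister named `llama_cpp` and a log file created by passing `--log-file main.log` to earlier calls; see the README changes below for the full workflow):

```bash
# Download the log file for inspection
python -m scripts.download --network local --canister llama_cpp --local-filename main.log main.log

# Remove the log file from the canister when done
dfx canister call llama_cpp remove_log_file '(record { args = vec {"--log-file"; "main.log"} })'
```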
--- README-0002-615212.md | 11 +++ README.md | 27 +++++++- native/test_qwen2.cpp | 11 +++ native/test_tiny_stories.cpp | 126 +++++++++++++++++++++++++---------- src/llama_cpp.did | 1 + src/main_.cpp | 3 + src/model.cpp | 1 + src/run.cpp | 100 +++++++++++++++++++-------- src/run.h | 1 + 9 files changed, 218 insertions(+), 63 deletions(-) diff --git a/README-0002-615212.md b/README-0002-615212.md index ba98c79..1b7bf47 100644 --- a/README-0002-615212.md +++ b/README-0002-615212.md @@ -188,6 +188,17 @@ make build-info-cpp-wasm - outcomment `set_process_priority` function #### llama_cpp_onicai_fork/common/log.cpp +- Add function `common_log_remove_file` to the public API + ```C++ + // ICPP-PATCH-START + // We need to add a public function to remove the log file from the canister + void common_log_remove_file(struct common_log * log) { + log->remove_file(); + } + // ICPP-PATCH-END + ``` +- Add public function `remove_file` to the struct common_log: + - Remove all threading logic #include #include diff --git a/README.md b/README.md index 5a3935f..b18ca50 100644 --- a/README.md +++ b/README.md @@ -207,10 +207,31 @@ The build of the wasm must be done on a `Mac` ! dfx canister call llama_cpp get_chats ``` -TODO-615212: there is no longer a main.log file? -- For debug purposes, you can download the `main.log` file from the canister with: - ``` +- For debug purposes, you can tell the canister to log to a file and download it afterwards: + + ```bash + # Start a new chat - this resets the prompt-cache for this conversation + dfx canister call llama_cpp new_chat '(record { args = vec {"--prompt-cache"; "prompt.cache"} })' + + # Pass '"--log-file"; "main.log";' to the `run_update` calls: + + # Repeat this call until `prompt_remaining` in the response is empty. + # This ingest the prompt into the prompt-cache, using multiple update calls + # Important: KEEP SENDING THE FULL PROMPT + dfx canister call llama_cpp run_update '(record { args = vec {"--log-file"; "main.log"; "--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n"; "-n"; "512" } })' + ... + + # Once `prompt_remaining` in the response is empty, repeat this call, with an empty prompt, until `generated_eog=true` + # Now the LLM is generating new tokens ! 
+ dfx canister call llama_cpp run_update '(record { args = vec {"--log-file"; "main.log"; "--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; ""; "-n"; "512" } })' + + + # Download the `main.log` file from the canister: python -m scripts.download --network local --canister llama_cpp --local-filename main.log main.log + + # Cleanup, by deleting both the log & prompt.cache files in the canister: + dfx canister call llama_cpp remove_prompt_cache '(record { args = vec {"--prompt-cache"; "prompt.cache"} })' + dfx canister call llama_cpp remove_log_file '(record { args = vec {"--log-file"; "main.log"} })' ``` ## Smoke testing the deployed LLM diff --git a/native/test_qwen2.cpp b/native/test_qwen2.cpp index aada78b..a7b99f2 100644 --- a/native/test_qwen2.cpp +++ b/native/test_qwen2.cpp @@ -161,4 +161,15 @@ void test_qwen2(MockIC &mockIC) { "4449444c026c01dd9ad28304016d710100020e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865", "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100850143616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f70726f6d70742e63616368652064656c65746564207375636365737366756c6c790000c8000000", silent_on_trap, my_principal); + + // ----------------------------------------------------------------------------- + // Remove the log-file file if it exists + // '(record { args = vec {"--log-file"; "main.log"} })' -> + // '(variant { Ok = record { status_code = 200 : nat16; output = "Cache file .canister_cache/expmt-gtxsw-inftj-ttabj-qhp5s-nozup-n3bbo-k7zvn-dg4he-knac3-lae/sessions/main.log deleted successfully"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' + mockIC.run_test( + std::string(__func__) + ": " + "remove_log_file " + model, + remove_prompt_cache, + "4449444c026c01dd9ad28304016d710100020a2d2d6c6f672d66696c65086d61696e2e6c6f67", + "4449444c026b01bc8a01016c06819e846471c897a79907718a88f7f00b719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e010000810143616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f6d61696e2e6c6f672064656c65746564207375636365737366756c6c790000c8000000", + silent_on_trap, my_principal); } \ No newline at end of file diff --git a/native/test_tiny_stories.cpp b/native/test_tiny_stories.cpp index 90cb12d..5e3408b 100644 --- a/native/test_tiny_stories.cpp +++ b/native/test_tiny_stories.cpp @@ -93,40 +93,80 @@ void test_tiny_stories(MockIC &mockIC) { // Let's have two chats with this model for (int i = 0; i < 2; ++i) { - // ----------------------------------------------------------------------------- - // Start a new chat, which will remove the prompt-cache file if it exists - // '(record { args = vec {"--prompt-cache"; "prompt.cache"} })' -> - // '(variant { Ok = record { status_code = 200 : nat16; output = "Ready to start a new chat for cache file .canister_cache/expmt-gtxsw-inftj-ttabj-qhp5s-nozup-n3bbo-k7zvn-dg4he-knac3-lae/sessions/prompt.cache"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' - mockIC.run_test( - std::string(__func__) + ": " + "new_chat " + std::to_string(i) + - " - " + model, - new_chat, - "4449444c026c01dd9ad28304016d710100020e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865", - 
"4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a01000101008e01526561647920746f2073746172742061206e6577206368617420666f722063616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f70726f6d70742e63616368650000c8000000", - silent_on_trap, my_principal); - - // ----------------------------------------------------------------------------- - // Generate tokens from prompt while saving everything to cache, - // without re-reading the model ! - // '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; "Joe loves writing stories"} })' - // -> ... - mockIC.run_test( - std::string(__func__) + ": " + "run_update for chat " + - std::to_string(i) + " - " + model, - run_update, - "4449444c026c01dd9ad28304016d7101000d0e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d70194a6f65206c6f7665732077726974696e672073746f72696573", - "", silent_on_trap, my_principal); - - // ----------------------------------------------------------------------------- - // Continue generating tokens while using & saving the cache, without re-reading the model - // '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; ""} })' -> - // -> ... - mockIC.run_test( - std::string(__func__) + ": " + "run_update for chat " + - std::to_string(i) + " continued - " + model, - run_update, - "4449444c026c01dd9ad28304016d7101000d0e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d7000", - "", silent_on_trap, my_principal); + if (i == 0) { + // ----------------------------------------------------------------------------- + // Without log file + // Start a new chat, which will reset the prompt-cache file + // '(record { args = vec {"--prompt-cache"; "prompt.cache"} })' -> + // '(variant { Ok = record { status_code = 200 : nat16; output = "Ready to start a new chat for cache file .canister_cache/expmt-gtxsw-inftj-ttabj-qhp5s-nozup-n3bbo-k7zvn-dg4he-knac3-lae/sessions/prompt.cache"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' + mockIC.run_test( + std::string(__func__) + ": " + "new_chat " + std::to_string(i) + + " - " + model, + new_chat, + "4449444c026c01dd9ad28304016d710100020e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865", + "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a01000101008e01526561647920746f2073746172742061206e6577206368617420666f722063616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f70726f6d70742e63616368650000c8000000", + silent_on_trap, my_principal); + + // ----------------------------------------------------------------------------- + // Generate tokens from prompt while saving everything to cache, + // without re-reading the model ! 
+ // '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; "Joe loves writing stories"} })' + // -> ... + mockIC.run_test( + std::string(__func__) + ": " + "run_update for chat " + + std::to_string(i) + " - " + model, + run_update, + "4449444c026c01dd9ad28304016d7101000d0e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d70194a6f65206c6f7665732077726974696e672073746f72696573", + "", silent_on_trap, my_principal); + + // ----------------------------------------------------------------------------- + // Continue generating tokens while using & saving the cache, without re-reading the model + // '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; ""} })' -> + // -> ... + mockIC.run_test( + std::string(__func__) + ": " + "run_update for chat " + + std::to_string(i) + " continued - " + model, + run_update, + "4449444c026c01dd9ad28304016d7101000d0e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d7000", + "", silent_on_trap, my_principal); + + } else { + // ----------------------------------------------------------------------------- + // With log file + // Start a new chat, which will reset both the prompt-cache and log-file files + // '(record { args = vec {"--log-file"; "main.log"; "--prompt-cache"; "prompt.cache"} })' -> + // '(variant { Ok = record { status_code = 200 : nat16; output = "Ready to start a new chat for cache file .canister_cache/expmt-gtxsw-inftj-ttabj-qhp5s-nozup-n3bbo-k7zvn-dg4he-knac3-lae/sessions/prompt.cache"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' + mockIC.run_test( + std::string(__func__) + ": " + "new_chat " + std::to_string(i) + + " - " + model, + new_chat, + "4449444c026c01dd9ad28304016d710100040a2d2d6c6f672d66696c65086d61696e2e6c6f670e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865", + "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a01000101008e01526561647920746f2073746172742061206e6577206368617420666f722063616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f70726f6d70742e63616368650000c8000000", + silent_on_trap, my_principal); + + // ----------------------------------------------------------------------------- + // Generate tokens from prompt while saving everything to cache, + // without re-reading the model ! + // '(record { args = vec {"--log-file"; "main.log"; "--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; "Joe loves writing stories"} })' + // -> ... 
+ mockIC.run_test( + std::string(__func__) + ": " + "run_update for chat " + + std::to_string(i) + " - " + model, + run_update, + "4449444c026c01dd9ad28304016d7101000f0a2d2d6c6f672d66696c65086d61696e2e6c6f670e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d70194a6f65206c6f7665732077726974696e672073746f72696573", + "", silent_on_trap, my_principal); + + // ----------------------------------------------------------------------------- + // Continue generating tokens while using & saving the cache, without re-reading the model + // '(record { args = vec {"--log-file"; "main.log"; "--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; ""} })' -> + // -> ... + mockIC.run_test( + std::string(__func__) + ": " + "run_update for chat " + + std::to_string(i) + " continued - " + model, + run_update, + "4449444c026c01dd9ad28304016d7101000f0a2d2d6c6f672d66696c65086d61696e2e6c6f670e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d7000", + "", silent_on_trap, my_principal); + } // ----------------------------------------------------------------------------- // Remove the prompt-cache file if it exists @@ -139,6 +179,24 @@ void test_tiny_stories(MockIC &mockIC) { "4449444c026c01dd9ad28304016d710100020e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865", "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100850143616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f70726f6d70742e63616368652064656c65746564207375636365737366756c6c790000c8000000", silent_on_trap, my_principal); + + // ----------------------------------------------------------------------------- + // Remove the log-file file if it exists + // '(record { args = vec {"--log-file"; "main.log"} })' -> response + std::string response; + if (i == 1) { + // '(variant { Ok = record { status_code = 200 : nat16; output = "Successfully removed log file: main.log"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' + response = + "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100275375636365737366756c6c792072656d6f766564206c6f672066696c653a206d61696e2e6c6f670000c8000000"; + } else { + response = + "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100275375636365737366756c6c792072656d6f766564206c6f672066696c653a206d61696e2e6c6f670000c8000000"; + } + mockIC.run_test( + std::string(__func__) + ": " + "remove_log_file " + model, + remove_log_file, + "4449444c026c01dd9ad28304016d710100020a2d2d6c6f672d66696c65086d61696e2e6c6f67", + response, silent_on_trap, my_principal); } } } \ No newline at end of file diff --git a/src/llama_cpp.did b/src/llama_cpp.did index 6c14fef..6d240f9 100644 --- a/src/llama_cpp.did +++ b/src/llama_cpp.did @@ -120,6 +120,7 @@ service : { run_query : (InputRecord) -> (OutputRecordResult) query; run_update : (InputRecord) -> (OutputRecordResult); remove_prompt_cache : (InputRecord) -> (OutputRecordResult); + remove_log_file : (InputRecord) -> (OutputRecordResult); // 
Chats retrieval get_chats : () -> (GetChatsRecordResult) query; diff --git a/src/main_.cpp b/src/main_.cpp index 21495d0..b61b793 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -1128,6 +1128,9 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only common_sampler_free(smpl); // ICPP-PATCH-START + // Close log file and reset pointers, so next call will start fresh, with or without logging + common_log_set_file(common_log_main(), nullptr); + // Reset all static memory we do not want to carry over to the next update call reset_static_memory(); // ICPP-PATCH-END diff --git a/src/model.cpp b/src/model.cpp index ddc9ec1..27ecee3 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -11,6 +11,7 @@ #include "arg.h" #include "common.h" +#include "log.h" #include #include diff --git a/src/run.cpp b/src/run.cpp index 79fa8d8..2183837 100644 --- a/src/run.cpp +++ b/src/run.cpp @@ -8,6 +8,7 @@ #include "utils.h" #include "arg.h" +#include "log.h" #include #include @@ -45,33 +46,38 @@ void new_chat() { CandidTypePrincipal caller = ic_api.get_caller(); std::string principal_id = caller.get_text(); - auto [argc, argv, args] = get_args_for_main(ic_api); - - // Create/reset a prompt-cache file to zero length, will reset the LLM state for that conversation - // Get the cache filename from --prompt-cache in args - common_params params; - if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, - print_usage)) { - error_msg = "Cannot parse args."; + // ----------------------------------------------------------- + // Create a new file to save this chat for this prinicipal + if (!db_chats_new(principal_id, error_msg)) { send_output_record_result_error_to_wire( ic_api, Http::StatusCode::InternalServerError, error_msg); return; } - // Create a new file to save this chat for this prinicipal - if (!db_chats_new(principal_id, error_msg)) { + // Each principal can only save N chats + if (!db_chats_clean(principal_id, error_msg)) { send_output_record_result_error_to_wire( ic_api, Http::StatusCode::InternalServerError, error_msg); return; } - // Each principal can only save N chats - if (!db_chats_clean(principal_id, error_msg)) { + // ----------------------------------------------------------- + // Parse the arguments + auto [argc, argv, args] = get_args_for_main(ic_api); + + // (-) gets the cache filename from --prompt-cache in args + // (-) opens log file from --log-file in args + common_params params; + if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, + print_usage)) { + error_msg = "Cannot parse args."; send_output_record_result_error_to_wire( ic_api, Http::StatusCode::InternalServerError, error_msg); return; } + // ----------------------------------------------------------- + // Create/reset a prompt-cache file to zero length, will reset the LLM state for that conversation // Each principal has their own cache folder std::string path_session = params.path_prompt_cache; std::string canister_path_session; @@ -111,6 +117,20 @@ void new_chat() { // Simpler message back to the wire msg = "Ready to start a new chat for cache file " + path_session; + // ----------------------------------------------------------- + // If --log-file is provided, the file was opened by common_params_parse + // Was it already closed, and common_log_main() does not work anymore??? 
+ // If so, then store --log-file value in params.log_file, and delete it here + // If not, then get the file handle from common_log_main() and empty the file + // + // When running native, the log file is only closed at the end... + // it is opened multiple times. Does that work OK ? + + // When running in the IC, the log file is ???? + + std::cout << "TODO"; + + // ----------------------------------------------------------- // Return output over the wire CandidTypeRecord r_out; r_out.append("status_code", CandidTypeNat16{Http::StatusCode::OK}); // 200 @@ -147,20 +167,6 @@ void remove_prompt_cache() { return; } - // // Create a new file to save this chat for this prinicipal - // if (!db_chats_new(principal_id, error_msg)) { - // send_output_record_result_error_to_wire( - // ic_api, Http::StatusCode::InternalServerError, error_msg); - // return; - // } - - // // Each principal can only save N chats - // if (!db_chats_clean(principal_id, error_msg)) { - // send_output_record_result_error_to_wire( - // ic_api, Http::StatusCode::InternalServerError, error_msg); - // return; - // } - // Each principal has their own cache folder std::string path_session = params.path_prompt_cache; std::string canister_path_session; @@ -207,6 +213,48 @@ void remove_prompt_cache() { ic_api.to_wire(CandidTypeVariant{"Ok", r_out}); } +void remove_log_file() { + IC_API ic_api(CanisterUpdate{std::string(__func__)}, false); + std::string error_msg; + if (!is_caller_whitelisted(ic_api, false)) { + error_msg = "Access Denied."; + send_output_record_result_error_to_wire( + ic_api, Http::StatusCode::Unauthorized, error_msg); + return; + } + + auto [argc, argv, args] = get_args_for_main(ic_api); + + // Process the args, which will instantiate the log singleton + common_params params; + if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, + print_usage)) { + error_msg = "Cannot parse args."; + send_output_record_result_error_to_wire( + ic_api, Http::StatusCode::InternalServerError, error_msg); + return; + } + + // Now we can remove the log file + std::string msg; + bool success = common_log_remove_file(common_log_main(), msg); + if (!success) { + send_output_record_result_error_to_wire( + ic_api, Http::StatusCode::InternalServerError, msg); + return; + } + + // Return output over the wire + CandidTypeRecord r_out; + r_out.append("status_code", CandidTypeNat16{Http::StatusCode::OK}); // 200 + r_out.append("conversation", CandidTypeText{""}); + r_out.append("output", CandidTypeText{msg}); + r_out.append("error", CandidTypeText{""}); + r_out.append("prompt_remaining", CandidTypeText{""}); + r_out.append("generated_eog", CandidTypeBool{false}); + ic_api.to_wire(CandidTypeVariant{"Ok", r_out}); +} + void run(IC_API &ic_api, const uint64_t &max_tokens) { std::string error_msg; if (!is_caller_whitelisted(ic_api, false)) { diff --git a/src/run.h b/src/run.h index 331ff00..9574865 100644 --- a/src/run.h +++ b/src/run.h @@ -8,6 +8,7 @@ void run_query() WASM_SYMBOL_EXPORTED("canister_query run_query"); void run_update() WASM_SYMBOL_EXPORTED("canister_update run_update"); void remove_prompt_cache() WASM_SYMBOL_EXPORTED("canister_update remove_prompt_cache"); +void remove_log_file() WASM_SYMBOL_EXPORTED("canister_update remove_log_file"); bool get_canister_path_session(const std::string &path_session, const std::string &principal_id, From 9c0fc1f3057d8acc618d187b6e40aa74c5c15077 Mon Sep 17 00:00:00 2001 From: icpp Date: Sat, 1 Feb 2025 09:57:17 -0500 Subject: [PATCH 19/25] Update native & pytests --- 
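Notes: the tests now cover the `remove_log_file` endpoint both natively and via pytest. A hypothetical local invocation of the updated pytest suite (assuming the test conftest exposes a `--network` option, as the `network` fixture suggests):

```bash
# Hypothetical: run the updated qwen2 tests against a locally deployed canister
pytest -v test/test_qwen2.py --network local
```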
native/test_qwen2.cpp | 6 +++--- native/test_tiny_stories.cpp | 15 ++++----------- test/test_qwen2.py | 10 ++++++++++ test/test_tiny_stories.py | 10 ++++++++++ 4 files changed, 27 insertions(+), 14 deletions(-) diff --git a/native/test_qwen2.cpp b/native/test_qwen2.cpp index a7b99f2..d48dc23 100644 --- a/native/test_qwen2.cpp +++ b/native/test_qwen2.cpp @@ -165,11 +165,11 @@ void test_qwen2(MockIC &mockIC) { // ----------------------------------------------------------------------------- // Remove the log-file file if it exists // '(record { args = vec {"--log-file"; "main.log"} })' -> - // '(variant { Ok = record { status_code = 200 : nat16; output = "Cache file .canister_cache/expmt-gtxsw-inftj-ttabj-qhp5s-nozup-n3bbo-k7zvn-dg4he-knac3-lae/sessions/main.log deleted successfully"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' + // '(variant { Ok = record { status_code = 200 : nat16; output = "Successfully removed log file: main.log"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' mockIC.run_test( std::string(__func__) + ": " + "remove_log_file " + model, - remove_prompt_cache, + remove_log_file, "4449444c026c01dd9ad28304016d710100020a2d2d6c6f672d66696c65086d61696e2e6c6f67", - "4449444c026b01bc8a01016c06819e846471c897a79907718a88f7f00b719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e010000810143616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f6d61696e2e6c6f672064656c65746564207375636365737366756c6c790000c8000000", + "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100275375636365737366756c6c792072656d6f766564206c6f672066696c653a206d61696e2e6c6f670000c8000000", silent_on_trap, my_principal); } \ No newline at end of file diff --git a/native/test_tiny_stories.cpp b/native/test_tiny_stories.cpp index 5e3408b..528a1af 100644 --- a/native/test_tiny_stories.cpp +++ b/native/test_tiny_stories.cpp @@ -182,21 +182,14 @@ void test_tiny_stories(MockIC &mockIC) { // ----------------------------------------------------------------------------- // Remove the log-file file if it exists - // '(record { args = vec {"--log-file"; "main.log"} })' -> response - std::string response; - if (i == 1) { - // '(variant { Ok = record { status_code = 200 : nat16; output = "Successfully removed log file: main.log"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' - response = - "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100275375636365737366756c6c792072656d6f766564206c6f672066696c653a206d61696e2e6c6f670000c8000000"; - } else { - response = - "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100275375636365737366756c6c792072656d6f766564206c6f672066696c653a206d61696e2e6c6f670000c8000000"; - } + // '(record { args = vec {"--log-file"; "main.log"} })' -> + // '(variant { Ok = record { status_code = 200 : nat16; output = "Successfully removed log file: main.log"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' mockIC.run_test( std::string(__func__) + ": " + "remove_log_file " + model, remove_log_file, "4449444c026c01dd9ad28304016d710100020a2d2d6c6f672d66696c65086d61696e2e6c6f67", - response, silent_on_trap, my_principal); + 
"4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100275375636365737366756c6c792072656d6f766564206c6f672066696c653a206d61696e2e6c6f670000c8000000", + silent_on_trap, my_principal); } } } \ No newline at end of file diff --git a/test/test_qwen2.py b/test/test_qwen2.py index 280516e..d593a24 100644 --- a/test/test_qwen2.py +++ b/test/test_qwen2.py @@ -121,4 +121,14 @@ def test__remove_prompt_cache(network: str) -> None: canister_argument='(record { args = vec {"--prompt-cache"; "prompt.cache"} })', network=network, ) + assert "(variant { Ok" in response + +def test__remove_log_file(network: str) -> None: + response = call_canister_api( + dfx_json_path=DFX_JSON_PATH, + canister_name=CANISTER_NAME, + canister_method="remove_log_file", + canister_argument='(record { args = vec {"--log-file"; "main.log"} })', + network=network, + ) assert "(variant { Ok" in response \ No newline at end of file diff --git a/test/test_tiny_stories.py b/test/test_tiny_stories.py index 838fa21..c17fbc6 100644 --- a/test/test_tiny_stories.py +++ b/test/test_tiny_stories.py @@ -141,4 +141,14 @@ def test__remove_prompt_cache(network: str) -> None: canister_argument='(record { args = vec {"--prompt-cache"; "prompt.cache"} })', network=network, ) + assert "(variant { Ok" in response + +def test__remove_log_file(network: str) -> None: + response = call_canister_api( + dfx_json_path=DFX_JSON_PATH, + canister_name=CANISTER_NAME, + canister_method="remove_log_file", + canister_argument='(record { args = vec {"--log-file"; "main.log"} })', + network=network, + ) assert "(variant { Ok" in response \ No newline at end of file From 4f98e55fa7609deb5840d6d1aa9d13f70d9e966e Mon Sep 17 00:00:00 2001 From: icpp Date: Sat, 1 Feb 2025 10:00:43 -0500 Subject: [PATCH 20/25] CI/CD - use different branch while working on upgrade --- .github/workflows/cicd-mac.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cicd-mac.yml b/.github/workflows/cicd-mac.yml index 068c29e..8fc7bc8 100644 --- a/.github/workflows/cicd-mac.yml +++ b/.github/workflows/cicd-mac.yml @@ -39,7 +39,8 @@ jobs: uses: actions/checkout@v4 with: repository: onicai/llama_cpp_onicai_fork - ref: onicai # Specify the branch name here + # ref: onicai # Specify the branch name here + ref: onicai-615212 # While working on the upgrade... 
path: src/llama_cpp_onicai_fork fetch-depth: 1 # Get just the last commit submodules: 'recursive' From 0280511e7d168f5e8a94b95da23ef97ff668a964 Mon Sep 17 00:00:00 2001 From: icpp Date: Sat, 1 Feb 2025 13:17:30 -0500 Subject: [PATCH 21/25] format include --- src/main_.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main_.cpp b/src/main_.cpp index b61b793..bc7ebca 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) #include From 30a6b8cc838e56b7026e480dab289d96dff2f103 Mon Sep 17 00:00:00 2001 From: icpp Date: Sun, 2 Feb 2025 09:07:08 -0500 Subject: [PATCH 22/25] Update READMEs --- README-0002-615212.md | 40 ++---------------------------------- README-contributors-guide.md | 10 ++++----- README.md | 3 ++- 3 files changed, 9 insertions(+), 44 deletions(-) diff --git a/README-0002-615212.md b/README-0002-615212.md index 1b7bf47..2e352a1 100644 --- a/README-0002-615212.md +++ b/README-0002-615212.md @@ -326,45 +326,13 @@ No updates needed for icpp-pro - #define GGML_DEFAULT_N_THREADS 1 ------------ -TODO: search in code files for: TODO-615212 - -(-) main_.cpp has a new static `global g_smpl`: - static common_sampler ** g_smpl; - - Q: Does this need to become a global variable, accessible from common.cpp ? - Like we did for g_model ? - - In `common/common.cpp` we added: - ``` - // ICPP-PATCH-START - #include "ic_api.h" - extern llama_model ** g_model; // The global variable from main_.cpp - // ICPP-PATCH-END - ``` - -(-) main_.cpp renamed type for `g_params`: - from: static gpt_params * g_params; - to : static common_params * g_params; - - Q: Does this need to become a global variable, accessible from common.cpp ? - Like we did for g_model ? - -(-) main_.cpp line 142: common_sampler * smpl = nullptr; - - Q: Does `smpl` need to become a static variable, like `model` & `ctx` ? - -(-) main_.cpp line 147: // Don't give error if embd_inp = session_tokens. All is OK to just keep going - - Q: Is this logic for prompt_remaining still valid? +TODOs: (-) LOG & LOG_TEE have been replaced by LOG, LOG_ERR, LOG_WRN, LOG_INF, LOG_CNT -> LOG is used just for Console/Stream Output -> LOG_xxx is used for ERR, WRN, INF, CNT --> Not sure yet where this goes... - Q1: Did we change anything to LOG & LOG_TEE to get it to work ? - Q2: Are we still using LOG & LOG_TEE ourselvs? If so, replace it. - Q3: Can we remove the LOG & LOG_TEE - Q4: Do we need to update the README about downloading different LOG files? + Q4: Update the README about downloading different LOG files? (-) llama-vocab.cpp --- This function is no longer there. Is tinystories still working? @@ -379,10 +347,6 @@ TODO: search in code files for: TODO-615212 } ``` -(-) TODO: `llama_cpp_onicai_fork/common/log.cpp` step through the logic - - Remove the pause() function - - Remove the cur.is_end function ? - (-) TODO: Monitor memory, and make sure that ctx is freed up... 
See free_ctx() method that has been outcommented in main_.cpp diff --git a/README-contributors-guide.md b/README-contributors-guide.md index 16415b0..504ef1d 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -34,11 +34,11 @@ Take following steps locally: - These are the git-sha values of the llama.cpp versions we branched from: - | upgrade # | llama.cpp sha | llama.cpp release-tag | - | --------- | ------------- | --------------------- | - | 0000 | 5cdb37 | - | - | 0001 | b841d0 | - | - | 0002 | 615212 | b4532 | + | upgrade # | llama.cpp sha | llama.cpp release-tag | date | + | --------- | ------------- | --------------------- | ---------- | + | 0000 | 5cdb37 | - | - | + | 0001 | b841d0 | - | - | + | 0002 | 615212 | b4532 | Feb 2 '25 | - Start with a fresh clone of llama_cpp_onicai_fork: diff --git a/README.md b/README.md index b18ca50..640914f 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,8 @@ The build of the wasm must be done on a `Mac` ! - Upload gguf file The canister is now up & running, and ready to be loaded with a gguf file. In - this example we use the powerful `qwen2.5-0.5b-instruct-q8_0.gguf` model. + this example we use the powerful `qwen2.5-0.5b-instruct-q8_0.gguf` model, but + you can use any model availabe in gguf format. - Download the model from huggingface: https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF From 81e00f04c1506a385c53da79f54f07981740fd55 Mon Sep 17 00:00:00 2001 From: icpp Date: Sun, 2 Feb 2025 09:14:43 -0500 Subject: [PATCH 23/25] Update table of llama.cpp upgrades --- README-contributors-guide.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README-contributors-guide.md b/README-contributors-guide.md index 504ef1d..3cbed06 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -36,9 +36,9 @@ Take following steps locally: | upgrade # | llama.cpp sha | llama.cpp release-tag | date | | --------- | ------------- | --------------------- | ---------- | - | 0000 | 5cdb37 | - | - | - | 0001 | b841d0 | - | - | | 0002 | 615212 | b4532 | Feb 2 '25 | + | 0001 | b841d0 | - | Oct 18 '24 | + | 0000 | 5cdb37 | - | Jul 21 '24 | - Start with a fresh clone of llama_cpp_onicai_fork: From 5228f7682340b76c4e40a9deae4cec9ded262730 Mon Sep 17 00:00:00 2001 From: icpp Date: Sun, 2 Feb 2025 09:28:43 -0500 Subject: [PATCH 24/25] Running LLMs on-chain solves your cybersecurity problem --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 640914f..3c35809 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,11 @@ ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) -`llama_cpp_canister` allows you to deploy [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp) as a Smart Contract on the Internet Computer. +`llama_cpp_canister` allows you to deploy [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp) as a Smart Contract on the Internet Computer, +and run an LLM on-chain as the brain for your on-chain AI Agents. +- Run any LLM on-chain via the gguf format πŸ”₯ +- Solves your cybersecurity problem πŸ” - MIT open source πŸ§‘β€πŸ’» - Well documented πŸ“ - Fully QA'd via CI/CD βœ… @@ -16,15 +19,12 @@ # Try it out -You can try out a deployed version at https://icgpt.onicai.com +You can try out a variety of fully on-chain LLMs at https://icgpt.onicai.com -# Need help? +# Need help or have feedback? 
❀️ -If you decide to use llama_cpp_canister in your ICP dApp, we want to help you. - -We do NOT consider llama_cpp_canister "our IP". It is for the broad benefit of DeAI on ICP, and we hope many of you will try it out and use it. - -Please join our [OpenChat C++ community](https://oc.app/community/cklkv-3aaaa-aaaar-ar7uq-cai/?ref=6e3y2-4yaaa-aaaaf-araya-cai) for any questions, discussions or feedback. ❀️ +- [OpenChat C++ community](https://oc.app/community/cklkv-3aaaa-aaaar-ar7uq-cai/?ref=6e3y2-4yaaa-aaaaf-araya-cai) +- [Forum: Llama.cpp on the Internet Computer](https://forum.dfinity.org/t/llama-cpp-on-the-internet-computer/33471?u=icpp) # Capabilities πŸ”₯ From 3ff73fafbe720d882eb76aba782bb599a9dd6d47 Mon Sep 17 00:00:00 2001 From: icpp Date: Sun, 2 Feb 2025 09:59:30 -0500 Subject: [PATCH 25/25] Upgrade to llama.cpp sha 615212 All done... --- .github/workflows/cicd-mac.yml | 4 ++-- README-contributors-guide.md | 7 ++++++- README.md | 6 +++++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cicd-mac.yml b/.github/workflows/cicd-mac.yml index 8fc7bc8..c383315 100644 --- a/.github/workflows/cicd-mac.yml +++ b/.github/workflows/cicd-mac.yml @@ -39,8 +39,8 @@ jobs: uses: actions/checkout@v4 with: repository: onicai/llama_cpp_onicai_fork - # ref: onicai # Specify the branch name here - ref: onicai-615212 # While working on the upgrade... + ref: onicai # Specify the branch name here + # ref: onicai-615212 # While working on the upgrade... path: src/llama_cpp_onicai_fork fetch-depth: 1 # Get just the last commit submodules: 'recursive' diff --git a/README-contributors-guide.md b/README-contributors-guide.md index 3cbed06..7c533b7 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -84,10 +84,15 @@ We need to rethink this logic, but for now it is ok... Do NOT merge the `onicai-` branch into the `onicai` branch, but replace it: ``` +# do the onicai branch management while master branch is checked out +git checkout master git branch -m onicai onicai- git branch -m onicai- onicai -git push origin onicai:onicai +git push --force origin onicai:onicai git push origin onicai-:onicai- +# +# Switch to the onicai branch, which now contains the version +git checkout onicai ``` ## llama_cpp_canister diff --git a/README.md b/README.md index 3c35809..b5d0d50 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,12 @@ You can try out a variety of fully on-chain LLMs at https://icgpt.onicai.com # Capabilities πŸ”₯ - Deploy any LLM available as a gguf file. -- Our largest so far is DeepSeek-R1 1.5B (See [X](https://x.com/onicaiHQ/status/1884339580851151089)). + *(The model must be able to produce at least 1 token per update call)* + +- Our largest so far is DeepSeek-R1 1.5B (See [X](https://x.com/onicaiHQ/status/1884339580851151089)). + + # Set up The build of the wasm must be done on a `Mac` !