From 5588beb6d8846d1e1da3fed3fb71902272eaf9ee Mon Sep 17 00:00:00 2001 From: icpp Date: Thu, 23 Jan 2025 11:06:14 -0500 Subject: [PATCH 01/25] Update to llama.cpp sha 615212 main_.cpp --- src/main_.cpp | 629 +++++++++++++++++++++++--------------------------- 1 file changed, 288 insertions(+), 341 deletions(-) diff --git a/src/main_.cpp b/src/main_.cpp index 29f781a..ae3b24d 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -5,15 +5,14 @@ #include "utils.h" #include "main_.h" // ICPP-PATCH-END - +#include "arg.h" #include "common.h" - #include "console.h" +#include "log.h" +#include "sampling.h" #include "llama.h" +#include "chat-template.hpp" -#include -#include -#include #include #include #include @@ -39,16 +38,28 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant"; + static llama_context ** g_ctx; // static llama_model ** g_model; // Make this a global variable, accessible from common.cpp llama_model ** g_model; -static gpt_params * g_params; +static common_sampler ** g_smpl; +static common_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; +static void print_usage(int argc, char ** argv) { + (void) argc; + + LOG("\nexample usage:\n"); + LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]); + LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); + LOG("\n"); +} + static bool file_exists(const std::string & path) { std::ifstream f(path.c_str()); return f.good(); @@ -61,61 +72,22 @@ static bool file_is_empty(const std::string & path) { return f.tellg() == 0; } -static void write_logfile( - const llama_context * ctx, const gpt_params & params, const llama_model * model, - const std::vector & input_tokens, const std::string & output, - const std::vector & output_tokens -) { - if (params.logdir.empty()) { - return; - } - - const std::string timestamp = string_get_sortable_timestamp(); - - const bool success = fs_create_directory_with_parents(params.logdir); - if (!success) { - fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", - __func__, params.logdir.c_str()); - return; - } - - const std::string logfile_path = params.logdir + timestamp + ".yml"; - FILE * logfile = fopen(logfile_path.c_str(), "w"); - - if (logfile == NULL) { - fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); - return; - } - - fprintf(logfile, "binary: main\n"); - char model_desc[128]; - llama_model_desc(model, model_desc, sizeof(model_desc)); - yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc); - - fprintf(logfile, "\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "# Generation Results #\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "\n"); - - yaml_dump_string_multiline(logfile, "output", output.c_str()); - yaml_dump_vector_int(logfile, "output_tokens", output_tokens); - - llama_dump_timing_info_yaml(logfile, ctx); - fclose(logfile); -} - //icpp-start NO CONSOLE // #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) // static void sigint_handler(int signo) { // if (signo == SIGINT) { // if (!is_interacting && g_params->interactive) { -// is_interacting = true; +// is_interacting = true; 
+// need_insert_eot = true; // } else { // console::cleanup(); -// printf("\n"); -// llama_print_timings(*g_ctx); -// write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); +// LOG("\n"); +// common_perf_print(*g_ctx, *g_smpl); +// +// // make sure all logs are flushed +// LOG("Interrupted by user\n"); +// common_log_pause(common_log_main()); +// // _exit(130); // } // } @@ -123,49 +95,26 @@ static void write_logfile( // #endif //icpp-end NO CONSOLE -static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) user_data; - LOG_TEE("%s", text); -} - -static std::string chat_add_and_format(struct llama_model * model, std::vector & chat_msgs, std::string role, std::string content) { - llama_chat_msg new_msg{role, content}; - auto formatted = llama_chat_format_single( - model, g_params->chat_template, chat_msgs, new_msg, role == "user"); - chat_msgs.push_back({role, content}); - return formatted; -} - int main_(int argc, char ** argv, std::string principal_id, bool load_model_only, std::string &icpp_error_msg, std::ostringstream &conversation_ss, std::ostringstream &output_ss, const uint64_t &max_tokens, std::string &prompt_remaining, bool &generated_eog) { std::cout << std::string(__func__) << " Called with following arguments: " << std::endl; std::cout << "- principal_id = " << principal_id << std::endl; std::cout << "- load_model_only = " << load_model_only << std::endl; std::cout << "- max_tokens = " << max_tokens << std::endl; - gpt_params params; + common_params params; g_params = ¶ms; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) { // ICPP-PATCH-START - icpp_error_msg = "Error in gpt_params_print_usage."; + icpp_error_msg = "Error in common_params_parse."; // ICPP-PATCH-END return 1; } - llama_sampling_params & sparams = params.sparams; + common_init(); -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("main", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); - llama_log_set(llama_log_callback_logTee, nullptr); -#endif // LOG_DISABLE_LOGS - - // TODO: Dump params ? 
- //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); + auto & sparams = params.sampling; // save choice to use color for later // (note for later: this is a slightly awkward choice) @@ -173,53 +122,42 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only //icpp-patch atexit([]() { console::cleanup(); }); if (params.logits_all) { - printf("\n************\n"); - printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("************\n"); + LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.embedding) { - printf("\n************\n"); - printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("************\n"); + LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.n_ctx != 0 && params.n_ctx < 8) { - LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__); params.n_ctx = 8; } if (params.rope_freq_base != 0.0) { - LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); } if (params.rope_freq_scale != 0.0) { - LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); - } - - LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); + LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - LOG_TEE("%s: seed = %u\n", __func__, params.seed); - - std::mt19937 rng(params.seed); + LOG_INF("%s: llama backend init\n", __func__); - LOG("%s: llama backend init\n", __func__); llama_backend_init(); llama_numa_init(params.numa); static llama_model * model; // ICPP-PATCH: use static to preserve accross calls static llama_context * ctx; // ICPP-PATCH: use static to preserve accross calls - llama_context * ctx_guidance = NULL; - std::vector chat_msgs; + common_sampler * smpl = nullptr; // ICPP-PATCH-START // Don't give error if embd_inp = session_tokens. 
All is OK to just keep going @@ -238,17 +176,19 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only g_model = &model; g_ctx = &ctx; + g_smpl = &smpl; + + std::vector chat_msgs; // load the model and apply lora adapter, if any - LOG("%s: load the model and apply lora adapter, if any\n", __func__); - std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (sparams.cfg_scale > 1.f) { - struct llama_context_params lparams = llama_context_params_from_gpt_params(params); - ctx_guidance = llama_new_context_with_model(model, lparams); - } + LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); + common_init_result llama_init = common_init_from_params(params); + + model = llama_init.model.get(); + ctx = llama_init.context.get(); if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n", __func__); + LOG_ERR("%s: error: unable to load model\n", __func__); // ICPP-PATCH-START icpp_error_msg = std::format("{}: error: unable to load model)", __func__); // ICPP-PATCH-END @@ -264,28 +204,81 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } // ICPP-PATCH-END - const int n_ctx_train = llama_n_ctx_train(model); + // ICPP-TODO-START: This section is completely new... + const llama_vocab * vocab = llama_model_get_vocab(model); + auto chat_templates = common_chat_templates_from_model(model, params.chat_template); + + LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); + + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free"); + + struct ggml_threadpool_params tpp_batch = + ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); + struct ggml_threadpool_params tpp = + ggml_threadpool_params_from_cpu_params(params.cpuparams); + + set_process_priority(params.cpuparams.priority); + + struct ggml_threadpool * threadpool_batch = NULL; + if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { + threadpool_batch = ggml_threadpool_new_fn(&tpp_batch); + if (!threadpool_batch) { + LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); + return 1; + } + + // Start the non-batch threadpool in the paused state + tpp.paused = true; + } + + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); + if (!threadpool) { + LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + return 1; + } + + llama_attach_threadpool(ctx, threadpool, threadpool_batch); + // ICPP-TODO-END + + const int n_ctx_train = llama_model_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); - LOG("n_ctx: %d\n", n_ctx); if (n_ctx > n_ctx_train) { - LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, n_ctx); + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); + } + + // auto enable conversation mode if chat template is available + const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.template_default; + if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) { + if (has_chat_template) { + LOG_INF("%s: chat template is available, enabling conversation mode (disable 
it with -no-cnv)\n", __func__); + params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; + } else { + params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; + } + } + + // in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning + if (params.conversation_mode && !has_chat_template) { + LOG_WRN("%s: chat template is not available or is not supported. This may cause the model to output suboptimal responses\n", __func__); } // print chat template example in conversation mode - if (params.conversation) { + if (params.conversation_mode) { if (params.enable_chat_template) { - LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); + LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(*chat_templates.template_default, params.use_jinja).c_str()); } else { - LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); + LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); } } // print system information { - LOG_TEE("\n"); - LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + LOG_INF("\n"); } std::string path_session = params.path_prompt_cache; @@ -299,45 +292,56 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // ICPP-PATCH-END std::vector session_tokens; - if (!path_session.empty()) { - LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + if (!path_session.empty()) { + LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); if (!file_exists(path_session)) { - LOG_TEE("%s: session file does not exist, will create.\n", __func__); + LOG_INF("%s: session file does not exist, will create.\n", __func__); } else if (file_is_empty(path_session)) { - LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__); + LOG_INF("%s: The session file is empty. 
A new session will be initialized.\n", __func__); } else { // The file exists and is not empty session_tokens.resize(n_ctx); size_t n_token_count_out = 0; if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { - LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str()); // ICPP-PATCH-START icpp_error_msg = std::format("{}: error: failed to load session file '{}')", __func__, path_session.c_str()); // ICPP-PATCH-END return 1; } session_tokens.resize(n_token_count_out); - LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); + LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); } } - const bool add_bos = llama_should_add_bos_token(model); + const bool add_bos = llama_vocab_get_add_bos(vocab); if (!llama_model_has_encoder(model)) { - GGML_ASSERT(llama_add_eos_token(model) != 1); + GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); } - LOG("add_bos: %d\n", add_bos); + + LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos); std::vector embd_inp; + auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) { + common_chat_msg new_msg{role, content}; + auto formatted = common_chat_format_single(*chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja); + chat_msgs.push_back({role, content}); + LOG_DBG("formatted: '%s'\n", formatted.c_str()); + return formatted; + }; + { - auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty()) - ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode + auto prompt = (params.conversation_mode && params.enable_chat_template) + // format the system prompt in conversation mode (fallback to default if empty) + ? chat_add_and_format("system", params.prompt.empty() ? 
DEFAULT_SYSTEM_MESSAGE : params.prompt) + // otherwise use the prompt as is : params.prompt; if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { - LOG_TEE("tokenize the prompt\n"); - embd_inp = ::llama_tokenize(ctx, prompt, true, true); + LOG_DBG("tokenize the prompt\n"); + embd_inp = common_tokenize(ctx, prompt, true, true); } else { - LOG_TEE("use session tokens\n"); + LOG_DBG("use session tokens\n"); embd_inp = session_tokens; // ICPP-PATCH-START embd_inp_is_session_tokens = true; @@ -350,9 +354,8 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } // ICPP-PATCH-END - LOG_TEE("prompt: \"%s\"\n", log_tostr(prompt)); - LOG_TEE("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); - LOG_TEE("# tokens: %s\n", std::to_string(embd_inp.size()).c_str()); + LOG_DBG("prompt: \"%s\"\n", prompt.c_str()); + LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str()); } // Should not run without any tokens @@ -367,34 +370,16 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } // Tokenize negative prompt - std::vector guidance_inp; - int guidance_offset = 0; - int original_prompt_len = 0; - if (ctx_guidance) { - LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); - - guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true); - LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); - - std::vector original_inp = ::llama_tokenize(ctx, params.prompt, true, true); - LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); - - original_prompt_len = original_inp.size(); - guidance_offset = (int)guidance_inp.size() - original_prompt_len; - LOG("original_prompt_len: %s", log_tostr(original_prompt_len)); - LOG("guidance_offset: %s", log_tostr(guidance_offset)); - } - // ICPP-PATCH-START // when the prompt is empty, then embd_inp = session_tokens, and all is OK to just keep going. 
if (!embd_inp_is_session_tokens) { // ICPP-PATCH-END if ((int) embd_inp.size() > n_ctx - 4) { - LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); // ICPP-PATCH-START icpp_error_msg = std::format("{}: error: prompt is too long ({} tokens, max {})", __func__, (int) embd_inp.size(), n_ctx - 4); // ICPP-PATCH-END - return 1; + return 1; } // ICPP-PATCH-START } @@ -411,29 +396,28 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only n_matching_session_tokens++; } if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { - LOG_TEE("%s: using full prompt from session file\n", __func__); + LOG_INF("%s: using full prompt from session file\n", __func__); } else if (n_matching_session_tokens >= embd_inp.size()) { - LOG_TEE("%s: session file has exact match for prompt!\n", __func__); + LOG_INF("%s: session file has exact match for prompt!\n", __func__); } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { - LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_matching_session_tokens, embd_inp.size()); } else { - LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n", + __func__, n_matching_session_tokens, embd_inp.size()); } // remove any "future" tokens that we might have inherited from the previous session llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); } - LOGLN( - "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu", - log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size()); + LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", + embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size()); // if we will use the cache for the full prompt without reaching the end of the cache, force // reevaluation of the last token to recalculate the cached logits if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { - LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); + LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1); session_tokens.resize(embd_inp.size() - 1); } @@ -445,7 +429,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only params.n_keep += add_bos; // always keep the BOS token } - if (params.conversation) { + if (params.conversation_mode) { params.interactive_first = true; } @@ -455,30 +439,20 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } if (params.verbose_prompt) { - LOG_TEE("\n"); - LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + 
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); - } - - if (ctx_guidance) { - LOG_TEE("\n"); - LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str()); - LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); - for (int i = 0; i < (int) guidance_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); - } + LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str()); } if (params.n_keep > add_bos) { - LOG_TEE("%s: static prompt based on n_keep: '", __func__); + LOG_INF("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str()); } - LOG_TEE("'\n"); + LOG_CNT("'\n"); } - LOG_TEE("\n"); + LOG_INF("\n"); } // ctrl+C handling @@ -500,47 +474,56 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only //icpp-patch-end if (params.interactive) { - LOG_TEE("%s: interactive mode on.\n", __func__); + LOG_INF("%s: interactive mode on.\n", __func__); if (!params.antiprompt.empty()) { for (const auto & antiprompt : params.antiprompt) { - LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); + LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str()); if (params.verbose_prompt) { - auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); + auto tmp = common_tokenize(ctx, antiprompt, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); } } } } if (params.input_prefix_bos) { - LOG_TEE("Input prefix with BOS\n"); + LOG_INF("Input prefix with BOS\n"); } if (!params.input_prefix.empty()) { - LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str()); if (params.verbose_prompt) { - auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); + auto tmp = common_tokenize(ctx, params.input_prefix, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); } } } if (!params.input_suffix.empty()) { - LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); + LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); if (params.verbose_prompt) { - auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); + auto tmp = common_tokenize(ctx, params.input_suffix, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); } } } } - LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); - LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str()); - LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + + smpl = common_sampler_init(model, sparams); + if (!smpl) { + LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); + return 1; + } 
+ + LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl)); + LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); + LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str()); + + LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); // group-attention state // number of grouped KV tokens so far (used only if params.grp_attn_n > 1) @@ -554,9 +537,9 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT - LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); + LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); } - LOG_TEE("\n\n"); + LOG_INF("\n"); if (params.interactive) { const char * control_message; @@ -568,11 +551,15 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } - LOG_TEE("== Running in interactive mode. ==\n"); + LOG_INF("== Running in interactive mode. ==\n"); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); + LOG_INF( " - Press Ctrl+C to interject at any time.\n"); #endif - LOG_TEE( "%s\n", control_message); + LOG_INF( "%s", control_message); + if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) { + LOG_INF( " - Using default system message. 
To change it, set a different value via -p PROMPT or -f FILE argument.\n"); + } + LOG_INF("\n"); is_interacting = params.interactive_first; } @@ -586,7 +573,6 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only int n_remain = params.n_predict; int n_consumed = 0; int n_session_consumed = 0; - int n_past_guidance = 0; // ICPP-PATCH-START // We can only handle max_tokens evaluations per call @@ -612,28 +598,21 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only display = params.display_prompt; std::vector embd; - std::vector embd_guidance; // tokenized antiprompts std::vector> antiprompt_ids; antiprompt_ids.reserve(params.antiprompt.size()); for (const std::string & antiprompt : params.antiprompt) { - antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true)); - } - - struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); - if (!ctx_sampling) { - fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); - exit(1); + antiprompt_ids.emplace_back(::common_tokenize(ctx, antiprompt, false, true)); } if (llama_model_has_encoder(model)) { int enc_input_size = embd_inp.size(); llama_token * enc_input_buf = embd_inp.data(); - if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); + if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) { + LOG_ERR("%s : failed to eval\n", __func__); // ICPP-PATCH-START icpp_error_msg = std::format("{}: error: failed to eval (-1-)", __func__); // ICPP-PATCH-END @@ -641,8 +620,8 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } llama_token decoder_start_token_id = llama_model_decoder_start_token(model); - if (decoder_start_token_id == -1) { - decoder_start_token_id = llama_token_bos(model); + if (decoder_start_token_id == LLAMA_TOKEN_NULL) { + decoder_start_token_id = llama_vocab_bos(vocab); } embd_inp.clear(); @@ -662,9 +641,8 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only embd.resize(max_embd_size); // console::set_display(console::error); - printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + LOG_WRN("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); // console::set_display(console::reset); - fflush(stdout); } if (ga_n == 1) { @@ -672,16 +650,22 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches - if (n_past + (int) embd.size() + std::max(0, guidance_offset) >= n_ctx) { + + if (n_past + (int) embd.size() >= n_ctx) { + if (!params.ctx_shift){ + LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__); + break; + } + if (params.n_predict == -2) { - LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; } const int n_left = n_past - params.n_keep; const int n_discard = n_left/2; - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); @@ -689,15 +673,11 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only n_past -= n_discard; - if (ctx_guidance) { - n_past_guidance -= n_discard; - } - - LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); + LOG_DBG("after swap: n_past = %d\n", n_past); - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); - LOG("clear session path\n"); + LOG_DBG("clear session path\n"); path_session.clear(); } } else { @@ -707,10 +687,10 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only const int bd = (ga_w/ga_n)*(ga_n - 1); const int dd = (ga_w/ga_n) - ib*bd - ga_w; - LOG("\n"); - LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); - LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); - LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); + LOG_DBG("\n"); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); + LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); @@ -720,7 +700,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only ga_i += ga_w/ga_n; - LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); + LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); } } @@ -757,49 +737,6 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } } - // evaluate tokens in batches - // embd is typically prepared beforehand to fit within a batch, but not always - if (ctx_guidance) { - int input_size = 0; - llama_token * input_buf = NULL; - - if (n_past_guidance < (int) 
guidance_inp.size()) { - // Guidance context should have the same data with these modifications: - // - // * Replace the initial prompt - // * Shift everything by guidance_offset - embd_guidance = guidance_inp; - if (embd.begin() + original_prompt_len < embd.end()) { - embd_guidance.insert( - embd_guidance.end(), - embd.begin() + original_prompt_len, - embd.end() - ); - } - - input_buf = embd_guidance.data(); - input_size = embd_guidance.size(); - - LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); - } else { - input_buf = embd.data(); - input_size = embd.size(); - } - - for (int i = 0; i < input_size; i += params.n_batch) { - int n_eval = std::min(input_size - i, params.n_batch); - if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); - // ICPP-PATCH-START - icpp_error_msg = std::format("{}: error: failed to eval (-2-)", __func__); - // ICPP-PATCH-END - return 1; - } - - n_past_guidance += n_eval; - } - } - for (int i = 0; i < (int) embd.size(); i += params.n_batch) { int n_eval = (int) embd.size() - i; if (n_eval > params.n_batch) { @@ -813,10 +750,10 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } // ICPP-PATCH-END - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); - if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); + if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) { + LOG_ERR("%s : failed to eval\n", __func__); // ICPP-PATCH-START icpp_error_msg = std::format("{}: error: failed to eval (-3-)", __func__); // ICPP-PATCH-END @@ -825,17 +762,17 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only n_past += n_eval; - LOG("n_past = %d\n", n_past); + LOG_DBG("n_past = %d\n", n_past); // Display total tokens alongside total time if (params.n_print > 0 && n_past % params.n_print == 0) { - LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); + LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); } // ICPP-PATCH-START // Keep track of the processed conversation tokens and the remaining prompt for (int j=0; jprev).c_str()); + // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); embd.push_back(id); @@ -904,16 +840,16 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // decrement remaining sampling budget --n_remain; - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { // some user input remains from prompt or interaction, forward it to processing - LOG_TEE("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false); + common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -938,7 +874,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only int n_prompt_tokens_remaining = 0; size_t iii = 0; for 
(auto id : embd_inp) { - const std::string token_str = llama_token_to_piece(ctx, id, true); // include special tokens + const std::string token_str = common_token_to_piece(ctx, id, true); // include special tokens if (iii < n_consumed) { prompt_consumed += token_str; } else { @@ -954,10 +890,10 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // display text if (input_echo && display) { for (auto id : embd) { - const std::string token_str = llama_token_to_piece(ctx, id, params.special); + const std::string token_str = common_token_to_piece(ctx, id, params.special); // Console/Stream Output - fprintf(stdout, "%s", token_str.c_str()); + LOG("%s", token_str.c_str()); // Record Displayed Tokens To Log // Note: Generated tokens are created one by one hence this check @@ -969,8 +905,6 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only output_tokens.push_back(id); output_ss << token_str; } - - fflush(stdout); } } @@ -985,7 +919,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // check for reverse prompt in the last n_prev tokens if (!params.antiprompt.empty()) { const int n_prev = 32; - const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev); + const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev); is_antiprompt = false; // Check if each of the reverse prompts appears at the end of the output. @@ -1007,7 +941,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } // check for reverse prompt using special tokens - llama_token last_token = llama_sampling_last(ctx_sampling); + llama_token last_token = common_sampler_last(smpl); for (std::vector ids : antiprompt_ids) { if (ids.size() == 1 && last_token == ids[0]) { if (params.interactive) { @@ -1019,52 +953,52 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } if (is_antiprompt) { - LOG("found antiprompt: %s\n", last_output.c_str()); + LOG_DBG("found antiprompt: %s\n", last_output.c_str()); } } // deal with end of generation tokens in interactive mode - if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) { - LOG("found an EOG token\n"); + if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) { + LOG_DBG("found an EOG token\n"); if (params.interactive) { if (!params.antiprompt.empty()) { // tokenize and inject first reverse prompt - const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true); + const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true); embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); is_antiprompt = true; } if (params.enable_chat_template) { - chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); + chat_add_and_format("assistant", assistant_ss.str()); } is_interacting = true; - printf("\n"); + LOG("\n"); } } // if current token is not EOG, we add it to current assistant message - if (params.conversation) { - auto id = llama_sampling_last(ctx_sampling); - assistant_ss << llama_token_to_piece(ctx, id, false); + if (params.conversation_mode) { + const auto id = common_sampler_last(smpl); + assistant_ss << common_token_to_piece(ctx, id, false); } if (n_past > 0 && is_interacting) { - LOG("waiting for user input\n"); + LOG_DBG("waiting for user input\n"); - if (params.conversation) { - printf("\n> "); + if (params.conversation_mode) { + LOG("\n> "); } if (params.input_prefix_bos) { - 
LOG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_token_bos(model)); + LOG_DBG("adding input prefix BOS token\n"); + embd_inp.push_back(llama_vocab_bos(vocab)); } std::string buffer; - if (!params.input_prefix.empty() && !params.conversation) { - LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); - printf("%s", params.input_prefix.c_str()); + if (!params.input_prefix.empty() && !params.conversation_mode) { + LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + LOG("%s", params.input_prefix.c_str()); } // color user input only @@ -1086,12 +1020,12 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // Entering a empty line lets the user pass control back if (buffer.length() > 1) { // append input suffix if any - if (!params.input_suffix.empty() && !params.conversation) { - LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); - printf("%s", params.input_suffix.c_str()); + if (!params.input_suffix.empty() && !params.conversation_mode) { + LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + LOG("%s", params.input_suffix.c_str()); } - LOG("buffer: '%s'\n", buffer.c_str()); + LOG_DBG("buffer: '%s'\n", buffer.c_str()); const size_t original_size = embd_inp.size(); @@ -1099,21 +1033,21 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only string_process_escapes(buffer); } - bool format_chat = params.conversation && params.enable_chat_template; + bool format_chat = params.conversation_mode && params.enable_chat_template; std::string user_inp = format_chat - ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer)) + ? chat_add_and_format("user", std::move(buffer)) : std::move(buffer); // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) - const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); - const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); + const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true); + const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat); + const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); // if user stop generation mid-way, we must add EOT to finish model's last response if (need_insert_eot && format_chat) { - llama_token eot = llama_token_eot(model); - embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot); + llama_token eot = llama_vocab_eot(vocab); + embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? 
llama_vocab_eos(vocab) : eot); need_insert_eot = false; } @@ -1124,16 +1058,16 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only for (size_t i = original_size; i < embd_inp.size(); ++i) { const llama_token token = embd_inp[i]; output_tokens.push_back(token); - output_ss << llama_token_to_piece(ctx, token); + output_ss << common_token_to_piece(ctx, token); } // reset assistant message assistant_ss.str(""); n_remain -= line_inp.size(); - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { - LOG("empty line, passing control back\n"); + LOG_DBG("empty line, passing control back\n"); } input_echo = false; // do not echo this again @@ -1141,7 +1075,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only if (n_past > 0) { if (is_interacting) { - llama_sampling_reset(ctx_sampling); + common_sampler_reset(smpl); } is_interacting = false; } @@ -1154,8 +1088,8 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // ICPP-PATCH-END // end of generation - if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) { - LOG_TEE(" [end of text]\n"); + if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) { + LOG(" [end of text]\n"); // break; // we do not break the loop here, but we do it above // once the eog token has been decoded and added to conversation_ss & session_tokens // ICPP-PATCH-START @@ -1179,17 +1113,19 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // ICPP-PATCH-START std::cout << "\nSaving " << std::to_string(session_tokens.size()) << " tokens to session file " << path_session << std::endl; // ICPP-PATCH-END - LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } - llama_print_timings(ctx); - write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); + LOG("\n\n"); + common_perf_print(ctx, smpl); - if (ctx_guidance) { llama_free(ctx_guidance); } + common_sampler_free(smpl); // ICPP-PATCH-START + // TODO-615212 -- This is old code that we had outcommented + // REMOVE // Do NOT free ctx & model storage // -> we made `ctx` & `model` data static, so they are maintained across calls to the LLM // -> we do NOT reset g_ctx & g_model @@ -1197,22 +1133,28 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // llama_free(ctx); // llama_free_model(model); + // TODO-615212 -- Make sure this is correct + // LEAVE IT IN // Do reset all other static memory reset_static_memory(); // ICPP-PATCH-END - llama_sampling_free(ctx_sampling); + // TODO-615212 -- Make sure this is now handled in common_sampler_free + // REMOVE + // llama_sampling_free(ctx_sampling); llama_backend_free(); -#ifndef LOG_DISABLE_LOGS - LOG_TEE("Log end\n"); -#endif // LOG_DISABLE_LOGS + ggml_threadpool_free_fn(threadpool); + ggml_threadpool_free_fn(threadpool_batch); return 0; } // ICPP-PATCH-START: // functions added for running on IC + +// TODO-615212 -- Make sure this is now handled in common_sampler_free +// REMOVE void free_ctx() { if (g_ctx && *g_ctx) { llama_free(*g_ctx); @@ -1220,6 +1162,9 @@ void free_ctx() { g_ctx = nullptr; } } + +// TODO-615212 -- Make sure this is correct +// LEAVE IT IN void free_model() { if (g_model && *g_model) 
{ llama_free_model(*g_model); @@ -1227,6 +1172,8 @@ void free_model() { g_model = nullptr; } } +// TODO-615212 -- Make sure this is correct +// LEAVE IT IN void reset_static_memory() { // Tip: to find what must be reset, use a native debug build and stop here // in vscode. Then check the static memory section in VARIABLES. From 3c82921abdbcc0a03e723075d6120d3dfbddf062 Mon Sep 17 00:00:00 2001 From: icpp Date: Thu, 23 Jan 2025 21:01:08 -0500 Subject: [PATCH 02/25] build-native : files compile --- .gitignore | 2 +- README-contributors-guide.md | 282 ++++++++++++++++++++++++++++------- README.md | 3 +- icpp.toml | 6 +- src/main_.cpp | 11 +- src/run.cpp | 15 +- 6 files changed, 249 insertions(+), 70 deletions(-) diff --git a/.gitignore b/.gitignore index 6a6ae91..da96b82 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ # Misc -llama_cpp_onicai_fork +llama_cpp_onicai_fork* *.code-workspace x y diff --git a/README-contributors-guide.md b/README-contributors-guide.md index ef460ba..0c5f74e 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -103,13 +103,28 @@ git push origin --tags Take following steps locally: - git fetch -- Copy `src/llama_cpp_onicai_fork` to `/llama_cpp_onica_fork_` - - This is just as a reference. We will remove this folder once all done. +- This is the git-sha of the llama.cpp versions we branched from: + - `615212` (git-sha-new) , with release-tag `b4532` + - `b841d0` (git-sha-old) , no release-tag + - `5cdb37` (git-sha-older), no release-tag + +- Start with a fresh clone of llama_cpp_onicai_fork: + ```bash + # From folder: llama_cpp_canister\src + + # Copy old version, as a reference to use with meld + # This is just as a reference. You can remove this folder once all done. + # (-) Make sure the current `onicai` branch is checked out. + # The one that branched off from `git-sha-old` + cp llama_cpp_onicai_fork llama_cpp_onicai_fork_ + + # Clone the new version in place + git clone git@github.com:onicai/llama_cpp_onicai_fork.git + ``` -- from master, create a new branch: `onicai-` +- In llama_cpp_onicai_fork, from master, create a new branch: `onicai-` - For `git-sha`, use the short commit sha from which we're branching. + For `git-sha-new`, use the short commit sha from which we're branching. ## Update all files @@ -118,63 +133,84 @@ listed in [icpp.toml](https://github.com/onicai/llama_cpp_canister/blob/main/icp header files. As you do your upgrade, modify the descriptions below, to help with the next upgrade: -We use `meld` for comparing the files. 
+We use `meld` for comparing the files: + +```bash +brew install --cask dehesselle-meld +``` ### cpp_paths #### main_.cpp -`meld main_.cpp llama_cpp_onicai_fork/examples/main/main.cpp` + +```bash +# from folder: llama_cpp_canister/src + +# To do the actual changes +meld main_.cpp llama_cpp_onicai_fork/examples/main/main.cpp + +# To check what has changed between and +meld llama_cpp_onicai_fork/examples/main/main.cpp llama_cpp_onicai_fork_/examples/main/main.cpp +``` - use `main_` instead of `main` -- A few items related to console & ctrl+C need to be outcommented +- A few items related to console, ctrl+C & threading need to be outcommented +- Added logic for running in a canister with multiple update calls #### llama_cpp_onicai_fork/src/llama.cpp +```bash +# from folder: llama_cpp_canister/src +# To do the actual changes +meld llama_cpp_onicai_fork/src/llama.cpp llama_cpp_onicai_fork_/src/llama.cpp +``` - add `#include "ic_api.h"` -- replace `throw std::runtime_error(format` with `IC_API::trap(std::string("RUNTIME ERROR: ") + format` -- replace `throw` with `IC_API::trap` +- replace `throw std::runtime_error` with `IC_API::trap` - outcomment `try - catch`. The program will abrupt in case of thrown exceptions. -- outcomment threading related items: - - `#include ` - - `#include ` - - `#include ` +- outcomment threading related items - outcomment these functions completely: - `llama_tensor_quantize_internal` - `llama_model_quantize_internal` #### llama_cpp_onicai_fork/src/llama-vocab.cpp +```bash +# from folder: llama_cpp_canister/src +meld llama_cpp_onicai_fork/src/llama-vocab.cpp llama_cpp_onicai_fork_/src/llama-vocab.cpp +``` - add `#include "ic_api.h"` -- replace `throw std::runtime_error(format` with `IC_API::trap(std::string("RUNTIME ERROR: ") + format` +- replace `throw std::runtime_error` with `IC_API::trap` - outcomment `try - catch`. The program will abrupt in case of thrown exceptions. -- add a check on `llama_token_bos(model)`, else the llama2.c models never stop generating: - ``` - bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { - return token != -1 && ( - token == llama_token_eos_impl(vocab) || - token == llama_token_eot_impl(vocab) || - token == llama_token_bos_impl(vocab) // ICPP-PATCH: the llama2.c model predicts bos without first predicting an eos - ); - } - ``` #### llama_cpp_onicai_fork/src/llama-grammar.cpp -No changes needed +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. #### llama_cpp_onicai_fork/src/llama-sampling.cpp -No changes needed +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` #### llama_cpp_onicai_fork/src/unicode-data.cpp - no modifications needed for the IC #### llama_cpp_onicai_fork/src/unicode.cpp - add `#include "ic_api.h"` -- replace `throw` with `IC_API::trap` +- replace `throw std::runtime_error` with `IC_API::trap` +- replace `throw std::invalid_argument` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. -#### llama_cpp_onicai_fork/common/json-schema-to-grammar.cpp +#### llama_cpp_onicai_fork/common/arg.cpp - add `#include "ic_api.h"` -- replace `throw` with `IC_API::trap` +- replace `throw std::runtime_error` with `IC_API::trap` +- replace `throw std::invalid_argument` with `IC_API::trap` +- return dummy values (unreachable) after each IC_API::trap - outcomment `try - catch`. 
The program will abrupt in case of thrown exceptions. +#### llama_cpp_onicai_fork/common/json-schema-to-grammar.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- replace `throw std::out_of_range` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. #### llama_cpp_onicai_fork/common/build-info.cpp - run this command to create it: @@ -182,60 +218,194 @@ No changes needed make build-info-cpp-wasm ``` -#### llama_cpp_onicai_fork/common/grammar-parser.cpp -- add `#include "ic_api.h"` -- replace `throw` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - #### llama_cpp_onicai_fork/common/sampling.cpp - add `#include "ic_api.h"` -- replace `throw` with `IC_API::trap` +- replace `throw std::runtime_error` with `IC_API::trap` #### llama_cpp_onicai_fork/common/common.cpp -- add `#include "ic_api.h"` -- replace `throw` with `IC_API::trap` -- outcomment all code related to `` +- add right below `#include llama.h`: +```C++ +// ICPP-PATCH-START +#include "ic_api.h" +extern llama_model ** g_model; // The global variable from main_.cpp +// ICPP-PATCH-END +``` +- replace `throw std::runtime_error` with `IC_API::trap` +- replace `throw std::invalid_argument` with `IC_API::trap` - outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - outcomment `std::getenv` + Compare to changes made last time (!) + +- outcomment all code related to ``: + Compare to changes made last time (!) + - cpu_get_num_physical_cores + +- outcomment #ifdef LLAMA_USE_CURL + Compare to changes made last time (!) +#### llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp +No updates needed for icpp-pro --- ### c_paths #### llama_cpp_onicai_fork/ggml/src/ggml.c -- outcomment all code related to signals +- outcomment all code related to signals & threading + - `#include "ggml-threading.h"` - `#include ` -- Many threading outcomments. -#### llama_cpp_onicai_fork/ggml/src/ggml-alloc.c -No updates needed for icpp-pro -#### llama_cpp_onicai_fork/ggml/src/ggml-backend.c +#### llama_cpp_onicai_fork/ggml/src/ggml-alloc.c No updates needed for icpp-pro #### llama_cpp_onicai_fork/ggml/src/ggml-quants.c No updates needed for icpp-pro -#### llama_cpp_onicai_fork/ggml/src/ggml-aarch64.c -No updates needed for icpp-pro +#### llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp +- outcomment all code related to threading --- ### headers to modify -#### llama_cpp_onicai_fork/common/log.h -- `#include ` -- Some other threading code - -#### llama_cpp_onicai_fork/common/common.h -- `#include ` +#### llama_cpp_onicai_fork/common/chat-template.hpp +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. ## llama_cpp_onicai_fork: replace `onicai` branch +TODO: RETHINK THIS LOGIC... 
+(-) Perhaps it is better to keep all the `onicai-` branches
+(-) And just change the default branch to `onicai-`
+
+That way:
+(-) when someone clones, they are at the correct branch
+(-) from the name, it is immediately clear what llama.cpp version was used
+(-) we preserve the full history
+
+---
 Do NOT merge the `onicai-` branch into the `onicai` branch, but replace it:
 ```
-git branch -m onicai onicai-
-git branch -m onicai- onicai
+git branch -m onicai onicai-
+git branch -m onicai- onicai
 git push origin onicai:onicai
-git push origin onicai-:onicai-
-```
\ No newline at end of file
+git push origin onicai-:onicai-
+```
+
+
+------------
+TODO: search in code files for: TODO-615212
+
+(-) main_.cpp includes a new file: `llama_cpp_onicai_fork/common/chat-template.hpp`
+    This is from Google, and a general chat_template, with tool calling !!!
+
+(-) main_.cpp has a new static global `g_smpl`:
+    static common_sampler ** g_smpl;
+
+    Q: Does this need to become a global variable, accessible from common.cpp ?
+       Like we did for g_model ?
+
+    In `common/common.cpp` we added:
+    ```
+    // ICPP-PATCH-START
+    #include "ic_api.h"
+    extern llama_model ** g_model; // The global variable from main_.cpp
+    // ICPP-PATCH-END
+    ```
+
+(-) main_.cpp renamed type for `g_params`:
+    from: static gpt_params * g_params;
+    to : static common_params * g_params;
+
+    Q: Does this need to become a global variable, accessible from common.cpp ?
+       Like we did for g_model ?
+
+(-) main_.cpp line 142: common_sampler * smpl = nullptr;
+
+    Q: Does `smpl` need to become a static variable, like `model` & `ctx` ?
+
+(-) main_.cpp line 147: // Don't give error if embd_inp = session_tokens. All is OK to just keep going
+
+    Q: Is this logic for prompt_remaining still valid?
+
+(-) main_.cpp line 208: // ICPP-TODO-START: This section is completely new...
+    COMPLETELY NEW SECTION FOR THREADPOOLs...
+
+(-) LOG & LOG_TEE have been replaced by LOG, LOG_ERR, LOG_WRN, LOG_INF, LOG_CNT
+    -> LOG is used just for Console/Stream Output
+    -> LOG_xxx is used for ERR, WRN, INF, CNT --> Not sure yet where this goes...
+
+    Q1: Did we change anything to LOG & LOG_TEE to get it to work ?
+    Q2: Are we still using LOG & LOG_TEE ourselves? If so, replace it.
+    Q3: Can we remove the LOG & LOG_TEE ?
+    Q4: Do we need to update the README about downloading different LOG files?
+
+(-) main_.cpp calls common_token_to_piece instead of llama_token_to_piece
+
+    Q: Is this a new file: common_token_to_piece
+    A: No, it is in common.cpp
+
+(-) main_.cpp calls common_tokenize instead of llama_tokenize
+
+    Q: Is this a new file: common_tokenize
+    A: No, it is in common.cpp
+
+(-) main_.cpp line 516, 826: New sampling subsystem !
+
+    Q: Are these new files:
+       - common_sampler_init
+       - common_sampler_sample
+       - common_sampler_accept
+    A: No, it is in sampling.cpp
+
+(-) main_.cpp line 1123: common_sampler_free(smpl)
+
+    We had outcommented code to NOT free the ctx & model storage:
+    // Do NOT free ctx & model storage
+    // -> we made `ctx` & `model` data static, so they are maintained across calls to the LLM
+    // -> we do NOT reset g_ctx & g_model
+    // -> we moved this into a free_model function, which can be called by canister's load_model
+    // llama_free(ctx);
+    // llama_free_model(model);
+
+    // TODO-615212 -- Make sure this is correct
+    // Do reset all other static memory
+    reset_static_memory();
+
+    Q1: Has this all moved into common_sampler_free ?
+
+    Q2: Update usage of the free_model function?
+
+    Q3: is reset_static_memory still correct ?
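+
+    Note (a sketch only, not existing code): IF `smpl` is kept as a static pointer and
+    exposed via `g_smpl` the same way `g_model` is, a cleanup helper analogous to the
+    existing free_ctx() / free_model() could look like the snippet below. The name
+    free_sampler and the non-static g_smpl are assumptions.
+    ```C++
+    // Hypothetical helper in main_.cpp, mirroring free_ctx() / free_model()
+    void free_sampler() {
+      if (g_smpl && *g_smpl) {
+        common_sampler_free(*g_smpl); // releases the sampler created by common_sampler_init
+        *g_smpl = nullptr;
+        g_smpl = nullptr;
+      }
+    }
+    ```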
+ + Q4: Is llama_sampling_free(ctx_sampling) now handled by common_sampler_free(smpl) ? + + +(-) llama-vocab.cpp --- This function is no longer there. Is tinystories still working? + + We had added a check on `llama_token_bos(model)`, else the llama2.c models never stop generating: + ``` + bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { + return token != -1 && ( + token == llama_token_eos_impl(vocab) || + token == llama_token_eot_impl(vocab) || + token == llama_token_bos_impl(vocab) // ICPP-PATCH: the llama2.c model predicts bos without first predicting an eos + ); + } + ``` +(-) NOTE: `common/grammar-parser.cpp` is no longer there. + It appears to be fully included in `src/llama-grammar.cpp` + +(-) NOTE: `llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp` used to be `llama_cpp_onicai_fork/ggml/src/ggml-backend.c` + +(-) NOTE: `llama_cpp_onicai_fork/ggml/src/ggml-aarch64.c` no longer exists + Previous update: No updates needed for icpp-pro + +(-) NOTE: `llama_cpp_onicai_fork/common/log.h` no update was needed this time: + Previous update: + - `#include ` + - Some other threading code + +(-) NOTE: `llama_cpp_onicai_fork/common/common.h` no update was needed this time: + Previous update: + - `#include ` \ No newline at end of file diff --git a/README.md b/README.md index 16a33a5..f373336 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ WARNING: Currently, the canister can only be build on a `Mac` ! cd src git clone git@github.com:onicai/llama_cpp_onicai_fork.git +TODO - DO WE STILL NEED THIS??? # Initialize the submodules of the llama_cpp_onicai_fork repo cd llama_cpp_onicai_fork git submodule init @@ -203,7 +204,7 @@ WARNING: Currently, the canister can only be build on a `Mac` ! # Remove the prompt cache when done - this keeps stable memory usage at a minimum dfx canister call llama_cpp remove_prompt_cache '(record { args = vec {"--prompt-cache"; "prompt.cache"} })' - + ``` Note: The sequence of update calls to the canister is required because the Internet Computer has a limitation diff --git a/icpp.toml b/icpp.toml index 09b11e7..2e42b58 100644 --- a/icpp.toml +++ b/icpp.toml @@ -10,9 +10,10 @@ cpp_paths = [ "src/llama_cpp_onicai_fork/src/unicode.cpp", "src/llama_cpp_onicai_fork/common/json-schema-to-grammar.cpp", "src/llama_cpp_onicai_fork/common/build-info.cpp", - "src/llama_cpp_onicai_fork/common/grammar-parser.cpp", "src/llama_cpp_onicai_fork/common/sampling.cpp", "src/llama_cpp_onicai_fork/common/common.cpp", + "src/llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp", + "src/llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp", "src/*.cpp", ] cpp_include_dirs = [ @@ -20,6 +21,7 @@ cpp_include_dirs = [ "src/llama_cpp_onicai_fork/include", "src/llama_cpp_onicai_fork/src", "src/llama_cpp_onicai_fork/ggml/include", + "src/llama_cpp_onicai_fork/ggml/src", "src/llama_cpp_onicai_fork/common", ] # NOTE: Adding compile flag "-msimd128" might be too much. 
It will compile everything with simd @@ -33,9 +35,7 @@ cpp_link_flags = [] c_paths = [ "src/llama_cpp_onicai_fork/ggml/src/ggml.c", "src/llama_cpp_onicai_fork/ggml/src/ggml-alloc.c", - "src/llama_cpp_onicai_fork/ggml/src/ggml-backend.c", "src/llama_cpp_onicai_fork/ggml/src/ggml-quants.c", - "src/llama_cpp_onicai_fork/ggml/src/ggml-aarch64.c", ] c_include_dirs = [ "src/llama_cpp_onicai_fork", diff --git a/src/main_.cpp b/src/main_.cpp index ae3b24d..f7e7f6a 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -361,10 +361,10 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // Should not run without any tokens if (embd_inp.empty()) { if (add_bos) { - embd_inp.push_back(llama_token_bos(model)); - LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + embd_inp.push_back(llama_vocab_bos(vocab)); + LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); } else { - LOG_TEE("error: input is empty\n"); + LOG_ERR("input is empty\n"); return -1; } } @@ -719,7 +719,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // ICPP-PATCH-START // Keep track of the processed conversation tokens and the remaining prompt int id = embd[i]; - const std::string token_str = llama_token_to_piece(ctx, id, params.special); + const std::string token_str = common_token_to_piece(ctx, id, params.special); conversation_ss << token_str; // if (prompt_remaining.find(token_str) == 0) { @@ -1164,10 +1164,11 @@ void free_ctx() { } // TODO-615212 -- Make sure this is correct +// llama_model_free is a replacement for llama_free_model // LEAVE IT IN void free_model() { if (g_model && *g_model) { - llama_free_model(*g_model); + llama_model_free(*g_model); *g_model = nullptr; g_model = nullptr; } diff --git a/src/run.cpp b/src/run.cpp index dda1c0e..36e4c4a 100644 --- a/src/run.cpp +++ b/src/run.cpp @@ -7,6 +7,8 @@ #include "max_tokens.h" #include "utils.h" +#include "arg.h" + #include #include #include @@ -26,6 +28,11 @@ (-) run_update */ + +static void print_usage(int argc, char ** argv) { + // do nothing function +} + void new_chat() { IC_API ic_api(CanisterUpdate{std::string(__func__)}, false); std::string error_msg; @@ -43,8 +50,8 @@ void new_chat() { // Create/reset a prompt-cache file to zero length, will reset the LLM state for that conversation // Get the cache filename from --prompt-cache in args - gpt_params params; - if (!gpt_params_parse(argc, argv.data(), params)) { + common_params params; + if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, print_usage)) { error_msg = "Cannot parse args."; send_output_record_result_error_to_wire( ic_api, Http::StatusCode::InternalServerError, error_msg); @@ -131,8 +138,8 @@ void remove_prompt_cache() { auto [argc, argv, args] = get_args_for_main(ic_api); // Get the cache filename from --prompt-cache in args - gpt_params params; - if (!gpt_params_parse(argc, argv.data(), params)) { + common_params params; + if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, print_usage)) { error_msg = "Cannot parse args."; send_output_record_result_error_to_wire( ic_api, Http::StatusCode::InternalServerError, error_msg); From 5cd89f3da15f73ce5d54b8f21c15f936d098cc58 Mon Sep 17 00:00:00 2001 From: icpp Date: Thu, 23 Jan 2025 21:01:27 -0500 Subject: [PATCH 03/25] format --- src/run.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/run.cpp b/src/run.cpp index 
36e4c4a..a30991d 100644 --- a/src/run.cpp +++ b/src/run.cpp @@ -28,8 +28,7 @@ (-) run_update */ - -static void print_usage(int argc, char ** argv) { +static void print_usage(int argc, char **argv) { // do nothing function } @@ -51,7 +50,8 @@ void new_chat() { // Create/reset a prompt-cache file to zero length, will reset the LLM state for that conversation // Get the cache filename from --prompt-cache in args common_params params; - if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, print_usage)) { + if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, + print_usage)) { error_msg = "Cannot parse args."; send_output_record_result_error_to_wire( ic_api, Http::StatusCode::InternalServerError, error_msg); @@ -139,7 +139,8 @@ void remove_prompt_cache() { // Get the cache filename from --prompt-cache in args common_params params; - if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, print_usage)) { + if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, + print_usage)) { error_msg = "Cannot parse args."; send_output_record_result_error_to_wire( ic_api, Http::StatusCode::InternalServerError, error_msg); From 5ea6d68d651b6a7e8f0fbe47de7a2f4eede27a93 Mon Sep 17 00:00:00 2001 From: icpp Date: Fri, 24 Jan 2025 12:45:13 -0500 Subject: [PATCH 04/25] mockic.exe builds !! --- README-contributors-guide.md | 69 +++++++++++++++++++++++++++++++++--- icpp.toml | 15 ++++++++ 2 files changed, 80 insertions(+), 4 deletions(-) diff --git a/README-contributors-guide.md b/README-contributors-guide.md index 0c5f74e..0cf06d1 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -190,6 +190,16 @@ meld llama_cpp_onicai_fork/src/llama-vocab.cpp llama_cpp_onicai_fork_ + #include + #### llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp No updates needed for icpp-pro @@ -264,6 +308,12 @@ No updates needed for icpp-pro #### llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp - outcomment all code related to threading +#### llama_cpp_onicai_fork/ggml/src/ggml-backend-reg.cpp +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/gguf.cpp +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + --- ### headers to modify @@ -296,9 +346,6 @@ git push origin onicai-:onicai- ------------ TODO: search in code files for: TODO-615212 -(-) main_.cpp includes a new file: `llama_cpp_onicai_fork/common/chat-template.hpp` - This is from Google, and a general chat_template, with tool calling !!! - (-) main_.cpp has a new static `global g_smpl`: static common_sampler ** g_smpl; @@ -393,6 +440,20 @@ TODO: search in code files for: TODO-615212 ); } ``` + +(-) DEBUG: `llama_cpp_onicai_fork/common/log.cpp` step through the logic + - verify the outcommented logic makes sense, or if we should just + completely remove the pause() & resume() functions. + +---------------------------------------------------------- +NOTES: + +(-) main_.cpp includes a new file: `llama_cpp_onicai_fork/common/chat-template.hpp` + This is from Google, and a general chat_template, with tool calling !!! + +(-) All the LLM architectures supported by llama_cpp_canister are listed in + `src/llama_cpp_onicai_fork/src/llama-arch.cpp` + (-) NOTE: `common/grammar-parser.cpp` is no longer there. 
It appears to be fully included in `src/llama-grammar.cpp` diff --git a/icpp.toml b/icpp.toml index 2e42b58..b52633a 100644 --- a/icpp.toml +++ b/icpp.toml @@ -6,14 +6,29 @@ cpp_paths = [ "src/llama_cpp_onicai_fork/src/llama-vocab.cpp", "src/llama_cpp_onicai_fork/src/llama-grammar.cpp", "src/llama_cpp_onicai_fork/src/llama-sampling.cpp", + "src/llama_cpp_onicai_fork/src/llama-impl.cpp", + "src/llama_cpp_onicai_fork/src/llama-context.cpp", + "src/llama_cpp_onicai_fork/src/llama-arch.cpp", + "src/llama_cpp_onicai_fork/src/llama-kv-cache.cpp", + "src/llama_cpp_onicai_fork/src/llama-chat.cpp", + "src/llama_cpp_onicai_fork/src/llama-mmap.cpp", + "src/llama_cpp_onicai_fork/src/llama-model.cpp", + "src/llama_cpp_onicai_fork/src/llama-batch.cpp", + "src/llama_cpp_onicai_fork/src/llama-adapter.cpp", + "src/llama_cpp_onicai_fork/src/llama-model-loader.cpp", + "src/llama_cpp_onicai_fork/src/llama-hparams.cpp", "src/llama_cpp_onicai_fork/src/unicode-data.cpp", "src/llama_cpp_onicai_fork/src/unicode.cpp", + "src/llama_cpp_onicai_fork/common/arg.cpp", "src/llama_cpp_onicai_fork/common/json-schema-to-grammar.cpp", "src/llama_cpp_onicai_fork/common/build-info.cpp", "src/llama_cpp_onicai_fork/common/sampling.cpp", "src/llama_cpp_onicai_fork/common/common.cpp", + "src/llama_cpp_onicai_fork/common/log.cpp", "src/llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp", "src/llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp", + "src/llama_cpp_onicai_fork/ggml/src/ggml-backend-reg.cpp", + "src/llama_cpp_onicai_fork/ggml/src/gguf.cpp", "src/*.cpp", ] cpp_include_dirs = [ From cc8123f66f14f095348c30aaf70742f30a9c21bb Mon Sep 17 00:00:00 2001 From: icpp Date: Sat, 25 Jan 2025 06:30:15 -0500 Subject: [PATCH 05/25] register CPU backend --- README-contributors-guide.md | 31 +++++++++++++++++++++++-------- icpp.toml | 18 ++++++++++++------ 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/README-contributors-guide.md b/README-contributors-guide.md index 0cf06d1..d573c19 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -290,6 +290,21 @@ extern llama_model ** g_model; // The global variable from main_.cpp #### llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp No updates needed for icpp-pro +#### llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp +- outcomment all code related to threading + +#### llama_cpp_onicai_fork/ggml/src/ggml-backend-reg.cpp +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/gguf.cpp +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.cpp +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +No updates needed for icpp-pro + --- ### c_paths @@ -305,14 +320,11 @@ No updates needed for icpp-pro #### llama_cpp_onicai_fork/ggml/src/ggml-quants.c No updates needed for icpp-pro -#### llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp -- outcomment all code related to threading - -#### llama_cpp_onicai_fork/ggml/src/ggml-backend-reg.cpp +#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.c No updates needed for icpp-pro -#### llama_cpp_onicai_fork/ggml/src/gguf.cpp -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. 
+#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-quants.c +No updates needed for icpp-pro --- ### headers to modify @@ -442,9 +454,12 @@ TODO: search in code files for: TODO-615212 ``` (-) DEBUG: `llama_cpp_onicai_fork/common/log.cpp` step through the logic - - verify the outcommented logic makes sense, or if we should just - completely remove the pause() & resume() functions. + - Remove the pause() function + - Remove the cur.is_end function ? +(-) Monitor memory, and make sure that ctx is freed up... + See free_ctx() method that has been outcommented in main_.cpp + ---------------------------------------------------------- NOTES: diff --git a/icpp.toml b/icpp.toml index b52633a..6daeacc 100644 --- a/icpp.toml +++ b/icpp.toml @@ -29,6 +29,8 @@ cpp_paths = [ "src/llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp", "src/llama_cpp_onicai_fork/ggml/src/ggml-backend-reg.cpp", "src/llama_cpp_onicai_fork/ggml/src/gguf.cpp", + "src/llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.cpp", + "src/llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-traits.cpp", "src/*.cpp", ] cpp_include_dirs = [ @@ -38,6 +40,7 @@ cpp_include_dirs = [ "src/llama_cpp_onicai_fork/ggml/include", "src/llama_cpp_onicai_fork/ggml/src", "src/llama_cpp_onicai_fork/common", + "src/llama_cpp_onicai_fork/ggml/src/ggml-cpu", ] # NOTE: Adding compile flag "-msimd128" might be too much. It will compile everything with simd # Alternative is to add it at granular level in the code, like: @@ -45,21 +48,24 @@ cpp_include_dirs = [ # void __attribute__((target("simd128"))) simd_function() { # // SIMD-specific code here # } -cpp_compile_flags = ["-DNDEBUG"] +cpp_compile_flags = ["-DNDEBUG", "-DGGML_USE_CPU"] cpp_link_flags = [] c_paths = [ "src/llama_cpp_onicai_fork/ggml/src/ggml.c", "src/llama_cpp_onicai_fork/ggml/src/ggml-alloc.c", "src/llama_cpp_onicai_fork/ggml/src/ggml-quants.c", + "src/llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.c", + "src/llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-quants.c", ] c_include_dirs = [ "src/llama_cpp_onicai_fork", "src/llama_cpp_onicai_fork/include", - "src/llama_cpp_onicai_fork/ggml/src", - "src/llama_cpp_onicai_fork/ggml/include", "src/llama_cpp_onicai_fork/common", + "src/llama_cpp_onicai_fork/ggml/include", + "src/llama_cpp_onicai_fork/ggml/src", + "src/llama_cpp_onicai_fork/ggml/src/ggml-cpu", ] -c_compile_flags = ["-DNDEBUG", "-msimd128"] +c_compile_flags = ["-DNDEBUG", "-msimd128", "-DGGML_USE_CPU"] post_wasm_function = "scripts.optimize_wasm.main" [build-native] @@ -72,8 +78,8 @@ cpp_paths = [ # "src/llama_cpp_onicai_fork/common/console.cpp", ] cpp_include_dirs = [] -cpp_compile_flags = ["-DNDEBUG"] +cpp_compile_flags = ["-DNDEBUG", "-DGGML_USE_CPU"] cpp_link_flags = [] c_paths = [] c_include_dirs = [] -c_compile_flags = ["-DNDEBUG"] +c_compile_flags = ["-DNDEBUG", "-DGGML_USE_CPU"] From 47d0a555653fd355c1bcfc9d14b96c931a6909ab Mon Sep 17 00:00:00 2001 From: icpp Date: Mon, 27 Jan 2025 11:28:38 -0500 Subject: [PATCH 06/25] Update memory management for model into Orthogonal Persistence --- README-contributors-guide.md | 50 +++++++++++++++++++--- src/main_.cpp | 81 ++++++++++++++---------------------- src/main_.h | 4 +- src/model.cpp | 45 ++++++++++++++------ src/run.cpp | 14 +++++++ 5 files changed, 124 insertions(+), 70 deletions(-) diff --git a/README-contributors-guide.md b/README-contributors-guide.md index d573c19..171eaf9 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -263,12 +263,40 @@ make build-info-cpp-wasm #### 
llama_cpp_onicai_fork/common/common.cpp - add right below `#include llama.h`: -```C++ -// ICPP-PATCH-START -#include "ic_api.h" -extern llama_model ** g_model; // The global variable from main_.cpp -// ICPP-PATCH-END -``` + ```C++ + // ICPP-PATCH-START + #include "ic_api.h" + extern llama_model ** g_model; // The global variable from main_.cpp + // ICPP-PATCH-END + ``` +- In common_init_result, skip loading the model if the --model parameter is not provided: + ```C++ + // ICPP-PATCH-START + // Skip loading the model if the --model parameter is not provided + if (!params.model.empty()) { + // ICPP-PATCH-END + + ... + model = ... + ... + + // ICPP-PATCH-START + // Skip loading the model if the --model parameter is not provided + } else { + // Access the model through g_model and assign it to the local variable + model = *g_model; + } + // ICPP-PATCH-END + ``` +- In common_init_result, do NOT transfer ownership of the model pointer: + ```C++ + // ICPP-PATCH-START: + // 'reset' transfers ownership of the model pointer to the std::unique_ptr iparams.model + // We do NOT want the model to be freed when the unique_ptr goes out of scope + // iparams.model.reset(model); + // ICPP-PATCH-END + ``` + - replace `throw std::runtime_error` with `IC_API::trap` - replace `throw std::invalid_argument` with `IC_API::trap` - outcomment `try - catch`. The program will abrupt in case of thrown exceptions. @@ -329,6 +357,16 @@ No updates needed for icpp-pro --- ### headers to modify +#### llama_cpp_onicai_fork/common/common.h +- Modify this: +``` +// ICPP-PATCH-START +// We do NOT load a default model into the canister +// #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" +#define DEFAULT_MODEL_PATH "" +// ICPP-PATCH-END +``` + #### llama_cpp_onicai_fork/common/chat-template.hpp - replace `throw std::runtime_error` with `IC_API::trap` - outcomment `try - catch`. The program will abrupt in case of thrown exceptions. diff --git a/src/main_.cpp b/src/main_.cpp index f7e7f6a..1b87dca 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -156,7 +156,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only llama_numa_init(params.numa); static llama_model * model; // ICPP-PATCH: use static to preserve accross calls - static llama_context * ctx; // ICPP-PATCH: use static to preserve accross calls + llama_context * ctx; common_sampler * smpl = nullptr; // ICPP-PATCH-START @@ -166,14 +166,6 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // Keep track of the prompt portion not yet processed prompt_remaining.clear(); - // Skip loading the model if the --model parameter is not provided - // if (!params.model.empty()) { // TODO: REMOVE THIS: WE MOVED THIS CHECK INTO llama_init_from_gpt_params - free_ctx(); - if (!params.model.empty()) { - free_model(); - } - // ICPP-PATCH-END - g_model = &model; g_ctx = &ctx; g_smpl = &smpl; @@ -194,17 +186,23 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // ICPP-PATCH-END return 1; } + // ICPP-PATCH-START - // Skip loading the model if the --model parameter is not provided - // } // TODO: REMOVE THIS: WE MOVED THIS CHECK INTO llama_init_from_gpt_params - // And return if we are asked to ONLY load the model + // Transfer the ownership of the model pointer. so it persists across calls in Orthogonal Persistence. + // We manually take control over the memory management of the model pointer, using icpp_free_model() to free it. 
+ // NOTE: The release() method of std::unique_ptr relinquishes ownership of the managed + // object and returns the raw pointer to it. + // After the call to release(), the std::unique_ptr becomes empty + // (i.e., it no longer manages any object). + model = llama_init.model.release(); + + // Return if we are asked to ONLY load the model if (load_model_only) { return 0; } // ICPP-PATCH-END - // ICPP-TODO-START: This section is completely new... const llama_vocab * vocab = llama_model_get_vocab(model); auto chat_templates = common_chat_templates_from_model(model, params.chat_template); @@ -240,7 +238,6 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } llama_attach_threadpool(ctx, threadpool, threadpool_batch); - // ICPP-TODO-END const int n_ctx_train = llama_model_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); @@ -1124,24 +1121,12 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // ICPP-PATCH-START - // TODO-615212 -- This is old code that we had outcommented - // REMOVE - // Do NOT free ctx & model storage - // -> we made `ctx` & `model` data static, so they are maintained across calls to the LLM - // -> we do NOT reset g_ctx & g_model - // -> we moved this into a free_model function, which can be called by canister's load_model - // llama_free(ctx); - // llama_free_model(model); - // TODO-615212 -- Make sure this is correct // LEAVE IT IN // Do reset all other static memory reset_static_memory(); // ICPP-PATCH-END - // TODO-615212 -- Make sure this is now handled in common_sampler_free - // REMOVE - // llama_sampling_free(ctx_sampling); llama_backend_free(); ggml_threadpool_free_fn(threadpool); @@ -1153,42 +1138,40 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // ICPP-PATCH-START: // functions added for running on IC -// TODO-615212 -- Make sure this is now handled in common_sampler_free -// REMOVE -void free_ctx() { - if (g_ctx && *g_ctx) { - llama_free(*g_ctx); - *g_ctx = nullptr; - g_ctx = nullptr; - } -} - -// TODO-615212 -- Make sure this is correct -// llama_model_free is a replacement for llama_free_model -// LEAVE IT IN -void free_model() { +// Function to be called by the canister to free the model which is persisted in Orthogonal Persisted memory +void icpp_free_model() { if (g_model && *g_model) { llama_model_free(*g_model); *g_model = nullptr; g_model = nullptr; } } -// TODO-615212 -- Make sure this is correct -// LEAVE IT IN + void reset_static_memory() { - // Tip: to find what must be reset, use a native debug build and stop here - // in vscode. Then check the static memory section in VARIABLES. 
+ /* Tip: to find what must be reset, use a native debug build and stop here + in lldb: + + lldb ./build-native/mockic.exe + (lldb) breakpoint set --name reset_static_memory + (lldb) run + (lldb) target variable + */ // Avoid dangling pointers in static memory // -> The data pointed to is re-created each call - // -> The data pointed to is cleared automatic, because it is non-static - g_output_tokens = nullptr; + // -> The data pointed to is cleared automatic, because: + // (-) it is a smart pointer (std::unique_ptr) + // (-) it is non-static + + g_ctx = nullptr; + g_smpl = nullptr; g_params = nullptr; - g_input_tokens = nullptr; g_output_ss = nullptr; - + g_output_tokens = nullptr; + g_input_tokens = nullptr; + // Do not carry over any other values in static memory - is_interacting = false; need_insert_eot = false; + is_interacting = false; } // ICPP-PATCH-END diff --git a/src/main_.h b/src/main_.h index bb36c28..03e4494 100644 --- a/src/main_.h +++ b/src/main_.h @@ -6,6 +6,6 @@ int main_(int argc, char **argv, std::string principal_id, bool load_model_only, std::string &icpp_error_msg, std::ostringstream &conversation_ss, std::ostringstream &output_ss, const uint64_t &max_tokens, std::string &prompt_remaining, bool &generated_eog); -void free_ctx(); -void free_model(); + +void icpp_free_model(); void reset_static_memory(); \ No newline at end of file diff --git a/src/model.cpp b/src/model.cpp index 88bf04c..db259ee 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -9,11 +9,18 @@ #include "upload.h" #include "utils.h" +#include "common.h" +#include "arg.h" + #include #include #include "ic_api.h" +static void print_usage(int argc, char **argv) { + // do nothing function +} + void load_model() { IC_API ic_api(CanisterUpdate{std::string(__func__)}, false); if (!is_caller_a_controller(ic_api)) return; @@ -21,14 +28,33 @@ void load_model() { CandidTypePrincipal caller = ic_api.get_caller(); std::string principal_id = caller.get_text(); + std::string error_msg; + // Get the data from the wire and prepare arguments for main_ auto [argc, argv, args] = get_args_for_main(ic_api); - // Lets go. - ready_for_inference = true; + common_params params; + if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, + print_usage)) { + error_msg = "Cannot parse args."; + send_output_record_result_error_to_wire( + ic_api, Http::StatusCode::InternalServerError, error_msg); + return; + } + + if (!params.model.empty()) { + // We're going to load a new model, first free the Orthogonally Persisted memory of a previously loaded model + icpp_free_model(); + } else { + error_msg = "--model not provided in args. 
Do not know what model to load."; + send_output_record_result_error_to_wire( + ic_api, Http::StatusCode::InternalServerError, error_msg); + return; + } + - // First free the OP memory of a previously loaded model - free_model(); + // First free the Orthogonally Persisted memory of a previously loaded model + icpp_free_model(); // Call main_, just like it is called in the llama-cli app std::string icpp_error_msg; @@ -43,15 +69,8 @@ void load_model() { // Exit if there was an error if (result != 0) { - CandidTypeRecord r_out; - r_out.append("status_code", - CandidTypeNat16{Http::StatusCode::InternalServerError}); // 500 - r_out.append("conversation", CandidTypeText{""}); - r_out.append("output", CandidTypeText{""}); - r_out.append("error", CandidTypeText{icpp_error_msg}); - r_out.append("prompt_remaining", CandidTypeText{""}); - r_out.append("generated_eog", CandidTypeBool{generated_eog}); - ic_api.to_wire(CandidTypeVariant{"Err", r_out}); + send_output_record_result_error_to_wire( + ic_api, Http::StatusCode::InternalServerError, icpp_error_msg); return; } diff --git a/src/run.cpp b/src/run.cpp index a30991d..25e0e14 100644 --- a/src/run.cpp +++ b/src/run.cpp @@ -222,6 +222,20 @@ void run(IC_API &ic_api, const uint64_t &max_tokens) { // Get the data from the wire and prepare arguments for main_ auto [argc, argv, args] = get_args_for_main(ic_api); + common_params params; + if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, + print_usage)) { + error_msg = "Cannot parse args."; + send_output_record_result_error_to_wire( + ic_api, Http::StatusCode::InternalServerError, error_msg); + return; + } + + // If we're going to load a new model, first free the Orthogonally Persisted memory of a previously loaded model + if (!params.model.empty()) { + icpp_free_model(); + } + // Call main_, just like it is called in the llama-cli app std::string icpp_error_msg; std::ostringstream From 23bbf40bcf1fa9de85a2580cf8738c8921220261 Mon Sep 17 00:00:00 2001 From: icpp Date: Mon, 27 Jan 2025 11:28:55 -0500 Subject: [PATCH 07/25] Format --- src/model.cpp | 3 +-- src/run.cpp | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/model.cpp b/src/model.cpp index db259ee..540c047 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -9,8 +9,8 @@ #include "upload.h" #include "utils.h" -#include "common.h" #include "arg.h" +#include "common.h" #include #include @@ -52,7 +52,6 @@ void load_model() { return; } - // First free the Orthogonally Persisted memory of a previously loaded model icpp_free_model(); diff --git a/src/run.cpp b/src/run.cpp index 25e0e14..79fa8d8 100644 --- a/src/run.cpp +++ b/src/run.cpp @@ -235,7 +235,7 @@ void run(IC_API &ic_api, const uint64_t &max_tokens) { if (!params.model.empty()) { icpp_free_model(); } - + // Call main_, just like it is called in the llama-cli app std::string icpp_error_msg; std::ostringstream From 7b79e6f0179e4911f163c2cb4fd0165f5c95b6eb Mon Sep 17 00:00:00 2001 From: icpp Date: Mon, 27 Jan 2025 12:48:49 -0500 Subject: [PATCH 08/25] Default behavior: -no-cnv --- src/main_.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/main_.cpp b/src/main_.cpp index 1b87dca..e7c97ba 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -250,8 +250,13 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.template_default; if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) { if (has_chat_template) { - 
LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__); - params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; + // ICPP-PATCH-START + // conversation mode is not supported in a canister. Do not turn it on by default. + // LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__); + // params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; + LOG_INF("%s: chat template is available, but since canisters do not support conversation mode, we use -no-cnv by default.)\n", __func__); + params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; + // ICPP-PATCH-END } else { params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; } From 528b5eda86d9c2cc8e5553e408762d96cfbff5b3 Mon Sep 17 00:00:00 2001 From: icpp Date: Mon, 27 Jan 2025 12:49:53 -0500 Subject: [PATCH 09/25] Upgrade to icpp-pro 5.0.2 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4af6c53..cf4595d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,6 @@ -r scripts/requirements.txt -r src/llama_cpp_onicai_fork/requirements.txt -icpp-pro>=5.0.1 +icpp-pro>=5.0.2 ic-py==1.0.1 binaryen.py \ No newline at end of file From 2be9f53249cf9d90efc8afe328a2e79f5931e138 Mon Sep 17 00:00:00 2001 From: icpp Date: Tue, 28 Jan 2025 06:36:24 -0500 Subject: [PATCH 10/25] wasm now builds --- README-contributors-guide.md | 76 ++++++++++++++++++++++++++++++++++-- README.md | 6 --- src/main_.cpp | 5 ++- 3 files changed, 76 insertions(+), 11 deletions(-) diff --git a/README-contributors-guide.md b/README-contributors-guide.md index 171eaf9..48d98af 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -235,6 +235,14 @@ meld llama_cpp_onicai_fork/src/llama-vocab.cpp llama_cpp_onicai_fork_>> validation_result; + // ICPP-PATCH-END + ... several other references to validation_result + ``` #### llama_cpp_onicai_fork/src/llama-hparams.cpp - no modifications needed for the IC @@ -244,6 +252,11 @@ meld llama_cpp_onicai_fork/src/llama-vocab.cpp llama_cpp_onicai_fork_ @@ -322,13 +336,30 @@ No updates needed for icpp-pro - outcomment all code related to threading #### llama_cpp_onicai_fork/ggml/src/ggml-backend-reg.cpp -No updates needed for icpp-pro +- Update dl_handle_deleter, to avoid a call to dlclose that should never happen + The linker ends up with undefined if we don't outcomment it + ```C++ + #include "ic_api.h" + struct dl_handle_deleter { + void operator()(void * handle) { + // ICPP-PATCH-START + // We are NOT dynamically loading any backend + // SO WE SHOULD NEVER GET HERE + // Avoid linker error by outcommenting this, but inserting a runtime trap + // dlclose(handle); + IC_API::trap("THIS SHOULD NEVER HAPPEN - dl_handle_deleter::operator() called"); + // ICPP-PATCH-END + } + }; + ``` #### llama_cpp_onicai_fork/ggml/src/gguf.cpp - outcomment `try - catch`. The program will abrupt in case of thrown exceptions. 
#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.cpp -No updates needed for icpp-pro +- outcomment all code related to signals & threading: + - `#include "ggml-threading.h"` + - `#include ` #### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-traits.cpp No updates needed for icpp-pro @@ -357,6 +388,40 @@ No updates needed for icpp-pro --- ### headers to modify +#### llama_cpp_onicai_fork/src/llama-model-loader.h +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` + +#### llama_cpp_onicai_fork/src/minja.hpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- re-define two functions: + ```C++ + // ICPP-PATCH-START + // throw not supported, using IC_API::trap instead, which expects a string + // std::runtime_error unexpected(const TemplateToken & token) const { + // return std::runtime_error("Unexpected " + TemplateToken::typeToString(token.type) + // + error_location_suffix(*template_str, token.location.pos)); + // } + // std::runtime_error unterminated(const TemplateToken & token) const { + // return std::runtime_error("Unterminated " + TemplateToken::typeToString(token.type) + // + error_location_suffix(*template_str, token.location.pos)); + // } + std::string unexpected(const TemplateToken & token) const { + return ("Unexpected " + TemplateToken::typeToString(token.type) + + error_location_suffix(*template_str, token.location.pos)); + } + std::string unterminated(const TemplateToken & token) const { + return ("Unterminated " + TemplateToken::typeToString(token.type) + + error_location_suffix(*template_str, token.location.pos)); + } + // ICPP-PATCH-END + ``` +- replace `throw unterminated(**start)` with `IC_API::trap(unterminated(**start))` +- replace `throw unexpected(**(it-1))` with `IC_API::trap(unexpected(**(it-1)))` +- replace `throw unexpected(**(it))` with `IC_API::trap(unexpected(**(it)))` +- outcomment try-catch + #### llama_cpp_onicai_fork/common/common.h - Modify this: ``` @@ -371,6 +436,9 @@ No updates needed for icpp-pro - replace `throw std::runtime_error` with `IC_API::trap` - outcomment `try - catch`. The program will abrupt in case of thrown exceptions. +#### llama_cpp_onicai_fork/ggml/include/ggml.h +- #define GGML_DEFAULT_N_THREADS 1 + ## llama_cpp_onicai_fork: replace `onicai` branch TODO: RETHINK THIS LOGIC... diff --git a/README.md b/README.md index f373336..bcd0397 100644 --- a/README.md +++ b/README.md @@ -46,12 +46,6 @@ WARNING: Currently, the canister can only be build on a `Mac` ! # Into the ./src folder cd src git clone git@github.com:onicai/llama_cpp_onicai_fork.git - -TODO - DO WE STILL NEED THIS??? 
- # Initialize the submodules of the llama_cpp_onicai_fork repo - cd llama_cpp_onicai_fork - git submodule init - git submodule update ``` - Create the file src/llama_cpp_onicai_fork/common/build-info.cpp diff --git a/src/main_.cpp b/src/main_.cpp index e7c97ba..45815d7 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -217,7 +217,10 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams); - set_process_priority(params.cpuparams.priority); + // ICPP-PATCH-START + // This is not supported in a canister + // set_process_priority(params.cpuparams.priority); + // ICPP-PATCH-END struct ggml_threadpool * threadpool_batch = NULL; if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { From d5a48a697b1ff891d15c58d3eee615805e13d914 Mon Sep 17 00:00:00 2001 From: icpp Date: Tue, 28 Jan 2025 06:42:33 -0500 Subject: [PATCH 11/25] free model only once --- src/model.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/model.cpp b/src/model.cpp index 540c047..ddc9ec1 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -52,9 +52,6 @@ void load_model() { return; } - // First free the Orthogonally Persisted memory of a previously loaded model - icpp_free_model(); - // Call main_, just like it is called in the llama-cli app std::string icpp_error_msg; std::ostringstream conversation_ss; From 063b605b32198e22937c6c82d6f1af21e515de8e Mon Sep 17 00:00:00 2001 From: icpp Date: Tue, 28 Jan 2025 09:51:20 -0500 Subject: [PATCH 12/25] Scripts to build & deploy --- dfx.json | 10 +++++ scripts/0-all.sh | 7 +++ scripts/1-build.sh | 8 ++++ scripts/2-deploy-reinstall.sh | 77 ++++++++++++++++++++++++++++++++ scripts/2-deploy-upgrade.sh | 77 ++++++++++++++++++++++++++++++++ scripts/3-upload-model.sh | 82 ++++++++++++++++++++++++++++++++++ scripts/4-load-model.sh | 78 ++++++++++++++++++++++++++++++++ scripts/5-set-max-tokens.sh | 84 +++++++++++++++++++++++++++++++++++ 8 files changed, 423 insertions(+) create mode 100755 scripts/0-all.sh create mode 100755 scripts/1-build.sh create mode 100755 scripts/2-deploy-reinstall.sh create mode 100755 scripts/2-deploy-upgrade.sh create mode 100755 scripts/3-upload-model.sh create mode 100755 scripts/4-load-model.sh create mode 100755 scripts/5-set-max-tokens.sh diff --git a/dfx.json b/dfx.json index 03999f0..dddeeb0 100644 --- a/dfx.json +++ b/dfx.json @@ -5,6 +5,16 @@ "type": "custom", "candid": "src/llama_cpp.did", "wasm": "build/llama_cpp.wasm" + }, + "llm_0": { + "type": "custom", + "candid": "src/llama_cpp.did", + "wasm": "build/llama_cpp.wasm" + }, + "llm_1": { + "type": "custom", + "candid": "src/llama_cpp.did", + "wasm": "build/llama_cpp.wasm" } }, "defaults": { diff --git a/scripts/0-all.sh b/scripts/0-all.sh new file mode 100755 index 0000000..76cb641 --- /dev/null +++ b/scripts/0-all.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +scripts/1-build.sh +scripts/2-deploy-reinstall.sh +scripts/3-upload-model.sh +scripts/4-load-model.sh +scripts/5-set-max-tokens.sh \ No newline at end of file diff --git a/scripts/1-build.sh b/scripts/1-build.sh new file mode 100755 index 0000000..00b377c --- /dev/null +++ b/scripts/1-build.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +echo " " +echo "--------------------------------------------------" +echo "Building the wasm for llama_cpp_canister" +make build-info-cpp-wasm +# icpp build-wasm +icpp build-wasm --to-compile mine-no-lib \ No newline at end of file diff --git a/scripts/2-deploy-reinstall.sh b/scripts/2-deploy-reinstall.sh new file 
mode 100755 index 0000000..926062f --- /dev/null +++ b/scripts/2-deploy-reinstall.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/deploy-reinstall.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +NUM_LLMS_DEPLOYED=1 + +# When deploying to IC, we deploy to a specific subnet +# none will not use subnet parameter in deploy to ic +SUBNET="none" +# SUBNET="-------" + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." + exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +####################################################################### +echo " " +echo "===================================================" +echo "Deploying $NUM_LLMS_DEPLOYED llms to subnet $SUBNET" +llm_id_start=0 +llm_id_end=$((NUM_LLMS_DEPLOYED - 1)) + +for i in $(seq $llm_id_start $llm_id_end) +do + echo "--------------------------------------------------" + echo "Deploying the wasm to llm_$i" + if [ "$NETWORK_TYPE" = "ic" ]; then + if [ "$SUBNET" = "none" ]; then + yes | dfx deploy llm_$i -m reinstall --yes --network $NETWORK_TYPE + else + yes | dfx deploy llm_$i -m reinstall --yes --network $NETWORK_TYPE --subnet $SUBNET + fi + else + yes | dfx deploy llm_$i -m reinstall --yes --network $NETWORK_TYPE + fi + + echo " " + echo "--------------------------------------------------" + echo "Checking health endpoint for llm_$i" + output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) + + if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then + echo "llm_$i health check failed. Exiting." + echo $output + echo "****************************************************************" + echo "llm_$i health check failed. Exiting." + echo "****************************************************************" + exit 1 + else + echo "llm_$i health check succeeded." + echo πŸŽ‰ + fi +done \ No newline at end of file diff --git a/scripts/2-deploy-upgrade.sh b/scripts/2-deploy-upgrade.sh new file mode 100755 index 0000000..9c3aec0 --- /dev/null +++ b/scripts/2-deploy-upgrade.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/deploy-upgrade.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +NUM_LLMS_DEPLOYED=1 + +# When deploying to IC, we deploy to a specific subnet +# none will not use subnet parameter in deploy to ic +SUBNET="none" +# SUBNET="-------" + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." 
+ exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +####################################################################### +echo " " +echo "===================================================" +echo "Deploying $NUM_LLMS_DEPLOYED llms to subnet $SUBNET" +llm_id_start=0 +llm_id_end=$((NUM_LLMS_DEPLOYED - 1)) + +for i in $(seq $llm_id_start $llm_id_end) +do + echo "--------------------------------------------------" + echo "Deploying the wasm to llm_$i" + if [ "$NETWORK_TYPE" = "ic" ]; then + if [ "$SUBNET" = "none" ]; then + yes | dfx deploy llm_$i --yes --network $NETWORK_TYPE + else + yes | dfx deploy llm_$i --yes --network $NETWORK_TYPE --subnet $SUBNET + fi + else + yes | dfx deploy llm_$i --yes --network $NETWORK_TYPE + fi + + echo " " + echo "--------------------------------------------------" + echo "Checking health endpoint for llm_$i" + output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) + + if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then + echo "llm_$i health check failed. Exiting." + echo $output + echo "****************************************************************" + echo "llm_$i health check failed. Exiting." + echo "****************************************************************" + exit 1 + else + echo "llm_$i health check succeeded." + echo πŸŽ‰ + fi +done \ No newline at end of file diff --git a/scripts/3-upload-model.sh b/scripts/3-upload-model.sh new file mode 100755 index 0000000..1a46c0a --- /dev/null +++ b/scripts/3-upload-model.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +LLAMA_CPP_CANISTER_PATH="./" +export PYTHONPATH="${PYTHONPATH}:$(realpath $LLAMA_CPP_CANISTER_PATH)" + +####################################################################### +# run from parent folder as: +# scripts/upload-model.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +NUM_LLMS_DEPLOYED=1 + +# The gguf model file to upload (Relative to llama_cpp_canister folder) +MODEL="models/stories260Ktok512.gguf" +# MODEL="models/stories15Mtok4096.gguf" +# MODEL="models/tensorblock/SmolLM2-135M-Instruct-GGUF/SmolLM2-135M-Instruct-Q4_K_M.gguf" +# MODEL="models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf" + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." + exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +####################################################################### +echo " " +echo "===================================================" +echo "Uploading model for $NUM_LLMS_DEPLOYED llms" +llm_id_start=0 +llm_id_end=$((NUM_LLMS_DEPLOYED - 1)) + +for i in $(seq $llm_id_start $llm_id_end) +do + echo " " + echo "--------------------------------------------------" + echo "Checking health endpoint for llm_$i" + output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) + + if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then + echo "llm_$i health check failed. Exiting." 
+ echo $output + echo "****************************************************************" + echo "llm_$i health check failed. Exiting." + echo "****************************************************************" + exit 1 + else + echo "llm_$i health check succeeded." + fi + + echo " " + echo "--------------------------------------------------" + echo "Upload the model ($MODEL) to llm_$i" + python -m scripts.upload --network $NETWORK_TYPE --canister llm_$i --canister-filename models/model.gguf $MODEL + + if [ $? -ne 0 ]; then + echo "scripts.upload for llm_$i exited with an error. Exiting the bash script." + echo $? + echo "****************************************************************" + echo "scripts.upload for llm_$i exited with an error. Exiting the bash script." + echo "****************************************************************" + exit 1 + fi +done \ No newline at end of file diff --git a/scripts/4-load-model.sh b/scripts/4-load-model.sh new file mode 100755 index 0000000..3b65267 --- /dev/null +++ b/scripts/4-load-model.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/load-model.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +NUM_LLMS_DEPLOYED=1 + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." + exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +####################################################################### +echo " " +echo "===================================================" +echo "Loading model for $NUM_LLMS_DEPLOYED llms" +llm_id_start=0 +llm_id_end=$((NUM_LLMS_DEPLOYED - 1)) + +for i in $(seq $llm_id_start $llm_id_end) +do + echo " " + echo "--------------------------------------------------" + echo "Checking health endpoint for llm_$i" + output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) + + if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then + echo "llm_$i health check failed. Exiting." + echo $output + echo "****************************************************************" + echo "llm_$i health check failed. Exiting." + echo "****************************************************************" + exit 1 + else + echo "llm_$i health check succeeded." + fi + + echo " " + echo "--------------------------------------------------" + echo "Calling load_model for llm_$i" + output=$(dfx canister call llm_$i load_model \ + '(record { args = vec {"--model"; "models/model.gguf";} })' \ + --network "$NETWORK_TYPE") + + if ! echo "$output" | grep -q " Ok "; then + echo "llm_$i load_model failed. Exiting." + echo $output + echo "****************************************************************" + echo "llm_$i load_model failed. Exiting." + echo "****************************************************************" + exit 1 + else + echo "llm_$i load_model succeeded." 
+ echo πŸŽ‰ + fi +done \ No newline at end of file diff --git a/scripts/5-set-max-tokens.sh b/scripts/5-set-max-tokens.sh new file mode 100755 index 0000000..d85659c --- /dev/null +++ b/scripts/5-set-max-tokens.sh @@ -0,0 +1,84 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/load-model.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +NUM_LLMS_DEPLOYED=1 + +MAX_TOKENS=128 # stories260Ktok512.gguf +# MAX_TOKENS=60 # stories15Mtok4096.gguf +# MAX_TOKENS=25 # SmolLM2-135M-Instruct-Q4_K_M.gguf +# MAX_TOKENS=10 # qwen2.5-0.5b-instruct-q8_0.gguf + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." + exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +####################################################################### +echo " " +echo "===================================================" +echo "set_max_tokens to $MAX_TOKENS for $NUM_LLMS_DEPLOYED llms" +llm_id_start=0 +llm_id_end=$((NUM_LLMS_DEPLOYED - 1)) + +for i in $(seq $llm_id_start $llm_id_end) +do + echo " " + echo "--------------------------------------------------" + echo "Checking health endpoint for llm_$i" + output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) + + if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then + echo "llm_$i health check failed. Exiting." + echo $output + echo "****************************************************************" + echo "llm_$i health check failed. Exiting." + echo "****************************************************************" + exit 1 + else + echo "llm_$i health check succeeded." + fi + + echo " " + echo "--------------------------------------------------" + echo "Setting max tokens to ($MAX_TOKENS) for llm_$i" + output=$(dfx canister call llm_$i set_max_tokens \ + '(record { max_tokens_query = '"$MAX_TOKENS"' : nat64; max_tokens_update = '"$MAX_TOKENS"' : nat64 })' \ + --network "$NETWORK_TYPE") + + + if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then + echo "llm_$i set_max_tokens failed. Exiting." + echo $output + echo "****************************************************************" + echo "llm_$i set_max_tokens to $MAX_TOKENS failed. Exiting." + echo "****************************************************************" + exit 1 + else + echo "llm_$i set_max_tokens to $MAX_TOKENS succeeded." + echo πŸŽ‰ + fi +done \ No newline at end of file From a663315ed8c94e68f4630309b98f54b2386e1f51 Mon Sep 17 00:00:00 2001 From: icpp Date: Tue, 28 Jan 2025 13:29:36 -0500 Subject: [PATCH 13/25] tinystories is working in canister! 
--- README-contributors-guide.md | 28 ++++++++++++++----- scripts/2-deploy-reinstall.sh | 7 ++--- scripts/2-deploy-upgrade.sh | 5 +--- scripts/3-upload-model.sh | 15 ++++------ scripts/4-load-model.sh | 10 ++----- scripts/5-set-max-tokens.sh | 16 ++++------- scripts/6-a-test-new-chat.sh | 51 ++++++++++++++++++++++++++++++++++ scripts/6-b-test-run-update.sh | 38 +++++++++++++++++++++++++ scripts/6-c-test-run-update.sh | 38 +++++++++++++++++++++++++ 9 files changed, 163 insertions(+), 45 deletions(-) create mode 100755 scripts/6-a-test-new-chat.sh create mode 100755 scripts/6-b-test-run-update.sh create mode 100755 scripts/6-c-test-run-update.sh diff --git a/README-contributors-guide.md b/README-contributors-guide.md index 48d98af..eec4edc 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -243,6 +243,7 @@ meld llama_cpp_onicai_fork/src/llama-vocab.cpp llama_cpp_onicai_fork_ #### llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp -No updates needed for icpp-pro +- outcomment all uses of `getenv`: + ```C++ + // ICPP-PATCH-START + // const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG"); + // sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0; + sched->debug = 0; + // ICPP-PATCH-END + ``` #### llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp - outcomment all code related to threading @@ -423,13 +432,18 @@ No updates needed for icpp-pro - outcomment try-catch #### llama_cpp_onicai_fork/common/common.h -- Modify this: +- Modify these: ``` -// ICPP-PATCH-START -// We do NOT load a default model into the canister -// #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" -#define DEFAULT_MODEL_PATH "" -// ICPP-PATCH-END + // ICPP-PATCH-START + // bool use_mmap = true; // use mmap for faster loads + bool use_mmap = false; // not in a canister... + // ICPP-PATCH-END + + // ICPP-PATCH-START + // We do NOT load a default model into the canister + // #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" + #define DEFAULT_MODEL_PATH "" + // ICPP-PATCH-END ``` #### llama_cpp_onicai_fork/common/chat-template.hpp diff --git a/scripts/2-deploy-reinstall.sh b/scripts/2-deploy-reinstall.sh index 926062f..fe8105d 100755 --- a/scripts/2-deploy-reinstall.sh +++ b/scripts/2-deploy-reinstall.sh @@ -64,11 +64,8 @@ do output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then - echo "llm_$i health check failed. Exiting." - echo $output - echo "****************************************************************" - echo "llm_$i health check failed. Exiting." - echo "****************************************************************" + echo "llm_$i health check failed." + echo $output exit 1 else echo "llm_$i health check succeeded." diff --git a/scripts/2-deploy-upgrade.sh b/scripts/2-deploy-upgrade.sh index 9c3aec0..04e5297 100755 --- a/scripts/2-deploy-upgrade.sh +++ b/scripts/2-deploy-upgrade.sh @@ -64,11 +64,8 @@ do output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then - echo "llm_$i health check failed. Exiting." + echo "llm_$i health check failed." echo $output - echo "****************************************************************" - echo "llm_$i health check failed. Exiting." - echo "****************************************************************" exit 1 else echo "llm_$i health check succeeded." 
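Taken together, the numbered scripts touched above (plus the `6-*` test scripts added later in this patch) give a scripted round-trip. A typical local sequence, assuming a local replica is already running (e.g. `dfx start --clean --background`) and the chosen GGUF file exists under `./models`, would be:
```
scripts/1-build.sh
scripts/2-deploy-reinstall.sh --network local
scripts/3-upload-model.sh --network local
scripts/4-load-model.sh --network local
scripts/5-set-max-tokens.sh --network local
scripts/6-a-test-new-chat.sh --network local
```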
diff --git a/scripts/3-upload-model.sh b/scripts/3-upload-model.sh index 1a46c0a..5787ab7 100755 --- a/scripts/3-upload-model.sh +++ b/scripts/3-upload-model.sh @@ -13,10 +13,11 @@ NETWORK_TYPE="local" NUM_LLMS_DEPLOYED=1 # The gguf model file to upload (Relative to llama_cpp_canister folder) -MODEL="models/stories260Ktok512.gguf" +# MODEL="models/stories260Ktok512.gguf" # MODEL="models/stories15Mtok4096.gguf" # MODEL="models/tensorblock/SmolLM2-135M-Instruct-GGUF/SmolLM2-135M-Instruct-Q4_K_M.gguf" -# MODEL="models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf" +MODEL="models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf" +# MODEL="models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q2_K.gguf" # Parse command line arguments for network type while [ $# -gt 0 ]; do @@ -56,11 +57,8 @@ do output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then - echo "llm_$i health check failed. Exiting." + echo "llm_$i health check failed." echo $output - echo "****************************************************************" - echo "llm_$i health check failed. Exiting." - echo "****************************************************************" exit 1 else echo "llm_$i health check succeeded." @@ -72,11 +70,8 @@ do python -m scripts.upload --network $NETWORK_TYPE --canister llm_$i --canister-filename models/model.gguf $MODEL if [ $? -ne 0 ]; then - echo "scripts.upload for llm_$i exited with an error. Exiting the bash script." + echo "scripts.upload for llm_$i exited with an error." echo $? - echo "****************************************************************" - echo "scripts.upload for llm_$i exited with an error. Exiting the bash script." - echo "****************************************************************" exit 1 fi done \ No newline at end of file diff --git a/scripts/4-load-model.sh b/scripts/4-load-model.sh index 3b65267..67cd9d0 100755 --- a/scripts/4-load-model.sh +++ b/scripts/4-load-model.sh @@ -47,11 +47,8 @@ do output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then - echo "llm_$i health check failed. Exiting." + echo "llm_$i health check failed." echo $output - echo "****************************************************************" - echo "llm_$i health check failed. Exiting." - echo "****************************************************************" exit 1 else echo "llm_$i health check succeeded." @@ -65,11 +62,8 @@ do --network "$NETWORK_TYPE") if ! echo "$output" | grep -q " Ok "; then - echo "llm_$i load_model failed. Exiting." + echo "llm_$i load_model failed." echo $output - echo "****************************************************************" - echo "llm_$i load_model failed. Exiting." - echo "****************************************************************" exit 1 else echo "llm_$i load_model succeeded." 
diff --git a/scripts/5-set-max-tokens.sh b/scripts/5-set-max-tokens.sh index d85659c..f3ee2e1 100755 --- a/scripts/5-set-max-tokens.sh +++ b/scripts/5-set-max-tokens.sh @@ -9,10 +9,10 @@ NETWORK_TYPE="local" NUM_LLMS_DEPLOYED=1 -MAX_TOKENS=128 # stories260Ktok512.gguf +# MAX_TOKENS=128 # stories260Ktok512.gguf # MAX_TOKENS=60 # stories15Mtok4096.gguf -# MAX_TOKENS=25 # SmolLM2-135M-Instruct-Q4_K_M.gguf -# MAX_TOKENS=10 # qwen2.5-0.5b-instruct-q8_0.gguf +# MAX_TOKENS=20 # SmolLM2-135M-Instruct-Q4_K_M.gguf +MAX_TOKENS=10 # qwen2.5-0.5b-instruct-q8_0.gguf # Parse command line arguments for network type while [ $# -gt 0 ]; do @@ -52,11 +52,8 @@ do output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then - echo "llm_$i health check failed. Exiting." + echo "llm_$i health check failed" echo $output - echo "****************************************************************" - echo "llm_$i health check failed. Exiting." - echo "****************************************************************" exit 1 else echo "llm_$i health check succeeded." @@ -71,11 +68,8 @@ do if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then - echo "llm_$i set_max_tokens failed. Exiting." + echo "llm_$i set_max_tokens failed." echo $output - echo "****************************************************************" - echo "llm_$i set_max_tokens to $MAX_TOKENS failed. Exiting." - echo "****************************************************************" exit 1 else echo "llm_$i set_max_tokens to $MAX_TOKENS succeeded." diff --git a/scripts/6-a-test-new-chat.sh b/scripts/6-a-test-new-chat.sh new file mode 100755 index 0000000..0be92dc --- /dev/null +++ b/scripts/6-a-test-new-chat.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/test.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +i=0 # llm_$i will be tested + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." + exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +echo " " +echo "--------------------------------------------------" +echo "Checking health endpoint for llm_$i" +output=$(dfx canister call llm_$i health --network $NETWORK_TYPE ) + +if [ "$output" != "(variant { Ok = record { status_code = 200 : nat16 } })" ]; then + echo "llm_$i health check failed." + echo $output + exit 1 +else + echo "llm_$i health check succeeded." 
+fi + +echo " " +echo "--------------------------------------------------" +echo "Calling new_chat for llm_$i" +dfx canister call llm_$i new_chat '(record { args = vec {"--prompt-cache"; "prompt.cache"} })' --network $NETWORK_TYPE \ No newline at end of file diff --git a/scripts/6-b-test-run-update.sh b/scripts/6-b-test-run-update.sh new file mode 100755 index 0000000..a0ca4a5 --- /dev/null +++ b/scripts/6-b-test-run-update.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/test.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +i=0 # llm_$i will be tested + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." + exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +echo " " +echo "--------------------------------------------------" +echo "Calling run_update for llm_$i" +dfx canister call llm_$i run_update '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nWhat is the difference between a chicken and a turkey.<|im_end|>\n<|im_start|>assistant\n"; "-n"; "512" } })' diff --git a/scripts/6-c-test-run-update.sh b/scripts/6-c-test-run-update.sh new file mode 100755 index 0000000..d6a1cd2 --- /dev/null +++ b/scripts/6-c-test-run-update.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/test.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +i=0 # llm_$i will be tested + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." 
+ exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +echo " " +echo "--------------------------------------------------" +echo "Calling run_update for llm_$i" +dfx canister call llm_$i run_update '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; ""; "-n"; "512" } })' From cc5a3266c0fb759dc80a08e88bf8cd7476751be8 Mon Sep 17 00:00:00 2001 From: icpp Date: Wed, 29 Jan 2025 15:58:58 -0500 Subject: [PATCH 14/25] Some small updates --- README-0001-b841d0.md | 110 ++++ README-0002-615212.md | 401 +++++++++++++ README-contributors-guide.md | 542 +----------------- README.md | 76 +-- dfx.json | 10 - dfx.multiple-llms.json | 21 + scripts/3-upload-model.sh | 5 +- scripts/4-load-model.sh | 2 +- scripts/5-set-max-tokens.sh | 3 +- .../{6-a-test-new-chat.sh => 6-new-chat.sh} | 0 scripts/7-deepseek-run-update-a.sh | 39 ++ scripts/7-deepseek-run-update-b.sh | 38 ++ ...t-run-update.sh => 7-qwen-run-update-a.sh} | 0 ...t-run-update.sh => 7-qwen-run-update-b.sh} | 0 scripts/prompt-design.ipynb | 21 +- test/test_canister_functions.py | 26 +- 16 files changed, 686 insertions(+), 608 deletions(-) create mode 100644 README-0001-b841d0.md create mode 100644 README-0002-615212.md create mode 100644 dfx.multiple-llms.json rename scripts/{6-a-test-new-chat.sh => 6-new-chat.sh} (100%) create mode 100755 scripts/7-deepseek-run-update-a.sh create mode 100755 scripts/7-deepseek-run-update-b.sh rename scripts/{6-b-test-run-update.sh => 7-qwen-run-update-a.sh} (100%) rename scripts/{6-c-test-run-update.sh => 7-qwen-run-update-b.sh} (100%) diff --git a/README-0001-b841d0.md b/README-0001-b841d0.md new file mode 100644 index 0000000..4174002 --- /dev/null +++ b/README-0001-b841d0.md @@ -0,0 +1,110 @@ +# DETAILS FOR UPGRADE from llama.cpp sha `615212` to `b841d0` + +### cpp_paths + +#### main_.cpp +`meld main_.cpp llama_cpp_onicai_fork/examples/main/main.cpp` +- use `main_` instead of `main` +- A few items related to console & ctrl+C need to be outcommented + + +#### llama_cpp_onicai_fork/src/llama.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error(format` with `IC_API::trap(std::string("RUNTIME ERROR: ") + format` +- replace `throw` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. +- outcomment threading related items: + - `#include ` + - `#include ` + - `#include ` +- outcomment these functions completely: + - `llama_tensor_quantize_internal` + - `llama_model_quantize_internal` + + +#### llama_cpp_onicai_fork/src/llama-vocab.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error(format` with `IC_API::trap(std::string("RUNTIME ERROR: ") + format` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. 
+- add a check on `llama_token_bos(model)`, else the llama2.c models never stop generating: + ``` + bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { + return token != -1 && ( + token == llama_token_eos_impl(vocab) || + token == llama_token_eot_impl(vocab) || + token == llama_token_bos_impl(vocab) // ICPP-PATCH: the llama2.c model predicts bos without first predicting an eos + ); + } + ``` + +#### llama_cpp_onicai_fork/src/llama-grammar.cpp +No changes needed + +#### llama_cpp_onicai_fork/src/llama-sampling.cpp +No changes needed + +#### llama_cpp_onicai_fork/src/unicode-data.cpp +- no modifications needed for the IC + +#### llama_cpp_onicai_fork/src/unicode.cpp +- add `#include "ic_api.h"` +- replace `throw` with `IC_API::trap` + +#### llama_cpp_onicai_fork/common/json-schema-to-grammar.cpp +- add `#include "ic_api.h"` +- replace `throw` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + + +#### llama_cpp_onicai_fork/common/build-info.cpp +- run this command to create it: +``` +make build-info-cpp-wasm +``` + +#### llama_cpp_onicai_fork/common/grammar-parser.cpp +- add `#include "ic_api.h"` +- replace `throw` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/common/sampling.cpp +- add `#include "ic_api.h"` +- replace `throw` with `IC_API::trap` + +#### llama_cpp_onicai_fork/common/common.cpp +- add `#include "ic_api.h"` +- replace `throw` with `IC_API::trap` +- outcomment all code related to `` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. +- outcomment `std::getenv` + + +--- +### c_paths + +#### llama_cpp_onicai_fork/ggml/src/ggml.c +- outcomment all code related to signals + - `#include ` +- Many threading outcomments. + +#### llama_cpp_onicai_fork/ggml/src/ggml-alloc.c +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/ggml-backend.c +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/ggml-quants.c +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/ggml-aarch64.c +No updates needed for icpp-pro + +--- +### headers to modify + +#### llama_cpp_onicai_fork/common/log.h +- `#include ` +- Some other threading code + +#### llama_cpp_onicai_fork/common/common.h +- `#include ` \ No newline at end of file diff --git a/README-0002-615212.md b/README-0002-615212.md new file mode 100644 index 0000000..ba98c79 --- /dev/null +++ b/README-0002-615212.md @@ -0,0 +1,401 @@ +# DETAILS FOR UPGRADE from llama.cpp sha `b841d0` to `615212` + +### cpp_paths + +#### main_.cpp + +```bash +# from folder: llama_cpp_canister/src + +# To do the actual changes +meld main_.cpp llama_cpp_onicai_fork/examples/main/main.cpp + +# To check what has changed between and +meld llama_cpp_onicai_fork/examples/main/main.cpp llama_cpp_onicai_fork_/examples/main/main.cpp +``` +- use `main_` instead of `main` +- A few items related to console, ctrl+C & threading need to be outcommented +- Added logic for running in a canister with multiple update calls + + +#### llama_cpp_onicai_fork/src/llama.cpp +```bash +# from folder: llama_cpp_canister/src +# To do the actual changes +meld llama_cpp_onicai_fork/src/llama.cpp llama_cpp_onicai_fork_/src/llama.cpp +``` +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. 
+- outcomment threading related items +- outcomment these functions completely: + - `llama_tensor_quantize_internal` + - `llama_model_quantize_internal` + + +#### llama_cpp_onicai_fork/src/llama-vocab.cpp +```bash +# from folder: llama_cpp_canister/src +meld llama_cpp_onicai_fork/src/llama-vocab.cpp llama_cpp_onicai_fork_/src/llama-vocab.cpp +``` +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/src/llama-grammar.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/src/llama-sampling.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` + +#### llama_cpp_onicai_fork/src/llama-impl.cpp +- no modifications needed for the IC + +#### src/llama_cpp_onicai_fork/src/llama-context.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` + +#### src/llama_cpp_onicai_fork/src/llama-arch.cpp +- no modifications needed for the IC + +#### llama_cpp_onicai_fork/src/unicode-data.cpp +- no modifications needed for the IC + +#### llama_cpp_onicai_fork/src/unicode.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- replace `throw std::invalid_argument` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/src/llama-kv-cache.cpp +- no modifications needed for the IC + +#### llama_cpp_onicai_fork/src/llama-chat.cpp +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/src/llama-mmap.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` + +#### llama_cpp_onicai_fork/src/llama-model.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/src/llama-batch.cpp +- no modifications needed for the IC + +#### llama_cpp_onicai_fork/src/llama-adapter.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/src/llama-model-loader.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment all uses of `validation_result`: + ```C++ + // ICPP-PATCH-START + // we do not support check_tensors. It requires threading. + // std::vector>> validation_result; + // ICPP-PATCH-END + ... several other references to validation_result + ``` +- outcomment all uses of `getenv` + +#### llama_cpp_onicai_fork/src/llama-hparams.cpp +- no modifications needed for the IC + +#### llama_cpp_onicai_fork/common/arg.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- replace `throw std::invalid_argument` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. +- outcomment args that require `std::thread` +- outcomment call to `ggml_backend_load_all();` + We are not loading the dynamic backends, because it is calling dlopen which results in + undefined symbols during linking. 
+ We can skip it, because we already registered the CPU backend as a compile flag. +- outcomment all calls to std::getenv + +#### llama_cpp_onicai_fork/common/json-schema-to-grammar.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- replace `throw std::out_of_range` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/common/build-info.cpp +- run this command to create it: +``` +make build-info-cpp-wasm +``` + +#### llama_cpp_onicai_fork/common/sampling.cpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` + +#### llama_cpp_onicai_fork/common/common.cpp +- add right below `#include llama.h`: + ```C++ + // ICPP-PATCH-START + #include "ic_api.h" + extern llama_model ** g_model; // The global variable from main_.cpp + // ICPP-PATCH-END + ``` +- In common_init_result, skip loading the model if the --model parameter is not provided: + ```C++ + // ICPP-PATCH-START + // Skip loading the model if the --model parameter is not provided + if (!params.model.empty()) { + // ICPP-PATCH-END + + ... + model = ... + ... + + // ICPP-PATCH-START + // Skip loading the model if the --model parameter is not provided + } else { + // Access the model through g_model and assign it to the local variable + model = *g_model; + } + // ICPP-PATCH-END + ``` +- In common_init_result, do NOT transfer ownership of the model pointer: + ```C++ + // ICPP-PATCH-START: + // 'reset' transfers ownership of the model pointer to the std::unique_ptr iparams.model + // We do NOT want the model to be freed when the unique_ptr goes out of scope + // iparams.model.reset(model); + // ICPP-PATCH-END + ``` +- replace `throw std::runtime_error` with `IC_API::trap` +- replace `throw std::invalid_argument` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. +- outcomment `std::getenv` + Compare to changes made last time (!) + +- outcomment all code related to ``: + Compare to changes made last time (!) + - cpu_get_num_physical_cores + +- outcomment #ifdef LLAMA_USE_CURL + Compare to changes made last time (!) + +- outcomment `set_process_priority` function + +#### llama_cpp_onicai_fork/common/log.cpp +- Remove all threading logic + #include + #include + +#### llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp +- outcomment all uses of `getenv`: + ```C++ + // ICPP-PATCH-START + // const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG"); + // sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0; + sched->debug = 0; + // ICPP-PATCH-END + ``` + +#### llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp +- outcomment all code related to threading + +#### llama_cpp_onicai_fork/ggml/src/ggml-backend-reg.cpp +- Update dl_handle_deleter, to avoid a call to dlclose that should never happen + The linker ends up with undefined if we don't outcomment it + ```C++ + #include "ic_api.h" + struct dl_handle_deleter { + void operator()(void * handle) { + // ICPP-PATCH-START + // We are NOT dynamically loading any backend + // SO WE SHOULD NEVER GET HERE + // Avoid linker error by outcommenting this, but inserting a runtime trap + // dlclose(handle); + IC_API::trap("THIS SHOULD NEVER HAPPEN - dl_handle_deleter::operator() called"); + // ICPP-PATCH-END + } + }; + ``` + +#### llama_cpp_onicai_fork/ggml/src/gguf.cpp +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. 
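The per-file notes above keep coming back to the same patterns: no active `throw`, no `try`/`catch`, no `std::thread`, and no `getenv` left in the patched translation units. An optional sanity check after a meld session is to grep the fork for leftovers; this is only an illustrative helper (assumed to be run from `llama_cpp_canister/src`), and matches inside `ICPP-PATCH` comment blocks are expected.

```bash
# Illustrative sanity check after patching; run from llama_cpp_canister/src.
# Hits inside ICPP-PATCH comment blocks are expected and can be ignored.
for pattern in "throw std::" "std::thread" "getenv(" "try {"; do
  echo "=== $pattern ==="
  grep -rn --include="*.cpp" --include="*.h" --include="*.hpp" "$pattern" \
    llama_cpp_onicai_fork/src llama_cpp_onicai_fork/common llama_cpp_onicai_fork/ggml/src \
    || echo "no matches"
done
```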
+ +#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.cpp +- outcomment all code related to signals & threading: + - `#include "ggml-threading.h"` + - `#include ` + +#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +No updates needed for icpp-pro + +--- +### c_paths + +#### llama_cpp_onicai_fork/ggml/src/ggml.c +- outcomment all code related to signals & threading + - `#include "ggml-threading.h"` + - `#include ` + + +#### llama_cpp_onicai_fork/ggml/src/ggml-alloc.c +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/ggml-quants.c +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.c +No updates needed for icpp-pro + +#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-quants.c +No updates needed for icpp-pro + +--- +### headers to modify + +#### llama_cpp_onicai_fork/src/llama-model-loader.h +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` + +#### llama_cpp_onicai_fork/src/minja.hpp +- add `#include "ic_api.h"` +- replace `throw std::runtime_error` with `IC_API::trap` +- re-define two functions: + ```C++ + // ICPP-PATCH-START + // throw not supported, using IC_API::trap instead, which expects a string + // std::runtime_error unexpected(const TemplateToken & token) const { + // return std::runtime_error("Unexpected " + TemplateToken::typeToString(token.type) + // + error_location_suffix(*template_str, token.location.pos)); + // } + // std::runtime_error unterminated(const TemplateToken & token) const { + // return std::runtime_error("Unterminated " + TemplateToken::typeToString(token.type) + // + error_location_suffix(*template_str, token.location.pos)); + // } + std::string unexpected(const TemplateToken & token) const { + return ("Unexpected " + TemplateToken::typeToString(token.type) + + error_location_suffix(*template_str, token.location.pos)); + } + std::string unterminated(const TemplateToken & token) const { + return ("Unterminated " + TemplateToken::typeToString(token.type) + + error_location_suffix(*template_str, token.location.pos)); + } + // ICPP-PATCH-END + ``` +- replace `throw unterminated(**start)` with `IC_API::trap(unterminated(**start))` +- replace `throw unexpected(**(it-1))` with `IC_API::trap(unexpected(**(it-1)))` +- replace `throw unexpected(**(it))` with `IC_API::trap(unexpected(**(it)))` +- outcomment try-catch + +#### llama_cpp_onicai_fork/common/common.h +- Modify these: +``` + // ICPP-PATCH-START + // bool use_mmap = true; // use mmap for faster loads + bool use_mmap = false; // not in a canister... + // ICPP-PATCH-END + + // ICPP-PATCH-START + // We do NOT load a default model into the canister + // #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" + #define DEFAULT_MODEL_PATH "" + // ICPP-PATCH-END +``` + +#### llama_cpp_onicai_fork/common/chat-template.hpp +- replace `throw std::runtime_error` with `IC_API::trap` +- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. + +#### llama_cpp_onicai_fork/ggml/include/ggml.h +- #define GGML_DEFAULT_N_THREADS 1 + +------------ +TODO: search in code files for: TODO-615212 + +(-) main_.cpp has a new static `global g_smpl`: + static common_sampler ** g_smpl; + + Q: Does this need to become a global variable, accessible from common.cpp ? + Like we did for g_model ? 
+ + In `common/common.cpp` we added: + ``` + // ICPP-PATCH-START + #include "ic_api.h" + extern llama_model ** g_model; // The global variable from main_.cpp + // ICPP-PATCH-END + ``` + +(-) main_.cpp renamed type for `g_params`: + from: static gpt_params * g_params; + to : static common_params * g_params; + + Q: Does this need to become a global variable, accessible from common.cpp ? + Like we did for g_model ? + +(-) main_.cpp line 142: common_sampler * smpl = nullptr; + + Q: Does `smpl` need to become a static variable, like `model` & `ctx` ? + +(-) main_.cpp line 147: // Don't give error if embd_inp = session_tokens. All is OK to just keep going + + Q: Is this logic for prompt_remaining still valid? + +(-) LOG & LOG_TEE have been replaced by LOG, LOG_ERR, LOG_WRN, LOG_INF, LOG_CNT + -> LOG is used just for Console/Stream Output + -> LOG_xxx is used for ERR, WRN, INF, CNT --> Not sure yet where this goes... + + Q1: Did we change anything to LOG & LOG_TEE to get it to work ? + Q2: Are we still using LOG & LOG_TEE ourselvs? If so, replace it. + Q3: Can we remove the LOG & LOG_TEE + Q4: Do we need to update the README about downloading different LOG files? + +(-) llama-vocab.cpp --- This function is no longer there. Is tinystories still working? + + We had added a check on `llama_token_bos(model)`, else the llama2.c models never stop generating: + ``` + bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { + return token != -1 && ( + token == llama_token_eos_impl(vocab) || + token == llama_token_eot_impl(vocab) || + token == llama_token_bos_impl(vocab) // ICPP-PATCH: the llama2.c model predicts bos without first predicting an eos + ); + } + ``` + +(-) TODO: `llama_cpp_onicai_fork/common/log.cpp` step through the logic + - Remove the pause() function + - Remove the cur.is_end function ? + +(-) TODO: Monitor memory, and make sure that ctx is freed up... + See free_ctx() method that has been outcommented in main_.cpp + +---------------------------------------------------------- +NOTES: + +(-) main_.cpp includes a new file: `llama_cpp_onicai_fork/common/chat-template.hpp` + +(-) All the LLM architectures supported by llama_cpp_canister are listed in + `src/llama_cpp_onicai_fork/src/llama-arch.cpp` + +(-) NOTE: `common/grammar-parser.cpp` is no longer there. + It appears to be fully included in `src/llama-grammar.cpp` + +(-) NOTE: `llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp` used to be `llama_cpp_onicai_fork/ggml/src/ggml-backend.c` + +(-) NOTE: `llama_cpp_onicai_fork/ggml/src/ggml-aarch64.c` no longer exists + Previous update: No updates needed for icpp-pro + +(-) NOTE: `llama_cpp_onicai_fork/common/log.h` no update was needed this time: + Previous update: + - `#include ` + - Some other threading code + +(-) NOTE: `llama_cpp_onicai_fork/common/common.h` no update was needed this time: + Previous update: + - `#include ` \ No newline at end of file diff --git a/README-contributors-guide.md b/README-contributors-guide.md index eec4edc..16415b0 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -4,75 +4,6 @@ Follow steps of [llama_cpp_canister/README/Getting Started](https://github.com/onicai/llama_cpp_canister/blob/main/README.md#getting-started) -# VS Code debugger - -## lldb-mi hangs - -On the Mac, there is an issue with lldb-mi: https://github.com/microsoft/vscode-cpptools/issues/7240 - -Upon stopping at a breakpoint in a new module, lldb-mi will try to load all local variables, and it goes into an endless loop. 
- -The solution is to hide the VARIABLES section in the debug window, and rely on the WATCH section instead. - -# How to run & debug original llama.cpp - -- Clone ggerganov/llama.cpp (Do NOT initialize submodules...) - ``` - # Clone it as a sibling repo of llama_cpp_canister - git clone https://github.com/ggerganov/llama.cpp.git - ``` -- Checkout the proper commit used as root of the onicai branch in llama_cpp_onicai_fork - ``` - git checkout b841d0 - ``` -- Build with these commands: - ``` - make clean - make LLAMA_DEBUG=1 llama-cli - ``` -- Run with Notebook - - File: scripts/prompt-design.ipynb - -- Run with this command: - ``` - ./llama-cli -m ../llama_cpp_canister/models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf --prompt-cache prompt.cache --prompt-cache-all -sp -p "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n" -n 512 -fa -ngl 80 - ``` - In above command, the `-fa -ngl 80` arguments are useful only on GPU. We do not use them when calling the IC, because - the canister has a CPU only. - -- Debug using this `.vscode/launch.json` - ```json - { - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "type": "lldb", - "request": "launch", - "name": "llama-cli", - "program": "${workspaceFolder}/llama-cli", - "cwd": "${workspaceFolder}", - "args": [ - "-m", - "/llama_cpp_canister_models/stories260Ktok512.gguf", - "--samplers", - "top_p", - "--temp", - "0.1", - "--top-p", - "0.9", - "-n", - "600", - "-p", - "Joe loves writing stories" - ] - } - ] - } - ``` # How to upgrade llama.cpp ## Sync fork @@ -95,18 +26,20 @@ git fetch upstream --tags # after this, the tags will appear in GitHub git push origin --tags - - ``` ## llama_cpp_onicai_fork: setup a local branch Take following steps locally: - git fetch -- This is the git-sha of the llama.cpp versions we branched from: - - `615212` (git-sha-new) , with release-tag `b4532` - - `b841d0` (git-sha-old) , no release-tag - - `5cdb37` (git-sha-older), no release-tag +- These are the git-sha values of the llama.cpp versions we branched from: + + | upgrade # | llama.cpp sha | llama.cpp release-tag | + | --------- | ------------- | --------------------- | + | 0000 | 5cdb37 | - | + | 0001 | b841d0 | - | + | 0002 | 615212 | b4532 | + - Start with a fresh clone of llama_cpp_onicai_fork: ```bash @@ -139,332 +72,15 @@ We use `meld` for comparing the files: brew install --cask dehesselle-meld ``` -### cpp_paths - -#### main_.cpp - -```bash -# from folder: llama_cpp_canister/src - -# To do the actual changes -meld main_.cpp llama_cpp_onicai_fork/examples/main/main.cpp - -# To check what has changed between and -meld llama_cpp_onicai_fork/examples/main/main.cpp llama_cpp_onicai_fork_/examples/main/main.cpp -``` -- use `main_` instead of `main` -- A few items related to console, ctrl+C & threading need to be outcommented -- Added logic for running in a canister with multiple update calls - - -#### llama_cpp_onicai_fork/src/llama.cpp -```bash -# from folder: llama_cpp_canister/src -# To do the actual changes -meld llama_cpp_onicai_fork/src/llama.cpp llama_cpp_onicai_fork_/src/llama.cpp -``` -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- outcomment `try - catch`. 
The program will abrupt in case of thrown exceptions. -- outcomment threading related items -- outcomment these functions completely: - - `llama_tensor_quantize_internal` - - `llama_model_quantize_internal` - - -#### llama_cpp_onicai_fork/src/llama-vocab.cpp -```bash -# from folder: llama_cpp_canister/src -meld llama_cpp_onicai_fork/src/llama-vocab.cpp llama_cpp_onicai_fork_/src/llama-vocab.cpp -``` -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - -#### llama_cpp_onicai_fork/src/llama-grammar.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - -#### llama_cpp_onicai_fork/src/llama-sampling.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` - -#### llama_cpp_onicai_fork/src/llama-impl.cpp -- no modifications needed for the IC - -#### src/llama_cpp_onicai_fork/src/llama-context.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` - -#### src/llama_cpp_onicai_fork/src/llama-arch.cpp -- no modifications needed for the IC - -#### llama_cpp_onicai_fork/src/unicode-data.cpp -- no modifications needed for the IC - -#### llama_cpp_onicai_fork/src/unicode.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- replace `throw std::invalid_argument` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - -#### llama_cpp_onicai_fork/src/llama-kv-cache.cpp -- no modifications needed for the IC - -#### llama_cpp_onicai_fork/src/llama-chat.cpp -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - -#### llama_cpp_onicai_fork/src/llama-mmap.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` - -#### llama_cpp_onicai_fork/src/llama-model.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - -#### llama_cpp_onicai_fork/src/llama-batch.cpp -- no modifications needed for the IC - -#### llama_cpp_onicai_fork/src/llama-adapter.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - -#### llama_cpp_onicai_fork/src/llama-model-loader.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- outcomment all uses of `validation_result`: - ```C++ - // ICPP-PATCH-START - // we do not support check_tensors. It requires threading. - // std::vector>> validation_result; - // ICPP-PATCH-END - ... several other references to validation_result - ``` -- outcomment all uses of `getenv` - -#### llama_cpp_onicai_fork/src/llama-hparams.cpp -- no modifications needed for the IC - -#### llama_cpp_onicai_fork/common/arg.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- replace `throw std::invalid_argument` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. -- outcomment args that require `std::thread` -- outcomment call to `ggml_backend_load_all();` - We are not loading the dynamic backends, because it is calling dlopen which results in - undefined symbols during linking. 
- We can skip it, because we already registered the CPU backend as a compile flag. -- outcomment all calls to std::getenv - -#### llama_cpp_onicai_fork/common/json-schema-to-grammar.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- replace `throw std::out_of_range` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. - -#### llama_cpp_onicai_fork/common/build-info.cpp -- run this command to create it: -``` -make build-info-cpp-wasm -``` - -#### llama_cpp_onicai_fork/common/sampling.cpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` - -#### llama_cpp_onicai_fork/common/common.cpp -- add right below `#include llama.h`: - ```C++ - // ICPP-PATCH-START - #include "ic_api.h" - extern llama_model ** g_model; // The global variable from main_.cpp - // ICPP-PATCH-END - ``` -- In common_init_result, skip loading the model if the --model parameter is not provided: - ```C++ - // ICPP-PATCH-START - // Skip loading the model if the --model parameter is not provided - if (!params.model.empty()) { - // ICPP-PATCH-END - - ... - model = ... - ... - - // ICPP-PATCH-START - // Skip loading the model if the --model parameter is not provided - } else { - // Access the model through g_model and assign it to the local variable - model = *g_model; - } - // ICPP-PATCH-END - ``` -- In common_init_result, do NOT transfer ownership of the model pointer: - ```C++ - // ICPP-PATCH-START: - // 'reset' transfers ownership of the model pointer to the std::unique_ptr iparams.model - // We do NOT want the model to be freed when the unique_ptr goes out of scope - // iparams.model.reset(model); - // ICPP-PATCH-END - ``` -- replace `throw std::runtime_error` with `IC_API::trap` -- replace `throw std::invalid_argument` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. -- outcomment `std::getenv` - Compare to changes made last time (!) - -- outcomment all code related to ``: - Compare to changes made last time (!) - - cpu_get_num_physical_cores - -- outcomment #ifdef LLAMA_USE_CURL - Compare to changes made last time (!) - -- outcomment `set_process_priority` function - -#### llama_cpp_onicai_fork/common/log.cpp -- Remove all threading logic - #include - #include - -#### llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp -- outcomment all uses of `getenv`: - ```C++ - // ICPP-PATCH-START - // const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG"); - // sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0; - sched->debug = 0; - // ICPP-PATCH-END - ``` - -#### llama_cpp_onicai_fork/ggml/src/ggml-threading.cpp -- outcomment all code related to threading - -#### llama_cpp_onicai_fork/ggml/src/ggml-backend-reg.cpp -- Update dl_handle_deleter, to avoid a call to dlclose that should never happen - The linker ends up with undefined if we don't outcomment it - ```C++ - #include "ic_api.h" - struct dl_handle_deleter { - void operator()(void * handle) { - // ICPP-PATCH-START - // We are NOT dynamically loading any backend - // SO WE SHOULD NEVER GET HERE - // Avoid linker error by outcommenting this, but inserting a runtime trap - // dlclose(handle); - IC_API::trap("THIS SHOULD NEVER HAPPEN - dl_handle_deleter::operator() called"); - // ICPP-PATCH-END - } - }; - ``` - -#### llama_cpp_onicai_fork/ggml/src/gguf.cpp -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. 
- -#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.cpp -- outcomment all code related to signals & threading: - - `#include "ggml-threading.h"` - - `#include ` - -#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-traits.cpp -No updates needed for icpp-pro - ---- -### c_paths - -#### llama_cpp_onicai_fork/ggml/src/ggml.c -- outcomment all code related to signals & threading - - `#include "ggml-threading.h"` - - `#include ` - - -#### llama_cpp_onicai_fork/ggml/src/ggml-alloc.c -No updates needed for icpp-pro - -#### llama_cpp_onicai_fork/ggml/src/ggml-quants.c -No updates needed for icpp-pro - -#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu.c -No updates needed for icpp-pro - -#### llama_cpp_onicai_fork/ggml/src/ggml-cpu/ggml-cpu-quants.c -No updates needed for icpp-pro - ---- -### headers to modify - -#### llama_cpp_onicai_fork/src/llama-model-loader.h -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` - -#### llama_cpp_onicai_fork/src/minja.hpp -- add `#include "ic_api.h"` -- replace `throw std::runtime_error` with `IC_API::trap` -- re-define two functions: - ```C++ - // ICPP-PATCH-START - // throw not supported, using IC_API::trap instead, which expects a string - // std::runtime_error unexpected(const TemplateToken & token) const { - // return std::runtime_error("Unexpected " + TemplateToken::typeToString(token.type) - // + error_location_suffix(*template_str, token.location.pos)); - // } - // std::runtime_error unterminated(const TemplateToken & token) const { - // return std::runtime_error("Unterminated " + TemplateToken::typeToString(token.type) - // + error_location_suffix(*template_str, token.location.pos)); - // } - std::string unexpected(const TemplateToken & token) const { - return ("Unexpected " + TemplateToken::typeToString(token.type) - + error_location_suffix(*template_str, token.location.pos)); - } - std::string unterminated(const TemplateToken & token) const { - return ("Unterminated " + TemplateToken::typeToString(token.type) - + error_location_suffix(*template_str, token.location.pos)); - } - // ICPP-PATCH-END - ``` -- replace `throw unterminated(**start)` with `IC_API::trap(unterminated(**start))` -- replace `throw unexpected(**(it-1))` with `IC_API::trap(unexpected(**(it-1)))` -- replace `throw unexpected(**(it))` with `IC_API::trap(unexpected(**(it)))` -- outcomment try-catch - -#### llama_cpp_onicai_fork/common/common.h -- Modify these: -``` - // ICPP-PATCH-START - // bool use_mmap = true; // use mmap for faster loads - bool use_mmap = false; // not in a canister... - // ICPP-PATCH-END - - // ICPP-PATCH-START - // We do NOT load a default model into the canister - // #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" - #define DEFAULT_MODEL_PATH "" - // ICPP-PATCH-END -``` - -#### llama_cpp_onicai_fork/common/chat-template.hpp -- replace `throw std::runtime_error` with `IC_API::trap` -- outcomment `try - catch`. The program will abrupt in case of thrown exceptions. +## Details for each upgrade -#### llama_cpp_onicai_fork/ggml/include/ggml.h -- #define GGML_DEFAULT_N_THREADS 1 +See the files: README--.md -## llama_cpp_onicai_fork: replace `onicai` branch +## Branch management -TODO: RETHINK THIS LOGIC... -(-) Perhaps it is better to keep all the `onicai-` branches -(-) And just change the default branch to `onicai-` +We need to rethink this logic, but for now it is ok... 
-That way: -(-) when someone clones, the are at the correct branch -(-) from the name, it is immediately clear what llama.cpp version was used -(-) we preserve the full history - ---- +### llama_cpp_onicai_fork Do NOT merge the `onicai-` branch into the `onicai` branch, but replace it: ``` @@ -474,134 +90,6 @@ git push origin onicai:onicai git push origin onicai-:onicai- ``` +## llama_cpp_canister ------------- -TODO: search in code files for: TODO-615212 - -(-) main_.cpp has a new static `global g_smpl`: - static common_sampler ** g_smpl; - - Q: Does this need to become a global variable, accessible from common.cpp ? - Like we did for g_model ? - - In `common/common.cpp` we added: - ``` - // ICPP-PATCH-START - #include "ic_api.h" - extern llama_model ** g_model; // The global variable from main_.cpp - // ICPP-PATCH-END - ``` - -(-) main_.cpp renamed type for `g_params`: - from: static gpt_params * g_params; - to : static common_params * g_params; - - Q: Does this need to become a global variable, accessible from common.cpp ? - Like we did for g_model ? - -(-) main_.cpp line 142: common_sampler * smpl = nullptr; - - Q: Does `smpl` need to become a static variable, like `model` & `ctx` ? - -(-) main_.cpp line 147: // Don't give error if embd_inp = session_tokens. All is OK to just keep going - - Q: Is this logic for prompt_remaining still valid? - -(-) main_.cpp line 208: // ICPP-TODO-START: This section is completely new... - COMPLETELY NEW SECTION FOR THREADPOOLs... - -(-) LOG & LOG_TEE have been replaced by LOG, LOG_ERR, LOG_WRN, LOG_INF, LOG_CNT - -> LOG is used just for Console/Stream Output - -> LOG_xxx is used for ERR, WRN, INF, CNT --> Not sure yet where this goes... - - Q1: Did we change anything to LOG & LOG_TEE to get it to work ? - Q2: Are we still using LOG & LOG_TEE ourselvs? If so, replace it. - Q3: Can we remove the LOG & LOG_TEE - Q4: Do we need to update the README about downloading different LOG files? - -(-) main_.cpp calls common_token_to_piece instead of llama_token_to_piece - - Q: Is this a new file: common_token_to_piece - A: No, it is in common.cpp - -(-) main_.cpp calls common_tokenize instead of llama_tokenize - - Q: Is this a new file: common_tokenize - A: No, it is in common.cpp - -(-) main_.cpp line 516, 826: New sampling subsystem ! - - Q: Are these new files: - - common_sampler_init - - common_sampler_sample - - common_sampler_accept - A: No, it is in sampling.cpp - -(-) main_.cpp line 1123: common_sampler_free(smpl) - - We had outcommented code to NOT free the ctx & model storage: - // Do NOT free ctx & model storage - // -> we made `ctx` & `model` data static, so they are maintained across calls to the LLM - // -> we do NOT reset g_ctx & g_model - // -> we moved this into a free_model function, which can be called by canister's load_model - // llama_free(ctx); - // llama_free_model(model); - - // TODO-615212 -- Make sure this is correct - // Do reset all other static memory - reset_static_memory(); - - Q1: Has this all moved into common_sampler_free ? - - Q2: Update usage of the free_model function? - - Q3: is reset_static_memory still correct ? - - Q4: Is llama_sampling_free(ctx_sampling) now handled by common_sampler_free(smpl) ? - - -(-) llama-vocab.cpp --- This function is no longer there. Is tinystories still working? 
- - We had added a check on `llama_token_bos(model)`, else the llama2.c models never stop generating: - ``` - bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { - return token != -1 && ( - token == llama_token_eos_impl(vocab) || - token == llama_token_eot_impl(vocab) || - token == llama_token_bos_impl(vocab) // ICPP-PATCH: the llama2.c model predicts bos without first predicting an eos - ); - } - ``` - -(-) DEBUG: `llama_cpp_onicai_fork/common/log.cpp` step through the logic - - Remove the pause() function - - Remove the cur.is_end function ? - -(-) Monitor memory, and make sure that ctx is freed up... - See free_ctx() method that has been outcommented in main_.cpp - ----------------------------------------------------------- -NOTES: - -(-) main_.cpp includes a new file: `llama_cpp_onicai_fork/common/chat-template.hpp` - This is from Google, and a general chat_template, with tool calling !!! - -(-) All the LLM architectures supported by llama_cpp_canister are listed in - `src/llama_cpp_onicai_fork/src/llama-arch.cpp` - -(-) NOTE: `common/grammar-parser.cpp` is no longer there. - It appears to be fully included in `src/llama-grammar.cpp` - -(-) NOTE: `llama_cpp_onicai_fork/ggml/src/ggml-backend.cpp` used to be `llama_cpp_onicai_fork/ggml/src/ggml-backend.c` - -(-) NOTE: `llama_cpp_onicai_fork/ggml/src/ggml-aarch64.c` no longer exists - Previous update: No updates needed for icpp-pro - -(-) NOTE: `llama_cpp_onicai_fork/common/log.h` no update was needed this time: - Previous update: - - `#include ` - - Some other threading code - -(-) NOTE: `llama_cpp_onicai_fork/common/common.h` no update was needed this time: - Previous update: - - `#include ` \ No newline at end of file +Merge the `onicai-` branch into the `onicai` branch \ No newline at end of file diff --git a/README.md b/README.md index bcd0397..5a3935f 100644 --- a/README.md +++ b/README.md @@ -28,13 +28,21 @@ Please join our [OpenChat C++ community](https://oc.app/community/cklkv-3aaaa-aa # Capabilities πŸ”₯ -- You can deploy LLMs up to ~0.5B parameters. -- The full context window of the LLM is used. (128K tokens for the Qwen2.5 example below.) - +- Deploy any LLM available as a gguf file. +- Our largest so far is DeepSeek-R1 1.5B (See [X](https://x.com/onicaiHQ/status/1884339580851151089)). # Set up -WARNING: Currently, the canister can only be build on a `Mac` ! +The build of the wasm must be done on a `Mac` ! + +- Install dfx: + + ```bash + sh -ci "$(curl -fsSL https://internetcomputer.org/install.sh)" + + # Configure your shell + source "$HOME/.local/share/dfx/env" + ``` - Clone the repo and it's children: @@ -48,12 +56,6 @@ WARNING: Currently, the canister can only be build on a `Mac` ! git clone git@github.com:onicai/llama_cpp_onicai_fork.git ``` -- Create the file src/llama_cpp_onicai_fork/common/build-info.cpp - ``` - # from ./llama_cpp_canister folder - make build-info-cpp-wasm - ``` - - Create a Python environment with dependencies installed ❗❗❗ Use Python 3.11 ❗❗❗ @@ -70,26 +72,14 @@ WARNING: Currently, the canister can only be build on a `Mac` ! 
pip install -r requirements.txt ``` -- Install dfx: - - ```bash - sh -ci "$(curl -fsSL https://internetcomputer.org/install.sh)" - - # Configure your shell - source "$HOME/.local/share/dfx/env" - ``` - - Build & Deploy the canister `llama_cpp`: - Compile & link to WebAssembly (wasm): ```bash + make build-info-cpp-wasm icpp build-wasm ``` - Note: - - The first time you run this command, the tool-chain will be installed in ~/.icpp - - This can take a few minutes, depending on your internet speed and computer. + Note: The first time you run this command, the tool-chain will be installed in ~/.icpp - Start the local network: ```bash @@ -124,24 +114,22 @@ WARNING: Currently, the canister can only be build on a `Mac` ! ```bash python -m scripts.upload --network local --canister llama_cpp --canister-filename models/model.gguf models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf ``` - - - Only needed after a canister upgrade (`dfx deploy -m upgrade`), re-load the gguf file into Orthogonal Persisted (OP) working memory - - This step is already done by scripts.upload above, so you can skip it if you just ran that. - After a canister upgrade, the gguf file in the canister is still there, because it is persisted in - stable memory, but you need to load it into Orthogonal Persisted (working) memory, which is erased during a canister upgrade. + NOTE: In C++, files are stored in stable memory of the canister. + They will survive a code upgrade. + +- Load the gguf file into Orthogonal Persisted (OP) working memory - ```bash - dfx canister call llama_cpp load_model '(record { args = vec {"--model"; "models/model.gguf";} })' - ``` + ```bash + dfx canister call llama_cpp load_model '(record { args = vec {"--model"; "models/model.gguf";} })' + ``` - - Set the max_tokens for this model, to avoid it hits the IC's instruction limit - ``` - dfx canister call llama_cpp set_max_tokens '(record { max_tokens_query = 10 : nat64; max_tokens_update = 10 : nat64 })' +- Set the max_tokens for this model, to avoid it hits the IC's instruction limit + ``` + dfx canister call llama_cpp set_max_tokens '(record { max_tokens_query = 10 : nat64; max_tokens_update = 10 : nat64 })' - dfx canister call llama_cpp get_max_tokens - ``` + dfx canister call llama_cpp get_max_tokens + ``` - Chat with the LLM @@ -202,14 +190,13 @@ WARNING: Currently, the canister can only be build on a `Mac` ! ``` Note: The sequence of update calls to the canister is required because the Internet Computer has a limitation - on the number of computations it allows per call. At the moment, only 10 tokens can be generated per call. + on the number of instructions it allows per call. For this model, 10 tokens can be generated per update call. 
+ This sequence of update calls is equivalent to using the [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp) repo directly and running the `llama-cli` locally, with the command: ``` - ./llama-cli -m /models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf --prompt-cache prompt.cache --prompt-cache-all -sp -p "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n" -n 512 -fa -ngl 80 + /llama-cli -m /models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf --prompt-cache prompt.cache --prompt-cache-all -sp -p "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n" -n 512 ``` - In above command, the `-fa -ngl 80` arguments are useful only on GPU. We do not use them when calling the IC, because - the canister has a CPU only. - Retrieving saved chats @@ -220,9 +207,8 @@ WARNING: Currently, the canister can only be build on a `Mac` ! dfx canister call llama_cpp get_chats ``` - - -- You can download the `main.log` file from the canister with: +TODO-615212: there is no longer a main.log file? +- For debug purposes, you can download the `main.log` file from the canister with: ``` python -m scripts.download --network local --canister llama_cpp --local-filename main.log main.log ``` diff --git a/dfx.json b/dfx.json index dddeeb0..03999f0 100644 --- a/dfx.json +++ b/dfx.json @@ -5,16 +5,6 @@ "type": "custom", "candid": "src/llama_cpp.did", "wasm": "build/llama_cpp.wasm" - }, - "llm_0": { - "type": "custom", - "candid": "src/llama_cpp.did", - "wasm": "build/llama_cpp.wasm" - }, - "llm_1": { - "type": "custom", - "candid": "src/llama_cpp.did", - "wasm": "build/llama_cpp.wasm" } }, "defaults": { diff --git a/dfx.multiple-llms.json b/dfx.multiple-llms.json new file mode 100644 index 0000000..aab85df --- /dev/null +++ b/dfx.multiple-llms.json @@ -0,0 +1,21 @@ +{ + "version": 1, + "canisters": { + "llm_0": { + "type": "custom", + "candid": "src/llama_cpp.did", + "wasm": "build/llama_cpp.wasm" + }, + "llm_1": { + "type": "custom", + "candid": "src/llama_cpp.did", + "wasm": "build/llama_cpp.wasm" + } + }, + "defaults": { + "build": { + "args": "", + "packtool": "" + } + } +} \ No newline at end of file diff --git a/scripts/3-upload-model.sh b/scripts/3-upload-model.sh index 5787ab7..6a6dc25 100755 --- a/scripts/3-upload-model.sh +++ b/scripts/3-upload-model.sh @@ -16,8 +16,9 @@ NUM_LLMS_DEPLOYED=1 # MODEL="models/stories260Ktok512.gguf" # MODEL="models/stories15Mtok4096.gguf" # MODEL="models/tensorblock/SmolLM2-135M-Instruct-GGUF/SmolLM2-135M-Instruct-Q4_K_M.gguf" -MODEL="models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf" -# MODEL="models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q2_K.gguf" +# MODEL="models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf" +MODEL="models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q2_K.gguf" +# MODEL="models/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF/DeepSeek-R1-Distill-Qwen-7B-Q2_K.gguf" # Parse command line arguments for network type while [ $# -gt 0 ]; do diff --git a/scripts/4-load-model.sh b/scripts/4-load-model.sh index 67cd9d0..0514dde 100755 --- a/scripts/4-load-model.sh +++ b/scripts/4-load-model.sh @@ -58,7 +58,7 @@ do echo "--------------------------------------------------" echo "Calling load_model for llm_$i" output=$(dfx canister call 
llm_$i load_model \ - '(record { args = vec {"--model"; "models/model.gguf";} })' \ + '(record { args = vec {"--model"; "models/model.gguf"; "--no-warmup";} })' \ --network "$NETWORK_TYPE") if ! echo "$output" | grep -q " Ok "; then diff --git a/scripts/5-set-max-tokens.sh b/scripts/5-set-max-tokens.sh index f3ee2e1..b5fcc94 100755 --- a/scripts/5-set-max-tokens.sh +++ b/scripts/5-set-max-tokens.sh @@ -12,7 +12,8 @@ NUM_LLMS_DEPLOYED=1 # MAX_TOKENS=128 # stories260Ktok512.gguf # MAX_TOKENS=60 # stories15Mtok4096.gguf # MAX_TOKENS=20 # SmolLM2-135M-Instruct-Q4_K_M.gguf -MAX_TOKENS=10 # qwen2.5-0.5b-instruct-q8_0.gguf +# MAX_TOKENS=10 # qwen2.5-0.5b-instruct-q8_0.gguf +MAX_TOKENS=2 # DeepSeek-R1-Distill-Qwen-1.5B-Q2_K.gguf # Parse command line arguments for network type while [ $# -gt 0 ]; do diff --git a/scripts/6-a-test-new-chat.sh b/scripts/6-new-chat.sh similarity index 100% rename from scripts/6-a-test-new-chat.sh rename to scripts/6-new-chat.sh diff --git a/scripts/7-deepseek-run-update-a.sh b/scripts/7-deepseek-run-update-a.sh new file mode 100755 index 0000000..aee1a5b --- /dev/null +++ b/scripts/7-deepseek-run-update-a.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/test.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +i=0 # llm_$i will be tested + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." + exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +echo " " +echo "--------------------------------------------------" +echo "Calling run_update for llm_$i" +# See model card at : https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF +dfx canister call llm_$i run_update '(record { args = vec {"--cache-type-k"; "q8_0"; "--no-warmup"; "--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; "<|User|>What is 1+1?<|Assistant|>";} })' diff --git a/scripts/7-deepseek-run-update-b.sh b/scripts/7-deepseek-run-update-b.sh new file mode 100755 index 0000000..d50f47e --- /dev/null +++ b/scripts/7-deepseek-run-update-b.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +####################################################################### +# run from parent folder as: +# scripts/test.sh --network [local|ic] +####################################################################### + +# Default network type is local +NETWORK_TYPE="local" +i=0 # llm_$i will be tested + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic'." 
+ exit 1 + fi + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +echo " " +echo "--------------------------------------------------" +echo "Calling run_update for llm_$i" +dfx canister call llm_$i run_update '(record { args = vec {"--cache-type-k"; "q8_0"; "--no-warmup"; "--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; "";} })' diff --git a/scripts/6-b-test-run-update.sh b/scripts/7-qwen-run-update-a.sh similarity index 100% rename from scripts/6-b-test-run-update.sh rename to scripts/7-qwen-run-update-a.sh diff --git a/scripts/6-c-test-run-update.sh b/scripts/7-qwen-run-update-b.sh similarity index 100% rename from scripts/6-c-test-run-update.sh rename to scripts/7-qwen-run-update-b.sh diff --git a/scripts/prompt-design.ipynb b/scripts/prompt-design.ipynb index 44adc41..35c3e2d 100644 --- a/scripts/prompt-design.ipynb +++ b/scripts/prompt-design.ipynb @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -70,18 +70,19 @@ "outputs": [], "source": [ "# Define where the llama-cli is located, relative to this notebook\n", - "LLAMA_CLI_PATH = \"../../ggerganov_llama_b841d0.cpp/llama-cli\" # Current llama_cpp_canister version\n", - "# LLAMA_CLI_PATH = \"../../ggerganov_llama_latest.cpp/build/bin/llama-cli\"\n", + "# LLAMA_CLI_PATH = \"../../ggerganov_llama_b841d0.cpp/llama-cli\" # Current llama_cpp_canister version\n", + "LLAMA_CLI_PATH = \"../../ggerganov_llama_latest.cpp/build/bin/llama-cli\"\n", "\n", "# Select a model to use\n", - "MODEL = \"../models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf\"\n", + "# MODEL = \"../models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf\"\n", "# MODEL = \"../models/tensorblock/SmolLM2-135M-Instruct-GGUF/SmolLM2-135M-Instruct-Q8_0.gguf\"\n", "# MODEL = (\n", "# \"../models/tensorblock/SmolLM2-135M-Instruct-GGUF/SmolLM2-135M-Instruct-Q4_K_M.gguf\"\n", "# )\n", - "# MODEL = \"../models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q2_K.gguf\"\n", + "MODEL = \"../models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q2_K.gguf\"\n", "# MODEL = \"../models/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF/DeepSeek-R1-Distill-Qwen-7B-Q2_K.gguf\"\n", "\n", + "\n", "def run_llama_cpp(\n", " prompt,\n", " num_tokens,\n", @@ -101,9 +102,11 @@ " LLAMA_CLI_PATH,\n", " \"-m\",\n", " MODEL,\n", + " \"--no-warmup\",\n", + " \"-no-cnv\",\n", " # \"--simple-io\",\n", - " \"--no-display-prompt\", # only return the generated text, without special characters\n", - " # \"-sp\", # output special tokens\n", + " # \"--no-display-prompt\", # only return the generated text, without special characters\n", + " \"-sp\", # output special tokens\n", " \"-n\",\n", " f\"{num_tokens}\",\n", " \"--seed\",\n", @@ -133,7 +136,7 @@ " # Print the command on a single line for terminal use, preserving \\n\n", " print(\n", " \"\\nCommand:\\n\",\n", - " f\"{LLAMA_CLI_PATH} -m {MODEL} --no-display-prompt -n {num_tokens} --seed {seed} --temp {temp} -p '{prompt}'\".replace(\n", + " f\"{LLAMA_CLI_PATH} -m {MODEL} --no-warmup -no-cnv -sp -n {num_tokens} --seed {seed} --temp {temp} -p '{prompt}'\".replace(\n", " \"\\n\", \"\\\\n\"\n", " ),\n", " )\n", @@ -158,7 +161,7 @@ "# mirostat_lr = 0.1\n", "# mirostat_ent = 5.0\n", "\n", - "prompt = f\"<|im_start|>system\\nYou are a helpful 
assistant.<|im_end|>\\n<|im_start|>user\\ngive me a short introduction to LLMs.<|im_end|>\\n<|im_start|>assistant\\n\"\n", + "prompt = f\"<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n<|im_start|>user\\nWhat is the Proof-of-AI-Work Protocol?<|im_end|>\\n<|im_start|>assistant\\n\"\n", "response = run_llama_cpp(\n", " prompt,\n", " num_tokens,\n", diff --git a/test/test_canister_functions.py b/test/test_canister_functions.py index 23c6eee..009e7e7 100644 --- a/test/test_canister_functions.py +++ b/test/test_canister_functions.py @@ -60,46 +60,46 @@ def test__get_access_err(identity_anonymous: Dict[str, str], network: str) -> No expected_response = '(variant { Err = variant { Other = "Access Denied" } })' assert response == expected_response -def test__set_access_0(network: str) -> None: +def test__set_access_1(network: str) -> None: response = call_canister_api( dfx_json_path=DFX_JSON_PATH, canister_name=CANISTER_NAME, canister_method="set_access", - canister_argument='(record { level = 0 : nat16 })', + canister_argument='(record { level = 1 : nat16 })', network=network, ) - expected_response = '(variant { Ok = record { explanation = "Only controllers"; level = 0 : nat16;} })' + expected_response = '(variant { Ok = record { explanation = "All except anonymous"; level = 1 : nat16;} })' assert response == expected_response -def test__get_access_0(network: str) -> None: +def test__get_access_1(network: str) -> None: response = call_canister_api( dfx_json_path=DFX_JSON_PATH, canister_name=CANISTER_NAME, canister_method="get_access", - canister_argument='(record { level = 0 : nat16 })', + canister_argument='(record { level = 1 : nat16 })', network=network, ) - expected_response = '(variant { Ok = record { explanation = "Only controllers"; level = 0 : nat16;} })' + expected_response = '(variant { Ok = record { explanation = "All except anonymous"; level = 1 : nat16;} })' assert response == expected_response -def test__set_access_1(network: str) -> None: +def test__set_access_0(network: str) -> None: response = call_canister_api( dfx_json_path=DFX_JSON_PATH, canister_name=CANISTER_NAME, canister_method="set_access", - canister_argument='(record { level = 1 : nat16 })', + canister_argument='(record { level = 0 : nat16 })', network=network, ) - expected_response = '(variant { Ok = record { explanation = "All except anonymous"; level = 1 : nat16;} })' + expected_response = '(variant { Ok = record { explanation = "Only controllers"; level = 0 : nat16;} })' assert response == expected_response -def test__get_access_1(network: str) -> None: +def test__get_access_0(network: str) -> None: response = call_canister_api( dfx_json_path=DFX_JSON_PATH, canister_name=CANISTER_NAME, canister_method="get_access", - canister_argument='(record { level = 1 : nat16 })', + canister_argument='(record { level = 0 : nat16 })', network=network, ) - expected_response = '(variant { Ok = record { explanation = "All except anonymous"; level = 1 : nat16;} })' - assert response == expected_response \ No newline at end of file + expected_response = '(variant { Ok = record { explanation = "Only controllers"; level = 0 : nat16;} })' + assert response == expected_response From ba58d3bfcda1a133695badaf97c4a6e1c5f1104f Mon Sep 17 00:00:00 2001 From: icpp Date: Wed, 29 Jan 2025 16:01:30 -0500 Subject: [PATCH 15/25] Do not call load_model from upload.py It is better to separate this out into another step. 
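For reference, a minimal sketch of that separate step, run after the upload completes and mirroring the two calls removed from upload.py (assuming the canister is deployed as `llama_cpp` and the gguf was uploaded as `models/model.gguf`; adjust both names to your deployment):

```bash
# Load the uploaded gguf into Orthogonal Persisted memory
dfx canister call llama_cpp load_model '(record { args = vec {"--model"; "models/model.gguf"; "--no-warmup";} })'

# Check readiness for inference
dfx canister call llama_cpp ready
```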
--- scripts/upload.py | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/scripts/upload.py b/scripts/upload.py index de4e492..4bf1deb 100644 --- a/scripts/upload.py +++ b/scripts/upload.py @@ -65,8 +65,6 @@ def main() -> int: dfx_json_path = ROOT_PATH / "dfx.json" - uploading_gguf = local_filename_path.suffix.lower() == ".gguf" - print( f"Summary:" f"\n - canister_filename = {canister_filename}" @@ -77,7 +75,6 @@ def main() -> int: f"\n - canister_id = {canister_id}" f"\n - dfx_json_path = {dfx_json_path}" f"\n - candid_path = {candid_path}" - f"\n - uploading_gguf = {uploading_gguf}" ) # --------------------------------------------------------------------------- @@ -158,39 +155,7 @@ def main() -> int: offset += len(chunk) - # --------------------------------------------------------------------------- - # Do something special if we're uploading a llama_cpp_canister model (gguf) - if uploading_gguf: - # load the model inside the canister into Orthogonal Persisted memory - print( - "--\nInstruct canister to load the model, getting it ready for inference." - ) - response = canister_instance.load_model( - {"args": ["--model", canister_filename]} - ) - if "Ok" in response[0].keys(): - if DEBUG_VERBOSE >= 2: - print("OK!") - else: - print("Something went wrong:") - print(response) - sys.exit(1) - - # --------------------------------------------------------------------------- - # check readiness for inference - print("--\nChecking if the canister is ready for inference.") - response = canister_instance.ready() - if "Ok" in response[0].keys(): - if DEBUG_VERBOSE >= 2: - print("OK!") - else: - print("Something went wrong:") - print(response) - sys.exit(1) - - print(f"--\nCongratulations, canister {canister_name} is ready for inference!") - else: - print(f"--\nCongratulations, the file {local_filename_path} was uploaded!") + print(f"--\nCongratulations, the file {local_filename_path} was uploaded!") try: print("πŸ’― πŸŽ‰ 🏁") From b7bd4cb59075347dce0d7df577064e16014d8b67 Mon Sep 17 00:00:00 2001 From: icpp Date: Wed, 29 Jan 2025 16:02:56 -0500 Subject: [PATCH 16/25] Update comment --- src/main_.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/main_.cpp b/src/main_.cpp index 45815d7..21495d0 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -1128,10 +1128,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only common_sampler_free(smpl); // ICPP-PATCH-START - - // TODO-615212 -- Make sure this is correct - // LEAVE IT IN - // Do reset all other static memory + // Reset all static memory we do not want to carry over to the next update call reset_static_memory(); // ICPP-PATCH-END From bf6141d14da6088b6eb8a87d2c21e1892f2699a4 Mon Sep 17 00:00:00 2001 From: icpp Date: Fri, 31 Jan 2025 15:56:05 -0500 Subject: [PATCH 17/25] For clarity, dfx.json uses the .did file in 'build' folder --- dfx.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dfx.json b/dfx.json index 03999f0..a6a11ef 100644 --- a/dfx.json +++ b/dfx.json @@ -3,7 +3,7 @@ "canisters": { "llama_cpp": { "type": "custom", - "candid": "src/llama_cpp.did", + "candid": "build/llama_cpp.did", "wasm": "build/llama_cpp.wasm" } }, From 7ff8fa65c4c77105ef3880ae94a2a0ef33816965 Mon Sep 17 00:00:00 2001 From: icpp Date: Sat, 1 Feb 2025 09:48:10 -0500 Subject: [PATCH 18/25] remove_log_file Logging changed in this version, and we need to provide mechanism to use remove log files. 
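A short usage sketch of the new endpoint (assuming a canister named `llama_cpp` and a log file created by passing `--log-file main.log` to earlier calls; see the README changes below for the full workflow):

```bash
# Download the log file for inspection
python -m scripts.download --network local --canister llama_cpp --local-filename main.log main.log

# Remove the log file from the canister when done
dfx canister call llama_cpp remove_log_file '(record { args = vec {"--log-file"; "main.log"} })'
```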
--- README-0002-615212.md | 11 +++ README.md | 27 +++++++- native/test_qwen2.cpp | 11 +++ native/test_tiny_stories.cpp | 126 +++++++++++++++++++++++++---------- src/llama_cpp.did | 1 + src/main_.cpp | 3 + src/model.cpp | 1 + src/run.cpp | 100 +++++++++++++++++++-------- src/run.h | 1 + 9 files changed, 218 insertions(+), 63 deletions(-) diff --git a/README-0002-615212.md b/README-0002-615212.md index ba98c79..1b7bf47 100644 --- a/README-0002-615212.md +++ b/README-0002-615212.md @@ -188,6 +188,17 @@ make build-info-cpp-wasm - outcomment `set_process_priority` function #### llama_cpp_onicai_fork/common/log.cpp +- Add function `common_log_remove_file` to the public API + ```C++ + // ICPP-PATCH-START + // We need to add a public function to remove the log file from the canister + void common_log_remove_file(struct common_log * log) { + log->remove_file(); + } + // ICPP-PATCH-END + ``` +- Add public function `remove_file` to the struct common_log: + - Remove all threading logic #include #include diff --git a/README.md b/README.md index 5a3935f..b18ca50 100644 --- a/README.md +++ b/README.md @@ -207,10 +207,31 @@ The build of the wasm must be done on a `Mac` ! dfx canister call llama_cpp get_chats ``` -TODO-615212: there is no longer a main.log file? -- For debug purposes, you can download the `main.log` file from the canister with: - ``` +- For debug purposes, you can tell the canister to log to a file and download it afterwards: + + ```bash + # Start a new chat - this resets the prompt-cache for this conversation + dfx canister call llama_cpp new_chat '(record { args = vec {"--prompt-cache"; "prompt.cache"} })' + + # Pass '"--log-file"; "main.log";' to the `run_update` calls: + + # Repeat this call until `prompt_remaining` in the response is empty. + # This ingest the prompt into the prompt-cache, using multiple update calls + # Important: KEEP SENDING THE FULL PROMPT + dfx canister call llama_cpp run_update '(record { args = vec {"--log-file"; "main.log"; "--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n"; "-n"; "512" } })' + ... + + # Once `prompt_remaining` in the response is empty, repeat this call, with an empty prompt, until `generated_eog=true` + # Now the LLM is generating new tokens ! 
+ dfx canister call llama_cpp run_update '(record { args = vec {"--log-file"; "main.log"; "--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; ""; "-n"; "512" } })' + + + # Download the `main.log` file from the canister: python -m scripts.download --network local --canister llama_cpp --local-filename main.log main.log + + # Cleanup, by deleting both the log & prompt.cache files in the canister: + dfx canister call llama_cpp remove_prompt_cache '(record { args = vec {"--prompt-cache"; "prompt.cache"} })' + dfx canister call llama_cpp remove_log_file '(record { args = vec {"--log-file"; "main.log"} })' ``` ## Smoke testing the deployed LLM diff --git a/native/test_qwen2.cpp b/native/test_qwen2.cpp index aada78b..a7b99f2 100644 --- a/native/test_qwen2.cpp +++ b/native/test_qwen2.cpp @@ -161,4 +161,15 @@ void test_qwen2(MockIC &mockIC) { "4449444c026c01dd9ad28304016d710100020e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865", "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100850143616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f70726f6d70742e63616368652064656c65746564207375636365737366756c6c790000c8000000", silent_on_trap, my_principal); + + // ----------------------------------------------------------------------------- + // Remove the log-file file if it exists + // '(record { args = vec {"--log-file"; "main.log"} })' -> + // '(variant { Ok = record { status_code = 200 : nat16; output = "Cache file .canister_cache/expmt-gtxsw-inftj-ttabj-qhp5s-nozup-n3bbo-k7zvn-dg4he-knac3-lae/sessions/main.log deleted successfully"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' + mockIC.run_test( + std::string(__func__) + ": " + "remove_log_file " + model, + remove_prompt_cache, + "4449444c026c01dd9ad28304016d710100020a2d2d6c6f672d66696c65086d61696e2e6c6f67", + "4449444c026b01bc8a01016c06819e846471c897a79907718a88f7f00b719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e010000810143616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f6d61696e2e6c6f672064656c65746564207375636365737366756c6c790000c8000000", + silent_on_trap, my_principal); } \ No newline at end of file diff --git a/native/test_tiny_stories.cpp b/native/test_tiny_stories.cpp index 90cb12d..5e3408b 100644 --- a/native/test_tiny_stories.cpp +++ b/native/test_tiny_stories.cpp @@ -93,40 +93,80 @@ void test_tiny_stories(MockIC &mockIC) { // Let's have two chats with this model for (int i = 0; i < 2; ++i) { - // ----------------------------------------------------------------------------- - // Start a new chat, which will remove the prompt-cache file if it exists - // '(record { args = vec {"--prompt-cache"; "prompt.cache"} })' -> - // '(variant { Ok = record { status_code = 200 : nat16; output = "Ready to start a new chat for cache file .canister_cache/expmt-gtxsw-inftj-ttabj-qhp5s-nozup-n3bbo-k7zvn-dg4he-knac3-lae/sessions/prompt.cache"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' - mockIC.run_test( - std::string(__func__) + ": " + "new_chat " + std::to_string(i) + - " - " + model, - new_chat, - "4449444c026c01dd9ad28304016d710100020e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865", - 
"4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a01000101008e01526561647920746f2073746172742061206e6577206368617420666f722063616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f70726f6d70742e63616368650000c8000000", - silent_on_trap, my_principal); - - // ----------------------------------------------------------------------------- - // Generate tokens from prompt while saving everything to cache, - // without re-reading the model ! - // '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; "Joe loves writing stories"} })' - // -> ... - mockIC.run_test( - std::string(__func__) + ": " + "run_update for chat " + - std::to_string(i) + " - " + model, - run_update, - "4449444c026c01dd9ad28304016d7101000d0e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d70194a6f65206c6f7665732077726974696e672073746f72696573", - "", silent_on_trap, my_principal); - - // ----------------------------------------------------------------------------- - // Continue generating tokens while using & saving the cache, without re-reading the model - // '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; ""} })' -> - // -> ... - mockIC.run_test( - std::string(__func__) + ": " + "run_update for chat " + - std::to_string(i) + " continued - " + model, - run_update, - "4449444c026c01dd9ad28304016d7101000d0e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d7000", - "", silent_on_trap, my_principal); + if (i == 0) { + // ----------------------------------------------------------------------------- + // Without log file + // Start a new chat, which will reset the prompt-cache file + // '(record { args = vec {"--prompt-cache"; "prompt.cache"} })' -> + // '(variant { Ok = record { status_code = 200 : nat16; output = "Ready to start a new chat for cache file .canister_cache/expmt-gtxsw-inftj-ttabj-qhp5s-nozup-n3bbo-k7zvn-dg4he-knac3-lae/sessions/prompt.cache"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' + mockIC.run_test( + std::string(__func__) + ": " + "new_chat " + std::to_string(i) + + " - " + model, + new_chat, + "4449444c026c01dd9ad28304016d710100020e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865", + "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a01000101008e01526561647920746f2073746172742061206e6577206368617420666f722063616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f70726f6d70742e63616368650000c8000000", + silent_on_trap, my_principal); + + // ----------------------------------------------------------------------------- + // Generate tokens from prompt while saving everything to cache, + // without re-reading the model ! 
+ // '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; "Joe loves writing stories"} })' + // -> ... + mockIC.run_test( + std::string(__func__) + ": " + "run_update for chat " + + std::to_string(i) + " - " + model, + run_update, + "4449444c026c01dd9ad28304016d7101000d0e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d70194a6f65206c6f7665732077726974696e672073746f72696573", + "", silent_on_trap, my_principal); + + // ----------------------------------------------------------------------------- + // Continue generating tokens while using & saving the cache, without re-reading the model + // '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; ""} })' -> + // -> ... + mockIC.run_test( + std::string(__func__) + ": " + "run_update for chat " + + std::to_string(i) + " continued - " + model, + run_update, + "4449444c026c01dd9ad28304016d7101000d0e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d7000", + "", silent_on_trap, my_principal); + + } else { + // ----------------------------------------------------------------------------- + // With log file + // Start a new chat, which will reset both the prompt-cache and log-file files + // '(record { args = vec {"--log-file"; "main.log"; "--prompt-cache"; "prompt.cache"} })' -> + // '(variant { Ok = record { status_code = 200 : nat16; output = "Ready to start a new chat for cache file .canister_cache/expmt-gtxsw-inftj-ttabj-qhp5s-nozup-n3bbo-k7zvn-dg4he-knac3-lae/sessions/prompt.cache"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' + mockIC.run_test( + std::string(__func__) + ": " + "new_chat " + std::to_string(i) + + " - " + model, + new_chat, + "4449444c026c01dd9ad28304016d710100040a2d2d6c6f672d66696c65086d61696e2e6c6f670e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865", + "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a01000101008e01526561647920746f2073746172742061206e6577206368617420666f722063616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f70726f6d70742e63616368650000c8000000", + silent_on_trap, my_principal); + + // ----------------------------------------------------------------------------- + // Generate tokens from prompt while saving everything to cache, + // without re-reading the model ! + // '(record { args = vec {"--log-file"; "main.log"; "--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; "Joe loves writing stories"} })' + // -> ... 
+ mockIC.run_test( + std::string(__func__) + ": " + "run_update for chat " + + std::to_string(i) + " - " + model, + run_update, + "4449444c026c01dd9ad28304016d7101000f0a2d2d6c6f672d66696c65086d61696e2e6c6f670e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d70194a6f65206c6f7665732077726974696e672073746f72696573", + "", silent_on_trap, my_principal); + + // ----------------------------------------------------------------------------- + // Continue generating tokens while using & saving the cache, without re-reading the model + // '(record { args = vec {"--log-file"; "main.log"; "--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; ""} })' -> + // -> ... + mockIC.run_test( + std::string(__func__) + ": " + "run_update for chat " + + std::to_string(i) + " continued - " + model, + run_update, + "4449444c026c01dd9ad28304016d7101000f0a2d2d6c6f672d66696c65086d61696e2e6c6f670e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d7000", + "", silent_on_trap, my_principal); + } // ----------------------------------------------------------------------------- // Remove the prompt-cache file if it exists @@ -139,6 +179,24 @@ void test_tiny_stories(MockIC &mockIC) { "4449444c026c01dd9ad28304016d710100020e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865", "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100850143616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f70726f6d70742e63616368652064656c65746564207375636365737366756c6c790000c8000000", silent_on_trap, my_principal); + + // ----------------------------------------------------------------------------- + // Remove the log-file file if it exists + // '(record { args = vec {"--log-file"; "main.log"} })' -> response + std::string response; + if (i == 1) { + // '(variant { Ok = record { status_code = 200 : nat16; output = "Successfully removed log file: main.log"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' + response = + "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100275375636365737366756c6c792072656d6f766564206c6f672066696c653a206d61696e2e6c6f670000c8000000"; + } else { + response = + "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100275375636365737366756c6c792072656d6f766564206c6f672066696c653a206d61696e2e6c6f670000c8000000"; + } + mockIC.run_test( + std::string(__func__) + ": " + "remove_log_file " + model, + remove_log_file, + "4449444c026c01dd9ad28304016d710100020a2d2d6c6f672d66696c65086d61696e2e6c6f67", + response, silent_on_trap, my_principal); } } } \ No newline at end of file diff --git a/src/llama_cpp.did b/src/llama_cpp.did index 6c14fef..6d240f9 100644 --- a/src/llama_cpp.did +++ b/src/llama_cpp.did @@ -120,6 +120,7 @@ service : { run_query : (InputRecord) -> (OutputRecordResult) query; run_update : (InputRecord) -> (OutputRecordResult); remove_prompt_cache : (InputRecord) -> (OutputRecordResult); + remove_log_file : (InputRecord) -> (OutputRecordResult); // 
Chats retrieval get_chats : () -> (GetChatsRecordResult) query; diff --git a/src/main_.cpp b/src/main_.cpp index 21495d0..b61b793 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -1128,6 +1128,9 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only common_sampler_free(smpl); // ICPP-PATCH-START + // Close log file and reset pointers, so next call will start fresh, with or without logging + common_log_set_file(common_log_main(), nullptr); + // Reset all static memory we do not want to carry over to the next update call reset_static_memory(); // ICPP-PATCH-END diff --git a/src/model.cpp b/src/model.cpp index ddc9ec1..27ecee3 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -11,6 +11,7 @@ #include "arg.h" #include "common.h" +#include "log.h" #include #include diff --git a/src/run.cpp b/src/run.cpp index 79fa8d8..2183837 100644 --- a/src/run.cpp +++ b/src/run.cpp @@ -8,6 +8,7 @@ #include "utils.h" #include "arg.h" +#include "log.h" #include #include @@ -45,33 +46,38 @@ void new_chat() { CandidTypePrincipal caller = ic_api.get_caller(); std::string principal_id = caller.get_text(); - auto [argc, argv, args] = get_args_for_main(ic_api); - - // Create/reset a prompt-cache file to zero length, will reset the LLM state for that conversation - // Get the cache filename from --prompt-cache in args - common_params params; - if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, - print_usage)) { - error_msg = "Cannot parse args."; + // ----------------------------------------------------------- + // Create a new file to save this chat for this prinicipal + if (!db_chats_new(principal_id, error_msg)) { send_output_record_result_error_to_wire( ic_api, Http::StatusCode::InternalServerError, error_msg); return; } - // Create a new file to save this chat for this prinicipal - if (!db_chats_new(principal_id, error_msg)) { + // Each principal can only save N chats + if (!db_chats_clean(principal_id, error_msg)) { send_output_record_result_error_to_wire( ic_api, Http::StatusCode::InternalServerError, error_msg); return; } - // Each principal can only save N chats - if (!db_chats_clean(principal_id, error_msg)) { + // ----------------------------------------------------------- + // Parse the arguments + auto [argc, argv, args] = get_args_for_main(ic_api); + + // (-) gets the cache filename from --prompt-cache in args + // (-) opens log file from --log-file in args + common_params params; + if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, + print_usage)) { + error_msg = "Cannot parse args."; send_output_record_result_error_to_wire( ic_api, Http::StatusCode::InternalServerError, error_msg); return; } + // ----------------------------------------------------------- + // Create/reset a prompt-cache file to zero length, will reset the LLM state for that conversation // Each principal has their own cache folder std::string path_session = params.path_prompt_cache; std::string canister_path_session; @@ -111,6 +117,20 @@ void new_chat() { // Simpler message back to the wire msg = "Ready to start a new chat for cache file " + path_session; + // ----------------------------------------------------------- + // If --log-file is provided, the file was opened by common_params_parse + // Was it already closed, and common_log_main() does not work anymore??? 
+ // If so, then store --log-file value in params.log_file, and delete it here + // If not, then get the file handle from common_log_main() and empty the file + // + // When running native, the log file is only closed at the end... + // it is opened multiple times. Does that work OK ? + + // When running in the IC, the log file is ???? + + std::cout << "TODO"; + + // ----------------------------------------------------------- // Return output over the wire CandidTypeRecord r_out; r_out.append("status_code", CandidTypeNat16{Http::StatusCode::OK}); // 200 @@ -147,20 +167,6 @@ void remove_prompt_cache() { return; } - // // Create a new file to save this chat for this prinicipal - // if (!db_chats_new(principal_id, error_msg)) { - // send_output_record_result_error_to_wire( - // ic_api, Http::StatusCode::InternalServerError, error_msg); - // return; - // } - - // // Each principal can only save N chats - // if (!db_chats_clean(principal_id, error_msg)) { - // send_output_record_result_error_to_wire( - // ic_api, Http::StatusCode::InternalServerError, error_msg); - // return; - // } - // Each principal has their own cache folder std::string path_session = params.path_prompt_cache; std::string canister_path_session; @@ -207,6 +213,48 @@ void remove_prompt_cache() { ic_api.to_wire(CandidTypeVariant{"Ok", r_out}); } +void remove_log_file() { + IC_API ic_api(CanisterUpdate{std::string(__func__)}, false); + std::string error_msg; + if (!is_caller_whitelisted(ic_api, false)) { + error_msg = "Access Denied."; + send_output_record_result_error_to_wire( + ic_api, Http::StatusCode::Unauthorized, error_msg); + return; + } + + auto [argc, argv, args] = get_args_for_main(ic_api); + + // Process the args, which will instantiate the log singleton + common_params params; + if (!common_params_parse(argc, argv.data(), params, LLAMA_EXAMPLE_MAIN, + print_usage)) { + error_msg = "Cannot parse args."; + send_output_record_result_error_to_wire( + ic_api, Http::StatusCode::InternalServerError, error_msg); + return; + } + + // Now we can remove the log file + std::string msg; + bool success = common_log_remove_file(common_log_main(), msg); + if (!success) { + send_output_record_result_error_to_wire( + ic_api, Http::StatusCode::InternalServerError, msg); + return; + } + + // Return output over the wire + CandidTypeRecord r_out; + r_out.append("status_code", CandidTypeNat16{Http::StatusCode::OK}); // 200 + r_out.append("conversation", CandidTypeText{""}); + r_out.append("output", CandidTypeText{msg}); + r_out.append("error", CandidTypeText{""}); + r_out.append("prompt_remaining", CandidTypeText{""}); + r_out.append("generated_eog", CandidTypeBool{false}); + ic_api.to_wire(CandidTypeVariant{"Ok", r_out}); +} + void run(IC_API &ic_api, const uint64_t &max_tokens) { std::string error_msg; if (!is_caller_whitelisted(ic_api, false)) { diff --git a/src/run.h b/src/run.h index 331ff00..9574865 100644 --- a/src/run.h +++ b/src/run.h @@ -8,6 +8,7 @@ void run_query() WASM_SYMBOL_EXPORTED("canister_query run_query"); void run_update() WASM_SYMBOL_EXPORTED("canister_update run_update"); void remove_prompt_cache() WASM_SYMBOL_EXPORTED("canister_update remove_prompt_cache"); +void remove_log_file() WASM_SYMBOL_EXPORTED("canister_update remove_log_file"); bool get_canister_path_session(const std::string &path_session, const std::string &principal_id, From 9c0fc1f3057d8acc618d187b6e40aa74c5c15077 Mon Sep 17 00:00:00 2001 From: icpp Date: Sat, 1 Feb 2025 09:57:17 -0500 Subject: [PATCH 19/25] Update native & pytests --- 
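Notes: the tests now cover the `remove_log_file` endpoint both natively and via pytest. A hypothetical local invocation of the updated pytest suite (assuming the test conftest exposes a `--network` option, as the `network` fixture suggests):

```bash
# Hypothetical: run the updated qwen2 tests against a locally deployed canister
pytest -v test/test_qwen2.py --network local
```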
native/test_qwen2.cpp | 6 +++--- native/test_tiny_stories.cpp | 15 ++++----------- test/test_qwen2.py | 10 ++++++++++ test/test_tiny_stories.py | 10 ++++++++++ 4 files changed, 27 insertions(+), 14 deletions(-) diff --git a/native/test_qwen2.cpp b/native/test_qwen2.cpp index a7b99f2..d48dc23 100644 --- a/native/test_qwen2.cpp +++ b/native/test_qwen2.cpp @@ -165,11 +165,11 @@ void test_qwen2(MockIC &mockIC) { // ----------------------------------------------------------------------------- // Remove the log-file file if it exists // '(record { args = vec {"--log-file"; "main.log"} })' -> - // '(variant { Ok = record { status_code = 200 : nat16; output = "Cache file .canister_cache/expmt-gtxsw-inftj-ttabj-qhp5s-nozup-n3bbo-k7zvn-dg4he-knac3-lae/sessions/main.log deleted successfully"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' + // '(variant { Ok = record { status_code = 200 : nat16; output = "Successfully removed log file: main.log"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' mockIC.run_test( std::string(__func__) + ": " + "remove_log_file " + model, - remove_prompt_cache, + remove_log_file, "4449444c026c01dd9ad28304016d710100020a2d2d6c6f672d66696c65086d61696e2e6c6f67", - "4449444c026b01bc8a01016c06819e846471c897a79907718a88f7f00b719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e010000810143616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f6d61696e2e6c6f672064656c65746564207375636365737366756c6c790000c8000000", + "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100275375636365737366756c6c792072656d6f766564206c6f672066696c653a206d61696e2e6c6f670000c8000000", silent_on_trap, my_principal); } \ No newline at end of file diff --git a/native/test_tiny_stories.cpp b/native/test_tiny_stories.cpp index 5e3408b..528a1af 100644 --- a/native/test_tiny_stories.cpp +++ b/native/test_tiny_stories.cpp @@ -182,21 +182,14 @@ void test_tiny_stories(MockIC &mockIC) { // ----------------------------------------------------------------------------- // Remove the log-file file if it exists - // '(record { args = vec {"--log-file"; "main.log"} })' -> response - std::string response; - if (i == 1) { - // '(variant { Ok = record { status_code = 200 : nat16; output = "Successfully removed log file: main.log"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' - response = - "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100275375636365737366756c6c792072656d6f766564206c6f672066696c653a206d61696e2e6c6f670000c8000000"; - } else { - response = - "4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100275375636365737366756c6c792072656d6f766564206c6f672066696c653a206d61696e2e6c6f670000c8000000"; - } + // '(record { args = vec {"--log-file"; "main.log"} })' -> + // '(variant { Ok = record { status_code = 200 : nat16; output = "Successfully removed log file: main.log"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })' mockIC.run_test( std::string(__func__) + ": " + "remove_log_file " + model, remove_log_file, "4449444c026c01dd9ad28304016d710100020a2d2d6c6f672d66696c65086d61696e2e6c6f67", - response, silent_on_trap, my_principal); + 
"4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100275375636365737366756c6c792072656d6f766564206c6f672066696c653a206d61696e2e6c6f670000c8000000", + silent_on_trap, my_principal); } } } \ No newline at end of file diff --git a/test/test_qwen2.py b/test/test_qwen2.py index 280516e..d593a24 100644 --- a/test/test_qwen2.py +++ b/test/test_qwen2.py @@ -121,4 +121,14 @@ def test__remove_prompt_cache(network: str) -> None: canister_argument='(record { args = vec {"--prompt-cache"; "prompt.cache"} })', network=network, ) + assert "(variant { Ok" in response + +def test__remove_log_file(network: str) -> None: + response = call_canister_api( + dfx_json_path=DFX_JSON_PATH, + canister_name=CANISTER_NAME, + canister_method="remove_log_file", + canister_argument='(record { args = vec {"--log-file"; "main.log"} })', + network=network, + ) assert "(variant { Ok" in response \ No newline at end of file diff --git a/test/test_tiny_stories.py b/test/test_tiny_stories.py index 838fa21..c17fbc6 100644 --- a/test/test_tiny_stories.py +++ b/test/test_tiny_stories.py @@ -141,4 +141,14 @@ def test__remove_prompt_cache(network: str) -> None: canister_argument='(record { args = vec {"--prompt-cache"; "prompt.cache"} })', network=network, ) + assert "(variant { Ok" in response + +def test__remove_log_file(network: str) -> None: + response = call_canister_api( + dfx_json_path=DFX_JSON_PATH, + canister_name=CANISTER_NAME, + canister_method="remove_log_file", + canister_argument='(record { args = vec {"--log-file"; "main.log"} })', + network=network, + ) assert "(variant { Ok" in response \ No newline at end of file From 4f98e55fa7609deb5840d6d1aa9d13f70d9e966e Mon Sep 17 00:00:00 2001 From: icpp Date: Sat, 1 Feb 2025 10:00:43 -0500 Subject: [PATCH 20/25] CI/CD - use different branch while working on upgrade --- .github/workflows/cicd-mac.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cicd-mac.yml b/.github/workflows/cicd-mac.yml index 068c29e..8fc7bc8 100644 --- a/.github/workflows/cicd-mac.yml +++ b/.github/workflows/cicd-mac.yml @@ -39,7 +39,8 @@ jobs: uses: actions/checkout@v4 with: repository: onicai/llama_cpp_onicai_fork - ref: onicai # Specify the branch name here + # ref: onicai # Specify the branch name here + ref: onicai-615212 # While working on the upgrade... 
path: src/llama_cpp_onicai_fork fetch-depth: 1 # Get just the last commit submodules: 'recursive' From 0280511e7d168f5e8a94b95da23ef97ff668a964 Mon Sep 17 00:00:00 2001 From: icpp Date: Sat, 1 Feb 2025 13:17:30 -0500 Subject: [PATCH 21/25] format include --- src/main_.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main_.cpp b/src/main_.cpp index b61b793..bc7ebca 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) #include From 30a6b8cc838e56b7026e480dab289d96dff2f103 Mon Sep 17 00:00:00 2001 From: icpp Date: Sun, 2 Feb 2025 09:07:08 -0500 Subject: [PATCH 22/25] Update READMEs --- README-0002-615212.md | 40 ++---------------------------------- README-contributors-guide.md | 10 ++++----- README.md | 3 ++- 3 files changed, 9 insertions(+), 44 deletions(-) diff --git a/README-0002-615212.md b/README-0002-615212.md index 1b7bf47..2e352a1 100644 --- a/README-0002-615212.md +++ b/README-0002-615212.md @@ -326,45 +326,13 @@ No updates needed for icpp-pro - #define GGML_DEFAULT_N_THREADS 1 ------------ -TODO: search in code files for: TODO-615212 - -(-) main_.cpp has a new static `global g_smpl`: - static common_sampler ** g_smpl; - - Q: Does this need to become a global variable, accessible from common.cpp ? - Like we did for g_model ? - - In `common/common.cpp` we added: - ``` - // ICPP-PATCH-START - #include "ic_api.h" - extern llama_model ** g_model; // The global variable from main_.cpp - // ICPP-PATCH-END - ``` - -(-) main_.cpp renamed type for `g_params`: - from: static gpt_params * g_params; - to : static common_params * g_params; - - Q: Does this need to become a global variable, accessible from common.cpp ? - Like we did for g_model ? - -(-) main_.cpp line 142: common_sampler * smpl = nullptr; - - Q: Does `smpl` need to become a static variable, like `model` & `ctx` ? - -(-) main_.cpp line 147: // Don't give error if embd_inp = session_tokens. All is OK to just keep going - - Q: Is this logic for prompt_remaining still valid? +TODOs: (-) LOG & LOG_TEE have been replaced by LOG, LOG_ERR, LOG_WRN, LOG_INF, LOG_CNT -> LOG is used just for Console/Stream Output -> LOG_xxx is used for ERR, WRN, INF, CNT --> Not sure yet where this goes... - Q1: Did we change anything to LOG & LOG_TEE to get it to work ? - Q2: Are we still using LOG & LOG_TEE ourselvs? If so, replace it. - Q3: Can we remove the LOG & LOG_TEE - Q4: Do we need to update the README about downloading different LOG files? + Q4: Update the README about downloading different LOG files? (-) llama-vocab.cpp --- This function is no longer there. Is tinystories still working? @@ -379,10 +347,6 @@ TODO: search in code files for: TODO-615212 } ``` -(-) TODO: `llama_cpp_onicai_fork/common/log.cpp` step through the logic - - Remove the pause() function - - Remove the cur.is_end function ? - (-) TODO: Monitor memory, and make sure that ctx is freed up... 
See free_ctx() method that has been outcommented in main_.cpp diff --git a/README-contributors-guide.md b/README-contributors-guide.md index 16415b0..504ef1d 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -34,11 +34,11 @@ Take following steps locally: - These are the git-sha values of the llama.cpp versions we branched from: - | upgrade # | llama.cpp sha | llama.cpp release-tag | - | --------- | ------------- | --------------------- | - | 0000 | 5cdb37 | - | - | 0001 | b841d0 | - | - | 0002 | 615212 | b4532 | + | upgrade # | llama.cpp sha | llama.cpp release-tag | date | + | --------- | ------------- | --------------------- | ---------- | + | 0000 | 5cdb37 | - | - | + | 0001 | b841d0 | - | - | + | 0002 | 615212 | b4532 | Feb 2 '25 | - Start with a fresh clone of llama_cpp_onicai_fork: diff --git a/README.md b/README.md index b18ca50..640914f 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,8 @@ The build of the wasm must be done on a `Mac` ! - Upload gguf file The canister is now up & running, and ready to be loaded with a gguf file. In - this example we use the powerful `qwen2.5-0.5b-instruct-q8_0.gguf` model. + this example we use the powerful `qwen2.5-0.5b-instruct-q8_0.gguf` model, but + you can use any model availabe in gguf format. - Download the model from huggingface: https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF From 81e00f04c1506a385c53da79f54f07981740fd55 Mon Sep 17 00:00:00 2001 From: icpp Date: Sun, 2 Feb 2025 09:14:43 -0500 Subject: [PATCH 23/25] Update table of llama.cpp upgrades --- README-contributors-guide.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README-contributors-guide.md b/README-contributors-guide.md index 504ef1d..3cbed06 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -36,9 +36,9 @@ Take following steps locally: | upgrade # | llama.cpp sha | llama.cpp release-tag | date | | --------- | ------------- | --------------------- | ---------- | - | 0000 | 5cdb37 | - | - | - | 0001 | b841d0 | - | - | | 0002 | 615212 | b4532 | Feb 2 '25 | + | 0001 | b841d0 | - | Oct 18 '24 | + | 0000 | 5cdb37 | - | Jul 21 '24 | - Start with a fresh clone of llama_cpp_onicai_fork: From 5228f7682340b76c4e40a9deae4cec9ded262730 Mon Sep 17 00:00:00 2001 From: icpp Date: Sun, 2 Feb 2025 09:28:43 -0500 Subject: [PATCH 24/25] Running LLMs on-chain solves your cybersecurity problem --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 640914f..3c35809 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,11 @@ ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) -`llama_cpp_canister` allows you to deploy [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp) as a Smart Contract on the Internet Computer. +`llama_cpp_canister` allows you to deploy [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp) as a Smart Contract on the Internet Computer, +and run an LLM on-chain as the brain for your on-chain AI Agents. +- Run any LLM on-chain via the gguf format πŸ”₯ +- Solves your cybersecurity problem πŸ” - MIT open source πŸ§‘β€πŸ’» - Well documented πŸ“ - Fully QA'd via CI/CD βœ… @@ -16,15 +19,12 @@ # Try it out -You can try out a deployed version at https://icgpt.onicai.com +You can try out a variety of fully on-chain LLMs at https://icgpt.onicai.com -# Need help? +# Need help or have feedback? 
❀️ -If you decide to use llama_cpp_canister in your ICP dApp, we want to help you. - -We do NOT consider llama_cpp_canister "our IP". It is for the broad benefit of DeAI on ICP, and we hope many of you will try it out and use it. - -Please join our [OpenChat C++ community](https://oc.app/community/cklkv-3aaaa-aaaar-ar7uq-cai/?ref=6e3y2-4yaaa-aaaaf-araya-cai) for any questions, discussions or feedback. ❀️ +- [OpenChat C++ community](https://oc.app/community/cklkv-3aaaa-aaaar-ar7uq-cai/?ref=6e3y2-4yaaa-aaaaf-araya-cai) +- [Forum: Llama.cpp on the Internet Computer](https://forum.dfinity.org/t/llama-cpp-on-the-internet-computer/33471?u=icpp) # Capabilities πŸ”₯ From 3ff73fafbe720d882eb76aba782bb599a9dd6d47 Mon Sep 17 00:00:00 2001 From: icpp Date: Sun, 2 Feb 2025 09:59:30 -0500 Subject: [PATCH 25/25] Upgrade to llama.cpp sha 615212 All done... --- .github/workflows/cicd-mac.yml | 4 ++-- README-contributors-guide.md | 7 ++++++- README.md | 6 +++++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cicd-mac.yml b/.github/workflows/cicd-mac.yml index 8fc7bc8..c383315 100644 --- a/.github/workflows/cicd-mac.yml +++ b/.github/workflows/cicd-mac.yml @@ -39,8 +39,8 @@ jobs: uses: actions/checkout@v4 with: repository: onicai/llama_cpp_onicai_fork - # ref: onicai # Specify the branch name here - ref: onicai-615212 # While working on the upgrade... + ref: onicai # Specify the branch name here + # ref: onicai-615212 # While working on the upgrade... path: src/llama_cpp_onicai_fork fetch-depth: 1 # Get just the last commit submodules: 'recursive' diff --git a/README-contributors-guide.md b/README-contributors-guide.md index 3cbed06..7c533b7 100644 --- a/README-contributors-guide.md +++ b/README-contributors-guide.md @@ -84,10 +84,15 @@ We need to rethink this logic, but for now it is ok... Do NOT merge the `onicai-` branch into the `onicai` branch, but replace it: ``` +# do the onicai branch management while master branch is checked out +git checkout master git branch -m onicai onicai- git branch -m onicai- onicai -git push origin onicai:onicai +git push --force origin onicai:onicai git push origin onicai-:onicai- +# +# Switch to the onicai branch, which now contains the version +git checkout onicai ``` ## llama_cpp_canister diff --git a/README.md b/README.md index 3c35809..b5d0d50 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,12 @@ You can try out a variety of fully on-chain LLMs at https://icgpt.onicai.com # Capabilities πŸ”₯ - Deploy any LLM available as a gguf file. -- Our largest so far is DeepSeek-R1 1.5B (See [X](https://x.com/onicaiHQ/status/1884339580851151089)). + *(The model must be able to produce at least 1 token per update call)* + +- Our largest so far is DeepSeek-R1 1.5B (See [X](https://x.com/onicaiHQ/status/1884339580851151089)). + + # Set up The build of the wasm must be done on a `Mac` !