Add llama compatibility with new ggml quantization #642

Merged: 8 commits, May 21, 2023. Changes from all commits.
10 changes: 8 additions & 2 deletions .gitmodules
@@ -1,3 +1,9 @@
[submodule "llama.cpp"]
path = gpt4all-backend/llama.cpp
[submodule "llama.cpp-230519"]
path = gpt4all-backend/llama.cpp-230519
url = https://github.com/ggerganov/llama.cpp.git
[submodule "llama.cpp-230511"]
path = gpt4all-backend/llama.cpp-230511
url = https://github.com/manyoso/llama.cpp.git
[submodule "llama.cpp-mainline"]
path = gpt4all-backend/llama.cpp-mainline
url = https://github.com/ggerganov/llama.cpp.git
Collaborator:

Ok, ok, I get ya, but this isn't actually pinning them. Also, I think I still want all of them to use the 'manyoso' fork, as this gives us further control, right?

Contributor (Author):

Not sure what you mean; the manyoso fork hasn't been updated to the latest llama.cpp, it's 132 commits behind...

Contributor (Author):

Also, that fork only adds ALiBi, which is only needed for MPT.

Collaborator:

I mean we should update that fork and point to it, I believe. Lemme do that now.

28 changes: 22 additions & 6 deletions gpt4all-backend/CMakeLists.txt
@@ -54,7 +54,9 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX})

# Include GGML
include_ggml(llama.cpp -${BUILD_VARIANT} ON)
include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON)
include_ggml(llama.cpp-230511 -230511-${BUILD_VARIANT} ON)
include_ggml(llama.cpp-230519 -230519-${BUILD_VARIANT} ON)

# Function for preparing individual implementations
function(prepare_target TARGET_NAME BASE_LIB)
@@ -71,18 +73,32 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
PROPERTY INTERPROCEDURAL_OPTIMIZATION ${IPO_SUPPORTED})
endfunction()

# Add each individual implementation
add_library(llamamodel-${BUILD_VARIANT} SHARED
# Add each individual implementations
Collaborator:

Nitpick: you don't want the plural here.

Contributor (Author):

I noticed that as well, but decided to leave it as is since it's not worth a commit. Will batch this with further things that may come up.

add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED
llamamodel.cpp)
prepare_target(llamamodel llama)
target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
Collaborator:

=>= oh man, CMake... you're killing me

Contributor (Author):

Haha, yup. Looks confusing, is confusing, but does what we need quite flexibly.

Contributor (@imaami), May 21, 2023:

That conditional should probably be changed to a slightly less cursed variant:

#if LLAMA_VERSION <= 123456
// ...
#elif LLAMA_VERSION >= 654321
// ...
#endif

At least then it would be a readily recognizable pattern of tragic stylistic compromise instead of a confusing, entirely new way to crush one's hopes and dreams. It would also shrink the CMake side a little.

Pardon the gallows humour, can't help it whenever pre-processor macros seem necessary. ;)
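
An aside for anyone puzzling over the trick: the value given to LLAMA_VERSIONS on the CMake side is pasted into the source verbatim by the preprocessor, so "version LLAMA_VERSIONS" turns into an ordinary comparison after expansion. A minimal C++ sketch of the mechanism, using the mainline value from the CMakeLists.txt above (the version_supported helper is invented for illustration):

// Sketch: how the CMake-injected comparison macro is consumed.
// Building with -DLLAMA_VERSIONS=>=3 pastes the tokens ">= 3" into place,
// so "version LLAMA_VERSIONS" compiles as "version >= 3".
#include <cstdint>

#ifndef LLAMA_VERSIONS
#define LLAMA_VERSIONS >= 3 // fallback so this sketch compiles standalone
#endif

static bool version_supported(uint32_t version) {
    return version LLAMA_VERSIONS; // mainline: expands to "version >= 3"
}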

prepare_target(llamamodel-mainline llama-mainline)

add_library(llamamodel-230519-${BUILD_VARIANT} SHARED
llamamodel.cpp)
target_compile_definitions(llamamodel-230519-${BUILD_VARIANT} PRIVATE
LLAMA_VERSIONS===2 LLAMA_DATE=230519)
prepare_target(llamamodel-230519 llama-230519)

add_library(llamamodel-230511-${BUILD_VARIANT} SHARED
llamamodel.cpp)
target_compile_definitions(llamamodel-230511-${BUILD_VARIANT} PRIVATE
LLAMA_VERSIONS=<=1 LLAMA_DATE=230511)
prepare_target(llamamodel-230511 llama-230511)

add_library(gptj-${BUILD_VARIANT} SHARED
gptj.cpp)
prepare_target(gptj ggml)
prepare_target(gptj ggml-230511)
Collaborator:

Wait, where are you tagging the actual ggml with this?

Contributor (Author):

llama.cpp.cmake adds the given suffix to ggml as well.


add_library(mpt-${BUILD_VARIANT} SHARED
mpt.cpp)
prepare_target(mpt ggml)
prepare_target(mpt ggml-230511)
endforeach()

add_library(llmodel
6 changes: 4 additions & 2 deletions gpt4all-backend/gptj.cpp
@@ -1,6 +1,5 @@
#define GPTJ_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#include "gptj_impl.h"
#include "llama.cpp/ggml.h"

#include "utils.h"

@@ -26,6 +25,7 @@
#endif
#include <sstream>
#include <unordered_set>
#include <ggml.h>


namespace {
@@ -1133,7 +1133,9 @@ const char *get_build_variant() {
return GGML_BUILD_VARIANT;
}

bool magic_match(uint32_t magic) {
bool magic_match(std::istream& f) {
uint32_t magic = 0;
f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
return magic == 0x67676d6c;
}

1 change: 1 addition & 0 deletions gpt4all-backend/llama.cpp-230519
Submodule llama.cpp-230519 added at 5ea433
1 change: 1 addition & 0 deletions gpt4all-backend/llama.cpp-mainline
Submodule llama.cpp-mainline added at ea6000
8 changes: 7 additions & 1 deletion gpt4all-backend/llama.cpp.cmake
@@ -332,10 +332,16 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
endif()

if (WITH_LLAMA)
# Backwards compatibility with old llama.cpp versions
set(LLAMA_UTIL_SOURCE_FILE llama-util.h)
if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
set(LLAMA_UTIL_SOURCE_FILE llama_util.h)
endif()

add_library(llama${SUFFIX}
${DIRECTORY}/llama.cpp
${DIRECTORY}/llama.h
${DIRECTORY}/llama_util.h)
${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
Collaborator:

This branch doesn't actually introduce this file, right? It exists upstream in one of the pinned submodules?

Contributor (Author):

The filename was changed.


target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY})
target_compile_features(llama${SUFFIX} PUBLIC cxx_std_11) # don't bump
77 changes: 60 additions & 17 deletions gpt4all-backend/llamamodel.cpp
@@ -28,14 +28,23 @@
#include <llama.h>
#include <ggml.h>


namespace {
const char *modelType_ = "LLaMA";
}

struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
#if LLAMA_DATE <= 230511
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
#endif
Collaborator:

The crux of it. We're going to use macros...

Contributor (Author):

Our other option would be to have an extensive collection of almost-identical llamamodel.cpp files for different llama.cpp versions.

Collaborator:

No, I think this is the right choice out of a bunch of bad choices.

Contributor:

There's also CRTP and C++ template magic, but I agree it's not the time to go there yet.
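
To make that aside concrete, a CRTP arrangement might look roughly like the sketch below. This is only an illustration of the alternative mentioned above, not code proposed in this PR; the class and member names are invented.

// Hypothetical CRTP sketch: one concrete type per pinned llama.cpp version,
// sharing common logic through a base template instead of #if blocks.
#include <string>

template <typename Impl>
struct LLamaModelBase {
    bool load(const std::string &path) {
        // shared setup, then delegate the version-specific part
        return static_cast<Impl *>(this)->initParams(path);
    }
};

// Would be compiled only into the 230511 backend library.
struct LLamaModel230511 : LLamaModelBase<LLamaModel230511> {
    bool initParams(const std::string &path) {
        return !path.empty(); // old API: n_parts still exists here
    }
};

// Would be compiled only into the mainline backend library.
struct LLamaModelMainline : LLamaModelBase<LLamaModelMainline> {
    bool initParams(const std::string &path) {
        return !path.empty(); // new API: tfs_z / typical_p exist here
    }
};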


#if LLAMA_DATE >= 230519
// sampling parameters
float tfs_z = 1.0f; // 1.0 = disabled
float typical_p = 1.0f; // 1.0 = disabled
#endif

std::string prompt = "";

@@ -45,25 +54,45 @@ struct gpt_params {
bool use_mlock = false; // use mlock to keep model in memory
};

#if LLAMA_DATE >= 230519
static int llama_sample_top_p_top_k(
llama_context *ctx,
const llama_token *last_n_tokens_data,
int last_n_tokens_size,
int top_k,
float top_p,
float temp,
float repeat_penalty) {
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(ctx);
// Populate initial list of all candidates
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (int token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
// Sample repeat penalty
llama_sample_repetition_penalty(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty);
// Temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);
llama_sample_typical(ctx, &candidates_p, 1.0f, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_temperature(ctx, &candidates_p, temp);
return llama_sample_token(ctx, &candidates_p);
}
Collaborator:

Going to assume this is giving you sane results? Have you made sure to go through and test models with each of the pinned variants and file formats? Man, we almost want regression or unit tests here...

Contributor (Author):

Yup! I did. Man, was my hard drive full...

Contributor (Author):

This is also how it's done in the llama.cpp main example.

#endif
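
For context, keeping the old llama_sample_top_p_top_k signature means the call site further down in this file can stay version-agnostic. A rough sketch of such a call, assuming the headers llamamodel.cpp already includes; the helper name and parameter values are invented:

// Hypothetical call-site sketch: the shim hides whether the old or the new
// llama.cpp sampling API sits underneath.
static llama_token sample_next(llama_context *ctx,
                               const std::vector<llama_token> &recent) {
    return llama_sample_top_p_top_k(
        ctx,
        recent.data(), (int)recent.size(),
        /*top_k=*/40, /*top_p=*/0.95f,
        /*temp=*/0.7f, /*repeat_penalty=*/1.1f);
}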

struct LLamaPrivate {
const std::string modelPath;
bool modelLoaded;
llama_context *ctx = nullptr;
llama_context_params params;
int64_t n_threads = 0;
bool empty = true;
};


static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
// initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
std::vector<llama_token> res(text.size() + (int)add_bos);
int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
assert(n >= 0);
res.resize(n);

return res;
}

LLamaModel::LLamaModel()
: d_ptr(new LLamaPrivate) {
modelType = modelType_;
@@ -78,11 +107,13 @@ bool LLamaModel::loadModel(const std::string &modelPath)

gpt_params params;
d_ptr->params.n_ctx = 2048;
d_ptr->params.n_parts = params.n_parts;
d_ptr->params.seed = params.seed;
d_ptr->params.f16_kv = params.memory_f16;
d_ptr->params.use_mmap = params.use_mmap;
d_ptr->params.use_mlock = params.use_mlock;
#if LLAMA_DATE <= 230511
d_ptr->params.n_parts = params.n_parts;
#endif

d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);
if (!d_ptr->ctx) {
@@ -126,7 +157,8 @@ size_t LLamaModel::saveState(uint8_t *dest) const

size_t LLamaModel::restoreState(const uint8_t *src)
{
return llama_set_state_data(d_ptr->ctx, src);
// const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
}

void LLamaModel::prompt(const std::string &prompt,
@@ -147,7 +179,11 @@ void LLamaModel::prompt(const std::string &prompt,
params.prompt.insert(0, 1, ' ');

// tokenize the prompt
auto embd_inp = ::llama_tokenize(d_ptr->ctx, params.prompt, false);
std::vector<llama_token> embd_inp(params.prompt.size() + 4);
int n = llama_tokenize(d_ptr->ctx, params.prompt.c_str(), embd_inp.data(), embd_inp.size(), d_ptr->empty);
assert(n >= 0);
embd_inp.resize(n);
d_ptr->empty = false;
niansa marked this conversation as resolved.

// save the context size
promptCtx.n_ctx = llama_n_ctx(d_ptr->ctx);
@@ -313,8 +349,15 @@ const char *get_build_variant() {
return GGML_BUILD_VARIANT;
}

bool magic_match(uint32_t magic) {
return magic == 0x67676a74;
bool magic_match(std::istream& f) {
// Check magic
uint32_t magic = 0;
f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
if (magic != 0x67676a74) return false;
// Check version
uint32_t version = 0;
f.read(reinterpret_cast<char*>(&version), sizeof(version));
return version LLAMA_VERSIONS;
}

LLModel *construct() {
16 changes: 7 additions & 9 deletions gpt4all-backend/llmodel.cpp
@@ -9,7 +9,7 @@


static
Dlhandle *get_implementation(uint32_t magic, const std::string& buildVariant) {
Dlhandle *get_implementation(std::ifstream& f, const std::string& buildVariant) {
// Collect all model implementation libraries
static auto libs = [] () {
std::vector<Dlhandle> fres;
@@ -31,9 +31,10 @@ Dlhandle *get_implementation(uint32_t magic, const std::string& buildVariant) {
}();
// Iterate over all libraries
for (auto& dl : libs) {
f.seekg(0);
// Check that magic matches
auto magic_match = dl.get<bool(uint32_t)>("magic_match");
if (!magic_match || !magic_match(magic)) {
auto magic_match = dl.get<bool(std::ifstream&)>("magic_match");
if (!magic_match || !magic_match(f)) {
continue;
}
// Check that build variant is correct
@@ -55,14 +56,11 @@ LLModel *LLModel::construct(const std::string &modelPath, std::string buildVaria
}
// Read magic
std::ifstream f(modelPath, std::ios::binary);
uint32_t magic;
if (!f.read(reinterpret_cast<char*>(&magic), sizeof(magic))) {
return nullptr;
}
f.close();
if (!f) return nullptr;
// Get correct implementation
auto impl = get_implementation(magic, buildVariant);
auto impl = get_implementation(f, buildVariant);
if (!impl) return nullptr;
f.close();
// Get inference constructor
auto constructor = impl->get<LLModel *()>("construct");
if (!constructor) return nullptr;
6 changes: 4 additions & 2 deletions gpt4all-backend/mpt.cpp
@@ -1,6 +1,5 @@
#define MPT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#include "mpt_impl.h"
#include "llama.cpp/ggml.h"

#include "utils.h"

@@ -29,6 +28,7 @@
#include <thread>
#include <unordered_set>
#include <regex>
#include <ggml.h>


namespace {
@@ -1062,7 +1062,9 @@ const char *get_build_variant() {
return GGML_BUILD_VARIANT;
}

bool magic_match(uint32_t magic) {
bool magic_match(std::istream& f) {
uint32_t magic = 0;
f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
return magic == 0x67676d6d;
}
