-
Notifications
You must be signed in to change notification settings - Fork 7.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add llama compatibility with new ggml quantization #642
Changes from all commits
9a3ccb2
863513d
150135c
b4b7bb6
8025c20
4364e9d
dae05f8
85ee11f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,9 @@ | ||
[submodule "llama.cpp"] | ||
path = gpt4all-backend/llama.cpp | ||
[submodule "llama.cpp-230519"] | ||
path = gpt4all-backend/llama.cpp-230519 | ||
url = https://github.com/ggerganov/llama.cpp.git | ||
[submodule "llama.cpp-230511"] | ||
path = gpt4all-backend/llama.cpp-230511 | ||
url = https://github.com/manyoso/llama.cpp.git | ||
[submodule "llama.cpp-mainline"] | ||
path = gpt4all-backend/llama.cpp-mainline | ||
url = https://github.com/ggerganov/llama.cpp.git | ||
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -54,7 +54,9 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS) | |
set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX}) | ||
|
||
# Include GGML | ||
include_ggml(llama.cpp -${BUILD_VARIANT} ON) | ||
include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON) | ||
include_ggml(llama.cpp-230511 -230511-${BUILD_VARIANT} ON) | ||
include_ggml(llama.cpp-230519 -230519-${BUILD_VARIANT} ON) | ||
|
||
# Function for preparing individual implementations | ||
function(prepare_target TARGET_NAME BASE_LIB) | ||
|
@@ -71,18 +73,32 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS) | |
PROPERTY INTERPROCEDURAL_OPTIMIZATION ${IPO_SUPPORTED}) | ||
endfunction() | ||
|
||
# Add each individual implementation | ||
add_library(llamamodel-${BUILD_VARIANT} SHARED | ||
# Add each individual implementations | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nitpick, you don't want the plural here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I noticed that as well, but decided to leave it as is since it's not worth a commit. Will batch this with further things that may come up. |
||
add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED | ||
llamamodel.cpp) | ||
prepare_target(llamamodel llama) | ||
target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE | ||
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. =>= oh man cmake.. you're killing me There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Haha, yup. Looks confusing, is confusing, but does what we need quite flexibly. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That conditional should probably be changed to a slightly less cursed variant: #if LLAMA_VERSION <= 123456
// ...
#elif LLAMA_VERSION >= 654321
// ...
#endif At least then it would be a readily recognizable pattern of tragic stylistic compromise instead of a confusing entirely new way to crush one's hopes and dreams. Would also shrink the cmake side a little. Pardon the gallows humour, can't help it whenever pre-processor macros seem necessary. ;) |
||
prepare_target(llamamodel-mainline llama-mainline) | ||
|
||
add_library(llamamodel-230519-${BUILD_VARIANT} SHARED | ||
llamamodel.cpp) | ||
target_compile_definitions(llamamodel-230519-${BUILD_VARIANT} PRIVATE | ||
LLAMA_VERSIONS===2 LLAMA_DATE=230519) | ||
prepare_target(llamamodel-230519 llama-230519) | ||
|
||
add_library(llamamodel-230511-${BUILD_VARIANT} SHARED | ||
llamamodel.cpp) | ||
target_compile_definitions(llamamodel-230511-${BUILD_VARIANT} PRIVATE | ||
LLAMA_VERSIONS=<=1 LLAMA_DATE=230511) | ||
prepare_target(llamamodel-230511 llama-230511) | ||
|
||
add_library(gptj-${BUILD_VARIANT} SHARED | ||
gptj.cpp) | ||
prepare_target(gptj ggml) | ||
prepare_target(gptj ggml-230511) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wait, where are you tagging the actual ggml with this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. llama.cpp.cmake adds the given suffix to ggml as well. |
||
|
||
add_library(mpt-${BUILD_VARIANT} SHARED | ||
mpt.cpp) | ||
prepare_target(mpt ggml) | ||
prepare_target(mpt ggml-230511) | ||
endforeach() | ||
|
||
add_library(llmodel | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -332,10 +332,16 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA) | |
endif() | ||
|
||
if (WITH_LLAMA) | ||
# Backwards compatibility with old llama.cpp versions | ||
set(LLAMA_UTIL_SOURCE_FILE llama-util.h) | ||
if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE}) | ||
set(LLAMA_UTIL_SOURCE_FILE llama_util.h) | ||
endif() | ||
|
||
add_library(llama${SUFFIX} | ||
${DIRECTORY}/llama.cpp | ||
${DIRECTORY}/llama.h | ||
${DIRECTORY}/llama_util.h) | ||
${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE}) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This branch doesn't actually introduce this file, right? It exists upstream in one of the pinned submodules? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The filename was changed. |
||
|
||
target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY}) | ||
target_compile_features(llama${SUFFIX} PUBLIC cxx_std_11) # don't bump | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,14 +28,23 @@ | |
#include <llama.h> | ||
#include <ggml.h> | ||
|
||
|
||
namespace { | ||
const char *modelType_ = "LLaMA"; | ||
} | ||
|
||
struct gpt_params { | ||
int32_t seed = -1; // RNG seed | ||
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) | ||
int32_t n_keep = 0; // number of tokens to keep from initial prompt | ||
#if LLAMA_DATE <= 230511 | ||
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) | ||
#endif | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The crux of it. We're going to use macros... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Our other option would be to have an extensive collection of almost-identical There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, I think this is the right choice of a bunch of bad choices. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's also CRTP and C++ template magic, but I agree it's not the time to go there yet. |
||
|
||
#if LLAMA_DATE >= 230519 | ||
// sampling parameters | ||
float tfs_z = 1.0f; // 1.0 = disabled | ||
float typical_p = 1.0f; // 1.0 = disabled | ||
#endif | ||
|
||
std::string prompt = ""; | ||
|
||
|
@@ -45,25 +54,45 @@ struct gpt_params { | |
bool use_mlock = false; // use mlock to keep model in memory | ||
}; | ||
|
||
#if LLAMA_DATE >= 230519 | ||
// Samples one token from the context's current logits and returns it. | ||
// Only compiled when LLAMA_DATE >= 230519 (see the enclosing #if); presumably it | ||
// stands in for a same-named function that older llama.cpp revisions provided | ||
// themselves - TODO confirm against the pinned submodules. | ||
// | ||
// ctx                 llama context to read logits / vocabulary size from | ||
// last_n_tokens_data  recent tokens handed to the repetition penalty | ||
// last_n_tokens_size  number of entries in last_n_tokens_data | ||
// top_k, top_p, temp  forwarded to the top-k, top-p and temperature samplers | ||
// repeat_penalty      forwarded to llama_sample_repetition_penalty | ||
static int llama_sample_top_p_top_k( | ||
llama_context *ctx, | ||
const llama_token *last_n_tokens_data, | ||
int last_n_tokens_size, | ||
int top_k, | ||
float top_p, | ||
float temp, | ||
float repeat_penalty) { | ||
auto logits = llama_get_logits(ctx); | ||
auto n_vocab = llama_n_vocab(ctx); | ||
// Populate initial list of all candidates | ||
// (one entry per vocabulary id: {id, logit, 0.0f}) | ||
std::vector<llama_token_data> candidates; | ||
candidates.reserve(n_vocab); | ||
for (int token_id = 0; token_id < n_vocab; token_id++) { | ||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); | ||
} | ||
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false}; | ||
// Sample repeat penalty | ||
// NOTE(review): this is the only sampler call that receives nullptr instead of ctx - | ||
// confirm the pinned llama.cpp accepts a null context here. | ||
llama_sample_repetition_penalty(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty); | ||
// Temperature sampling | ||
// Order: top-k, tail-free, typical, top-p, temperature scaling, then the final draw. | ||
// Tail-free and typical are passed 1.0f, the value gpt_params documents as | ||
// "1.0 = disabled" for tfs_z / typical_p. The trailing 1 on the filter calls is | ||
// presumably a minimum-candidates-to-keep argument - verify against llama.h. | ||
llama_sample_top_k(ctx, &candidates_p, top_k, 1); | ||
llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1); | ||
llama_sample_typical(ctx, &candidates_p, 1.0f, 1); | ||
llama_sample_top_p(ctx, &candidates_p, top_p, 1); | ||
llama_sample_temperature(ctx, &candidates_p, temp); | ||
return llama_sample_token(ctx, &candidates_p); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Going to assume this is giving you sane results? Have you made sure to go through and test models with each of the pinned variants and file formats? Man, we almost want regression or unit tests here... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yup! I did. Man, was my hard drive full... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is also how it's done in the llama.cpp main example. |
||
#endif | ||
|
||
// Private state held behind LLamaModel's d_ptr (allocated in the LLamaModel constructor). | ||
struct LLamaPrivate { | ||
const std::string modelPath; | ||
// NOTE(review): no default initializer here, unlike the members below - | ||
// check that it is assigned before it is first read. | ||
bool modelLoaded; | ||
// Context returned by llama_init_from_file in loadModel; nullptr until then. | ||
llama_context *ctx = nullptr; | ||
// Parameters that loadModel fills in and passes to llama_init_from_file. | ||
llama_context_params params; | ||
int64_t n_threads = 0; | ||
// True until the first prompt has been tokenized: prompt() passes it as the last | ||
// (add_bos) argument of llama_tokenize and then sets it to false. | ||
bool empty = true; | ||
}; | ||
|
||
|
||
// Convenience wrapper around the C-style llama_tokenize: tokenizes `text` with `ctx` | ||
// and returns the tokens as a vector, passing `add_bos` through unchanged. | ||
static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) { | ||
// initialize to prompt number of chars (plus one slot when add_bos is set), since n_tokens <= n_prompt_chars | ||
std::vector<llama_token> res(text.size() + (int)add_bos); | ||
int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos); | ||
// NOTE(review): a negative n is only caught by this assert; with asserts compiled | ||
// out (NDEBUG) it would reach resize() as a huge unsigned size. | ||
assert(n >= 0); | ||
// Shrink the buffer to the number of tokens actually produced. | ||
res.resize(n); | ||
|
||
return res; | ||
} | ||
|
||
LLamaModel::LLamaModel() | ||
: d_ptr(new LLamaPrivate) { | ||
modelType = modelType_; | ||
|
@@ -78,11 +107,13 @@ bool LLamaModel::loadModel(const std::string &modelPath) | |
|
||
gpt_params params; | ||
d_ptr->params.n_ctx = 2048; | ||
d_ptr->params.n_parts = params.n_parts; | ||
d_ptr->params.seed = params.seed; | ||
d_ptr->params.f16_kv = params.memory_f16; | ||
d_ptr->params.use_mmap = params.use_mmap; | ||
d_ptr->params.use_mlock = params.use_mlock; | ||
#if LLAMA_DATE <= 230511 | ||
d_ptr->params.n_parts = params.n_parts; | ||
#endif | ||
|
||
d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params); | ||
if (!d_ptr->ctx) { | ||
|
@@ -126,7 +157,8 @@ size_t LLamaModel::saveState(uint8_t *dest) const | |
|
||
size_t LLamaModel::restoreState(const uint8_t *src) | ||
{ | ||
return llama_set_state_data(d_ptr->ctx, src); | ||
// const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540 | ||
return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src)); | ||
} | ||
|
||
void LLamaModel::prompt(const std::string &prompt, | ||
|
@@ -147,7 +179,11 @@ void LLamaModel::prompt(const std::string &prompt, | |
params.prompt.insert(0, 1, ' '); | ||
|
||
// tokenize the prompt | ||
auto embd_inp = ::llama_tokenize(d_ptr->ctx, params.prompt, false); | ||
std::vector<llama_token> embd_inp(params.prompt.size() + 4); | ||
int n = llama_tokenize(d_ptr->ctx, params.prompt.c_str(), embd_inp.data(), embd_inp.size(), d_ptr->empty); | ||
assert(n >= 0); | ||
embd_inp.resize(n); | ||
d_ptr->empty = false; | ||
niansa marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
// save the context size | ||
promptCtx.n_ctx = llama_n_ctx(d_ptr->ctx); | ||
|
@@ -313,8 +349,15 @@ const char *get_build_variant() { | |
return GGML_BUILD_VARIANT; | ||
} | ||
|
||
bool magic_match(uint32_t magic) { | ||
return magic == 0x67676a74; | ||
// Returns true when the stream starts with the expected magic number followed by a | ||
// file-format version that this build variant accepts. Reads two 32-bit values from | ||
// `f`, advancing the stream position. | ||
bool magic_match(std::istream& f) { | ||
// Check magic | ||
uint32_t magic = 0; | ||
f.read(reinterpret_cast<char*>(&magic), sizeof(magic)); | ||
if (magic != 0x67676a74) return false; | ||
// Check version | ||
uint32_t version = 0; | ||
f.read(reinterpret_cast<char*>(&version), sizeof(version)); | ||
// LLAMA_VERSIONS is a per-target compile definition holding an operator plus a | ||
// number (the CMake hunk sets it to ">=3", "==2" or "<=1"), so this line expands | ||
// to e.g. `version >=3`. | ||
// NOTE(review): neither read() is checked. If the version read fails, `version` | ||
// stays 0, which still satisfies the "<=1" variant - confirm that is acceptable. | ||
return version LLAMA_VERSIONS; | ||
} | ||
|
||
LLModel *construct() { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, ok, I get ya, but this isn't actually pinning them. Also, I think I still want all of them to use the 'manyoso' fork as this gives us further control, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure what you mean, the manyoso fork hasn't been updated to the latest llama.cpp; it's 132 commits behind...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also that fork only adds alibi, which is only needed for MPT
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I mean we should update that fork, and point to it I believe. lemme do that now.