From 14804b797846a3f8054e20ead853dc32c2ebe181 Mon Sep 17 00:00:00 2001
From: chrfalch
Date: Sat, 1 Apr 2023 17:39:17 +0200
Subject: [PATCH 1/6] Added api for retrieving and setting the kv cache

The API provides access methods for retrieving the current memory buffer
for the kv_cache and its token number. It also contains a method for
setting the kv_cache from a memory buffer.

This makes it possible to load/save history - maybe support --cache-prompt
parameter as well?
---
 llama.cpp | 27 +++++++++++++++++++++++++++
 llama.h   | 17 +++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index bed24207db776..73ad59ca53e02 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1668,6 +1668,33 @@ int llama_model_quantize(
     return 0;
 }
 
+// Returns the KV cache that will contain the context for the
+// ongoing prediction with the model.
+uint8_t* llama_get_kv_cache(struct llama_context * ctx) {
+        return ctx->model.kv_self.buf.data();
+}
+
+// Returns the size of the KV cache
+size_t llama_get_kv_cache_size(struct llama_context * ctx) {
+        return ctx->model.kv_self.buf.size();
+}
+
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+        return ctx->model.kv_self.n;
+}
+
+// Sets the KV cache containing the current context for the model
+void llama_set_kv_cache(
+        struct llama_context * ctx,
+        uint8_t * kv_cache,
+        size_t n_size,
+        int n_token_count) {
+        // Make sure we have the same kv cache setup
+        LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+        memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+        ctx->model.kv_self.n = n_token_count;
+}
+
 int llama_eval(
         struct llama_context * ctx,
            const llama_token * tokens,

diff --git a/llama.h b/llama.h
index 258de5a944976..5a6260d57e005 100644
--- a/llama.h
+++ b/llama.h
@@ -83,6 +83,23 @@ extern "C" {
             const char * fname_out,
                    int itype);
 
+    // Returns the KV cache that will contain the context for the
+    // ongoing prediction with the model.
+    LLAMA_API uint8_t* llama_get_kv_cache(struct llama_context * ctx);
+
+    // Returns the size of the KV cache
+    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+
+    // Sets the KV cache containing the current context for the model
+    LLAMA_API void llama_set_kv_cache(
+            struct llama_context * ctx,
+            uint8_t * kv_cache,
+            size_t n_size,
+            int n_token_count);
+
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls

From a0c895c087bc2636e6972c13d76a52438f3de99a Mon Sep 17 00:00:00 2001
From: Christian Falch <875252+chrfalch@users.noreply.github.com>
Date: Sat, 1 Apr 2023 18:46:14 +0200
Subject: [PATCH 2/6] Update llama.cpp

Add review comments

Co-authored-by: Pavol Rusnak
---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 73ad59ca53e02..f6f4dda66146a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1670,7 +1670,7 @@ int llama_model_quantize(
 
 // Returns the KV cache that will contain the context for the
 // ongoing prediction with the model.
-uint8_t* llama_get_kv_cache(struct llama_context * ctx) {
+const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
         return ctx->model.kv_self.buf.data();
 }
 

From f411251bcf155884e6d791a525fc87a7e04518bd Mon Sep 17 00:00:00 2001
From: Christian Falch <875252+chrfalch@users.noreply.github.com>
Date: Sat, 1 Apr 2023 18:46:24 +0200
Subject: [PATCH 3/6] Update llama.cpp

Added review comments

Co-authored-by: Pavol Rusnak
---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index f6f4dda66146a..cf413d9834128 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1686,7 +1686,7 @@ int llama_get_kv_cache_token_count(struct llama_context * ctx) {
 // Sets the KV cache containing the current context for the model
 void llama_set_kv_cache(
         struct llama_context * ctx,
-        uint8_t * kv_cache,
+        const uint8_t * kv_cache,
         size_t n_size,
         int n_token_count) {
         // Make sure we have the same kv cache setup

From 17f463a083d3803a473fda82f8b339e96e2f698d Mon Sep 17 00:00:00 2001
From: Christian Falch <875252+chrfalch@users.noreply.github.com>
Date: Sat, 1 Apr 2023 18:46:37 +0200
Subject: [PATCH 4/6] Update llama.h

Added review comments

Co-authored-by: Pavol Rusnak
---
 llama.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.h b/llama.h
index 5a6260d57e005..da8f7f600fe18 100644
--- a/llama.h
+++ b/llama.h
@@ -85,7 +85,7 @@ extern "C" {
 
     // Returns the KV cache that will contain the context for the
     // ongoing prediction with the model.
-    LLAMA_API uint8_t* llama_get_kv_cache(struct llama_context * ctx);
+    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
 
     // Returns the size of the KV cache
     LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);

From a463fb7668af6c8199de6fb9f6d73e3d675f6672 Mon Sep 17 00:00:00 2001
From: Christian Falch <875252+chrfalch@users.noreply.github.com>
Date: Sat, 1 Apr 2023 18:46:47 +0200
Subject: [PATCH 5/6] Update llama.h

Review Comments

Co-authored-by: Pavol Rusnak
---
 llama.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.h b/llama.h
index da8f7f600fe18..b4769ed200f52 100644
--- a/llama.h
+++ b/llama.h
@@ -96,7 +96,7 @@ extern "C" {
     // Sets the KV cache containing the current context for the model
     LLAMA_API void llama_set_kv_cache(
             struct llama_context * ctx,
-            uint8_t * kv_cache,
+            const uint8_t * kv_cache,
             size_t n_size,
             int n_token_count);
 

From 4912f9d5f091f9b5e20d17a34f1f1f15186db0d0 Mon Sep 17 00:00:00 2001
From: Pavol Rusnak
Date: Sun, 2 Apr 2023 12:18:54 +0200
Subject: [PATCH 6/6] fix whitespace

---
 llama.cpp | 20 ++++++++++----------
 llama.h   |  6 +++---
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index cf413d9834128..ffa2b6e8f458b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1671,28 +1671,28 @@ int llama_model_quantize(
 // Returns the KV cache that will contain the context for the
 // ongoing prediction with the model.
 const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
-        return ctx->model.kv_self.buf.data();
+    return ctx->model.kv_self.buf.data();
 }
 
 // Returns the size of the KV cache
 size_t llama_get_kv_cache_size(struct llama_context * ctx) {
-        return ctx->model.kv_self.buf.size();
+    return ctx->model.kv_self.buf.size();
 }
 
 int llama_get_kv_cache_token_count(struct llama_context * ctx) {
-        return ctx->model.kv_self.n;
+    return ctx->model.kv_self.n;
 }
 
 // Sets the KV cache containing the current context for the model
 void llama_set_kv_cache(
         struct llama_context * ctx,
-        const uint8_t * kv_cache,
-        size_t n_size,
-        int n_token_count) {
-        // Make sure we have the same kv cache setup
-        LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
-        memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
-        ctx->model.kv_self.n = n_token_count;
+              const uint8_t * kv_cache,
+                       size_t n_size,
+                          int n_token_count) {
+    // Make sure we have the same kv cache setup
+    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+    memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+    ctx->model.kv_self.n = n_token_count;
 }
 
 int llama_eval(

diff --git a/llama.h b/llama.h
index b4769ed200f52..04e2bf71cd9c0 100644
--- a/llama.h
+++ b/llama.h
@@ -96,9 +96,9 @@ extern "C" {
     // Sets the KV cache containing the current context for the model
     LLAMA_API void llama_set_kv_cache(
             struct llama_context * ctx,
-            const uint8_t * kv_cache,
-            size_t n_size,
-            int n_token_count);
+                  const uint8_t * kv_cache,
+                           size_t n_size,
+                              int n_token_count);
 
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
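
Taken together, the six patches expose a simple save/restore surface for the KV cache: read the buffer pointer, its byte size, and the token count, then later copy a snapshot of the same size back in. The sketch below is an illustrative usage example, not part of the patches; the helper names save_prompt_state and restore_prompt_state are hypothetical, and it assumes the restore target is a llama_context created with the same parameters as the source, since llama_set_kv_cache asserts that the buffer sizes match.

// Illustrative sketch only (not from the patches): snapshot the KV cache
// after evaluating a prompt and restore it later to skip re-evaluation.
#include <vector>
#include "llama.h"

static std::vector<uint8_t> saved_cache;
static int saved_n_tokens = 0;

// Hypothetical helper: call right after llama_eval() has processed the prompt.
static void save_prompt_state(struct llama_context * ctx) {
    const uint8_t * cache = llama_get_kv_cache(ctx);
    const size_t size = llama_get_kv_cache_size(ctx);
    saved_cache.assign(cache, cache + size);
    saved_n_tokens = llama_get_kv_cache_token_count(ctx);
}

// Hypothetical helper: restore into a context created with the same
// parameters; llama_set_kv_cache asserts that the sizes are equal.
static void restore_prompt_state(struct llama_context * ctx) {
    llama_set_kv_cache(ctx, saved_cache.data(), saved_cache.size(), saved_n_tokens);
}

After restoring, the caller would pass saved_n_tokens as n_past to the next llama_eval call, so generation continues from the saved context rather than re-processing the prompt.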