From 14804b797846a3f8054e20ead853dc32c2ebe181 Mon Sep 17 00:00:00 2001
From: chrfalch
Date: Sat, 1 Apr 2023 17:39:17 +0200
Subject: [PATCH 1/6] Added api for retrieving and setting the kv cache

The API provides access methods for retrieving the current memory buffer
for the kv_cache and its token number. It also contains a method for
setting the kv_cache from a memory buffer.

This makes it possible to load/save history - maybe support --cache-prompt
parameter as well?
---
 llama.cpp | 27 +++++++++++++++++++++++++++
 llama.h   | 17 +++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index bed24207db776..73ad59ca53e02 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1668,6 +1668,33 @@ int llama_model_quantize(
     return 0;
 }
 
+// Returns the KV cache that will contain the context for the
+// ongoing prediction with the model.
+uint8_t* llama_get_kv_cache(struct llama_context * ctx) {
+        return ctx->model.kv_self.buf.data();
+}
+
+// Returns the size of the KV cache
+size_t llama_get_kv_cache_size(struct llama_context * ctx) {
+        return ctx->model.kv_self.buf.size();
+}
+
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+        return ctx->model.kv_self.n;
+}
+
+// Sets the KV cache containing the current context for the model
+void llama_set_kv_cache(
+        struct llama_context * ctx,
+        uint8_t * kv_cache,
+        size_t n_size,
+        int n_token_count) {
+        // Make sure we have the same kv cache setup
+        LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+        memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+        ctx->model.kv_self.n = n_token_count;
+}
+
 int llama_eval(
         struct llama_context * ctx,
            const llama_token * tokens,

diff --git a/llama.h b/llama.h
index 258de5a944976..5a6260d57e005 100644
--- a/llama.h
+++ b/llama.h
@@ -83,6 +83,23 @@ extern "C" {
             const char * fname_out,
                    int itype);
 
+    // Returns the KV cache that will contain the context for the
+    // ongoing prediction with the model.
+    LLAMA_API uint8_t* llama_get_kv_cache(struct llama_context * ctx);
+
+    // Returns the size of the KV cache
+    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+
+    // Sets the KV cache containing the current context for the model
+    LLAMA_API void llama_set_kv_cache(
+            struct llama_context * ctx,
+            uint8_t * kv_cache,
+            size_t n_size,
+            int n_token_count);
+
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls

From a0c895c087bc2636e6972c13d76a52438f3de99a Mon Sep 17 00:00:00 2001
From: Christian Falch <875252+chrfalch@users.noreply.github.com>
Date: Sat, 1 Apr 2023 18:46:14 +0200
Subject: [PATCH 2/6] Update llama.cpp

Add review comments

Co-authored-by: Pavol Rusnak
---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 73ad59ca53e02..f6f4dda66146a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1670,7 +1670,7 @@ int llama_model_quantize(
 
 // Returns the KV cache that will contain the context for the
 // ongoing prediction with the model.
-uint8_t* llama_get_kv_cache(struct llama_context * ctx) {
+const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
         return ctx->model.kv_self.buf.data();
 }
 

From f411251bcf155884e6d791a525fc87a7e04518bd Mon Sep 17 00:00:00 2001
From: Christian Falch <875252+chrfalch@users.noreply.github.com>
Date: Sat, 1 Apr 2023 18:46:24 +0200
Subject: [PATCH 3/6] Update llama.cpp

Added review comments

Co-authored-by: Pavol Rusnak
---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index f6f4dda66146a..cf413d9834128 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1686,7 +1686,7 @@ int llama_get_kv_cache_token_count(struct llama_context * ctx) {
 // Sets the KV cache containing the current context for the model
 void llama_set_kv_cache(
         struct llama_context * ctx,
-        uint8_t * kv_cache,
+        const uint8_t * kv_cache,
         size_t n_size,
         int n_token_count) {
         // Make sure we have the same kv cache setup

From 17f463a083d3803a473fda82f8b339e96e2f698d Mon Sep 17 00:00:00 2001
From: Christian Falch <875252+chrfalch@users.noreply.github.com>
Date: Sat, 1 Apr 2023 18:46:37 +0200
Subject: [PATCH 4/6] Update llama.h

Added review comments

Co-authored-by: Pavol Rusnak
---
 llama.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.h b/llama.h
index 5a6260d57e005..da8f7f600fe18 100644
--- a/llama.h
+++ b/llama.h
@@ -85,7 +85,7 @@ extern "C" {
 
     // Returns the KV cache that will contain the context for the
     // ongoing prediction with the model.
-    LLAMA_API uint8_t* llama_get_kv_cache(struct llama_context * ctx);
+    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
 
     // Returns the size of the KV cache
     LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);

From a463fb7668af6c8199de6fb9f6d73e3d675f6672 Mon Sep 17 00:00:00 2001
From: Christian Falch <875252+chrfalch@users.noreply.github.com>
Date: Sat, 1 Apr 2023 18:46:47 +0200
Subject: [PATCH 5/6] Update llama.h

Review Comments

Co-authored-by: Pavol Rusnak
---
 llama.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.h b/llama.h
index da8f7f600fe18..b4769ed200f52 100644
--- a/llama.h
+++ b/llama.h
@@ -96,7 +96,7 @@ extern "C" {
     // Sets the KV cache containing the current context for the model
     LLAMA_API void llama_set_kv_cache(
             struct llama_context * ctx,
-            uint8_t * kv_cache,
+            const uint8_t * kv_cache,
             size_t n_size,
             int n_token_count);
 

From 4912f9d5f091f9b5e20d17a34f1f1f15186db0d0 Mon Sep 17 00:00:00 2001
From: Pavol Rusnak
Date: Sun, 2 Apr 2023 12:18:54 +0200
Subject: [PATCH 6/6] fix whitespace

---
 llama.cpp | 20 ++++++++++----------
 llama.h   |  6 +++---
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index cf413d9834128..ffa2b6e8f458b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1671,28 +1671,28 @@ int llama_model_quantize(
 // Returns the KV cache that will contain the context for the
 // ongoing prediction with the model.
 const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
-        return ctx->model.kv_self.buf.data();
+    return ctx->model.kv_self.buf.data();
 }
 
 // Returns the size of the KV cache
 size_t llama_get_kv_cache_size(struct llama_context * ctx) {
-        return ctx->model.kv_self.buf.size();
+    return ctx->model.kv_self.buf.size();
 }
 
 int llama_get_kv_cache_token_count(struct llama_context * ctx) {
-        return ctx->model.kv_self.n;
+    return ctx->model.kv_self.n;
 }
 
 // Sets the KV cache containing the current context for the model
 void llama_set_kv_cache(
         struct llama_context * ctx,
-        const uint8_t * kv_cache,
-        size_t n_size,
-        int n_token_count) {
-        // Make sure we have the same kv cache setup
-        LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
-        memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
-        ctx->model.kv_self.n = n_token_count;
+              const uint8_t * kv_cache,
+                       size_t n_size,
+                          int n_token_count) {
+    // Make sure we have the same kv cache setup
+    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+    memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+    ctx->model.kv_self.n = n_token_count;
 }
 
 int llama_eval(

diff --git a/llama.h b/llama.h
index b4769ed200f52..04e2bf71cd9c0 100644
--- a/llama.h
+++ b/llama.h
@@ -96,9 +96,9 @@ extern "C" {
     // Sets the KV cache containing the current context for the model
     LLAMA_API void llama_set_kv_cache(
             struct llama_context * ctx,
-            const uint8_t * kv_cache,
-            size_t n_size,
-            int n_token_count);
+                  const uint8_t * kv_cache,
+                           size_t n_size,
+                              int n_token_count);
 
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
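
Taken together, the six patches expose a simple save/restore surface for the KV cache: read the buffer pointer, its byte size, and the token count, then later copy a snapshot of the same size back in. The sketch below is an illustrative usage example, not part of the patches; the helper names save_prompt_state and restore_prompt_state are hypothetical, and it assumes the restore target is a llama_context created with the same parameters as the source, since llama_set_kv_cache asserts that the buffer sizes match.

// Illustrative sketch only (not from the patches): snapshot the KV cache
// after evaluating a prompt and restore it later to skip re-evaluation.
#include <vector>
#include "llama.h"

static std::vector<uint8_t> saved_cache;
static int saved_n_tokens = 0;

// Hypothetical helper: call right after llama_eval() has processed the prompt.
static void save_prompt_state(struct llama_context * ctx) {
    const uint8_t * cache = llama_get_kv_cache(ctx);
    const size_t size = llama_get_kv_cache_size(ctx);
    saved_cache.assign(cache, cache + size);
    saved_n_tokens = llama_get_kv_cache_token_count(ctx);
}

// Hypothetical helper: restore into a context created with the same
// parameters; llama_set_kv_cache asserts that the sizes are equal.
static void restore_prompt_state(struct llama_context * ctx) {
    llama_set_kv_cache(ctx, saved_cache.data(), saved_cache.size(), saved_n_tokens);
}

After restoring, the caller would pass saved_n_tokens as n_past to the next llama_eval call, so generation continues from the saved context rather than re-processing the prompt.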