From 71fcb7e27e28296bcd614b1802c8eaa1ac6f947e Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 23 Nov 2023 03:30:08 -0700 Subject: [PATCH 1/6] Allow exporting a view of the KV cache --- common/common.cpp | 22 +++++++++++ common/common.h | 6 +++ examples/parallel/parallel.cpp | 5 +++ llama.cpp | 67 ++++++++++++++++++++++++++++++++++ llama.h | 23 ++++++++++++ 5 files changed, 123 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index eec704b99f888..b40a74cf45a3c 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1386,3 +1386,25 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); } + +// +// KV cache utils +// + +void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { + printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d\n", + view.n_cells, view.n_max_seq, view.used_cells, view.token_count); + llama_kv_cache_view_cell * c_curr = view.cells; + struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences; + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { + if (i % row_size == 0) { + printf("\n%5d: ", i); + } + int seq_count = 0; + for (int j = 0; j < view.n_max_seq; j++) { + if (cs_curr[j].seq_id >= 0) { seq_count++; } + } + putchar(int('0' + (std::min(9, seq_count)))); + } + printf("\n=== Done dumping\n"); +} diff --git a/common/common.h b/common/common.h index 88fa13fc067c2..58a153203f7a4 100644 --- a/common/common.h +++ b/common/common.h @@ -218,3 +218,9 @@ std::string get_sortable_timestamp(); void dump_non_result_info_yaml( FILE * stream, const gpt_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); + +// +// KV cache utils +// + +void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index a78df305f415c..439e6d0a70fe4 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -172,6 +172,8 @@ int main(int argc, char ** argv) { int32_t n_total_gen = 0; int32_t n_cache_miss = 0; + struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_seq); + const auto t_main_start = ggml_time_us(); LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__); @@ -201,6 +203,9 @@ int main(int argc, char ** argv) { LOG_TEE("Processing requests ...\n\n"); while (true) { + llama_kv_cache_view_update(ctx, &kvc_view); + dump_kv_cache_view(kvc_view); + llama_batch_clear(batch); // decode any currently ongoing sequences diff --git a/llama.cpp b/llama.cpp index 5679c7050d80b..e23d820ea2e20 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8805,6 +8805,73 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha } } +struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) { + struct llama_kv_cache_view result = { + /*.n_cells*/ 0, + /*.n_max_seq*/ n_max_seq, + /*.token_count*/ 0, + /*.used_cells*/ llama_get_kv_cache_used_cells(ctx), + /*.cells*/ nullptr, + /*.cells_sequences*/ nullptr, + }; + return result; +} + +void llama_kv_cache_view_free(struct llama_kv_cache_view * view) { + if (view->cells != nullptr) { + free(view->cells); + view->cells = nullptr; + } + if 
(view->cells_sequences != nullptr) {
+        free(view->cells_sequences);
+        view->cells_sequences = nullptr;
+    }
+}
+
+void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
+    if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
+        view->n_cells = int32_t(ctx->kv_self.size);
+        void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+        view->cells = (struct llama_kv_cache_view_cell *)p;
+        p = realloc(view->cells_sequences, sizeof(struct llama_kv_cache_view_cell_sequence) * view->n_max_seq * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+        view->cells_sequences = (struct llama_kv_cache_view_cell_sequence *)p;
+    }
+
+    const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
+    llama_kv_cache_view_cell * c_curr = view->cells;
+    struct llama_kv_cache_view_cell_sequence * cs_curr = view->cells_sequences;
+    int32_t used_cells = 0;
+    int32_t token_count = 0;
+
+    for (uint32_t i = 0; i < ctx->kv_self.size; i++, c_curr++, cs_curr += view->n_max_seq) {
+        token_count += ctx->kv_self.cells[i].seq_id.size();
+        c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+        int seq_idx = 0;
+        for (const llama_seq_id it : kv_cells[i].seq_id) {
+            if (seq_idx >= view->n_max_seq) {
+                break;
+            }
+            cs_curr[seq_idx].seq_id = it;
+            seq_idx++;
+        }
+        if (seq_idx != 0) {
+            used_cells++;
+        }
+        for (; seq_idx < view->n_max_seq; seq_idx++) {
+            cs_curr[seq_idx].seq_id = -1;
+        }
+    }
+    view->token_count = token_count;
+    view->used_cells = used_cells;
+    if (uint32_t(used_cells) != ctx->kv_self.used) {
+        LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+            __func__, ctx->kv_self.used, used_cells);
+    }
+}
+
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     int result = 0;
diff --git a/llama.h b/llama.h
index 06b982ee022d6..72a156ac5d940 100644
--- a/llama.h
+++ b/llama.h
@@ -361,6 +361,29 @@ extern "C" {
     // KV cache
     //

+    struct llama_kv_cache_view_cell {
+        llama_pos pos;
+    };
+
+    struct llama_kv_cache_view_cell_sequence {
+        llama_seq_id seq_id;
+    };
+
+    struct llama_kv_cache_view {
+        int32_t n_cells;
+        int32_t n_max_seq;
+        int32_t token_count;
+        int32_t used_cells;
+        struct llama_kv_cache_view_cell *cells;
+        struct llama_kv_cache_view_cell_sequence * cells_sequences;
+    };
+
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);

From cb137d8bfc785f10f66188eb958c507c02e3ba9e Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Thu, 23 Nov 2023 04:11:00 -0700
Subject: [PATCH 2/6] Allow dumping the sequences per cell in common

---
 common/common.cpp              | 42 ++++++++++++++++++++++++++++++
 common/common.h                |  1 +
 examples/parallel/parallel.cpp |  4 ++--
 3 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index b40a74cf45a3c..e9c338028f961 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include <unordered_map>
 #include
 #include
 #include
@@ -1408,3 +1409,44 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
     }
     printf("\n=== Done dumping\n");
 }
+
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
+    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d\n",
+        view.n_cells, view.n_max_seq, view.used_cells, view.token_count);
+
+    std::unordered_map<llama_seq_id, size_t> seqs;
+    llama_kv_cache_view_cell * c_curr = view.cells;
+    struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences;
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+        for (int j = 0; j < view.n_max_seq; j++) {
+            if (cs_curr[j].seq_id < 0) { continue; }
+            if (seqs.find(cs_curr[j].seq_id) == seqs.end()) {
+                seqs[cs_curr[j].seq_id] = seqs.size();
+                if (seqs.size() >= 10) { break; }
+            }
+        }
+        if (seqs.size() >= 10) { break; }
+    }
+    printf("=== Sequence legend: ");
+    for (const auto & it : seqs) {
+        printf("%zu=%d, ", it.second, it.first);
+    }
+
+    c_curr = view.cells;
+    cs_curr = view.cells_sequences;
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+        if (i % row_size == 0) {
+            printf("\n%5d: ", i);
+        }
+        for (int j = 0; j < view.n_max_seq; j++) {
+            if (cs_curr[j].seq_id >= 0) {
+                const auto & it = seqs.find(cs_curr[j].seq_id);
+                putchar(it != seqs.end() ? int('0' + it->second) : '+');
+            } else {
+                putchar('.');
+            }
+        }
+        putchar(' ');
+    }
+    printf("\n=== Done dumping\n");
+}
diff --git a/common/common.h b/common/common.h
index 58a153203f7a4..45bd0e43dff89 100644
--- a/common/common.h
+++ b/common/common.h
@@ -224,3 +224,4 @@ void dump_non_result_info_yaml(
 //

 void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 80);
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 439e6d0a70fe4..8cc20b422d0ce 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -172,7 +172,7 @@ int main(int argc, char ** argv) {
     int32_t n_total_gen  = 0;
     int32_t n_cache_miss = 0;

-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_seq);
+    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);

     const auto t_main_start = ggml_time_us();

@@ -204,7 +204,7 @@ int main(int argc, char ** argv) {

     while (true) {
         llama_kv_cache_view_update(ctx, &kvc_view);
-        dump_kv_cache_view(kvc_view);
+        dump_kv_cache_view_seqs(kvc_view, 40);

         llama_batch_clear(batch);

From 22d0485a7a65573c5d566efc4d38e9e95efbe8df Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Thu, 23 Nov 2023 05:36:41 -0700
Subject: [PATCH 3/6] Track max contiguous cells value and position as well

---
 common/common.cpp | 10 +++++-----
 llama.cpp         | 38 ++++++++++++++++++++++++++++++--------
 llama.h           |  5 ++++-
 3 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index e9c338028f961..1704b08a2826a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1393,8 +1393,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 //

 void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d\n",
-        view.n_cells, view.n_max_seq, view.used_cells, view.token_count);
+    printf("=== Dumping KV cache. 
total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, max contiguous cells=%d @ %d\n", + view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx); llama_kv_cache_view_cell * c_curr = view.cells; struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences; for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { @@ -1405,14 +1405,14 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { for (int j = 0; j < view.n_max_seq; j++) { if (cs_curr[j].seq_id >= 0) { seq_count++; } } - putchar(int('0' + (std::min(9, seq_count)))); + putchar(seq_count == 0 ? '.' : ('0' + (std::min(9, seq_count)))); } printf("\n=== Done dumping\n"); } void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { - printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d\n", - view.n_cells, view.n_max_seq, view.used_cells, view.token_count); + printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, max contiguous cells=%d @ %d\n", + view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx); std::unordered_map seqs; llama_kv_cache_view_cell * c_curr = view.cells; diff --git a/llama.cpp b/llama.cpp index e23d820ea2e20..c564d636dd694 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8807,12 +8807,14 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) { struct llama_kv_cache_view result = { - /*.n_cells*/ 0, - /*.n_max_seq*/ n_max_seq, - /*.token_count*/ 0, - /*.used_cells*/ llama_get_kv_cache_used_cells(ctx), - /*.cells*/ nullptr, - /*.cells_sequences*/ nullptr, + /*.n_cells*/ 0, + /*.n_max_seq*/ n_max_seq, + /*.token_count*/ 0, + /*.used_cells*/ llama_get_kv_cache_used_cells(ctx), + /*max_contiguous*/ 0, + /*max_contiguous_idx*/ -1, + /*.cells*/ nullptr, + /*.cells_sequences*/ nullptr, }; return result; } @@ -8844,11 +8846,25 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k struct llama_kv_cache_view_cell_sequence * cs_curr = view->cells_sequences; int32_t used_cells = 0; int32_t token_count = 0; + int32_t curr_contig_idx = -1; + uint32_t max_contig = 0; + int32_t max_contig_idx = -1; - for (uint32_t i = 0; i < ctx->kv_self.size; i++, c_curr++, cs_curr += view->n_max_seq) { - token_count += ctx->kv_self.cells[i].seq_id.size(); + for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) { + const size_t curr_size = kv_cells[i].seq_id.size(); + token_count += curr_size; c_curr->pos = kv_cells[i].pos + kv_cells[i].delta; + if (curr_size > 0) { + if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) { + max_contig_idx = i; + max_contig = i - curr_contig_idx; + } + curr_contig_idx = -1; + } else if (curr_contig_idx < 0) { + curr_contig_idx = i; + } + int seq_idx = 0; for (const llama_seq_id it : kv_cells[i].seq_id) { if (seq_idx >= view->n_max_seq) { @@ -8864,6 +8880,12 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k cs_curr[seq_idx].seq_id = -1; } } + if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) { + max_contig_idx = curr_contig_idx; + max_contig = kv_cells.size() - curr_contig_idx; + } 
+ view->max_contiguous_cells = max_contig; + view->max_contiguous_cells_idx = max_contig_idx; view->token_count = token_count; view->used_cells = used_cells; if (uint32_t(used_cells) != ctx->kv_self.used) { diff --git a/llama.h b/llama.h index 72a156ac5d940..3d3b38c12b69f 100644 --- a/llama.h +++ b/llama.h @@ -366,6 +366,7 @@ extern "C" { }; struct llama_kv_cache_view_cell_sequence { + // Would like to have token_id here as well. llama_seq_id seq_id; }; @@ -374,7 +375,9 @@ extern "C" { int32_t n_max_seq; int32_t token_count; int32_t used_cells; - struct llama_kv_cache_view_cell *cells; + int32_t max_contiguous_cells; + int32_t max_contiguous_cells_idx; + struct llama_kv_cache_view_cell * cells; struct llama_kv_cache_view_cell_sequence * cells_sequences; }; From 7688d7204ff13648e2c7555c05cd4bae93995a3c Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 23 Nov 2023 07:21:19 -0700 Subject: [PATCH 4/6] Fix max contiguous empty cells index calculation Make dump functions deal with lengths or sequences counts > 10 better --- common/common.cpp | 16 ++++++++++------ llama.cpp | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 1704b08a2826a..c9b13db27b9a1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1393,7 +1393,9 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l // void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { - printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, max contiguous cells=%d @ %d\n", + static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; + + printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx); llama_kv_cache_view_cell * c_curr = view.cells; struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences; @@ -1405,13 +1407,14 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { for (int j = 0; j < view.n_max_seq; j++) { if (cs_curr[j].seq_id >= 0) { seq_count++; } } - putchar(seq_count == 0 ? '.' : ('0' + (std::min(9, seq_count)))); + putchar(slot_chars[std::min(sizeof(slot_chars) - 1, size_t(seq_count))]); } printf("\n=== Done dumping\n"); } void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { - printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, max contiguous cells=%d @ %d\n", + static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + printf("=== Dumping KV cache. 
total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx); std::unordered_map seqs; @@ -1421,16 +1424,17 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { for (int j = 0; j < view.n_max_seq; j++) { if (cs_curr[j].seq_id < 0) { continue; } if (seqs.find(cs_curr[j].seq_id) == seqs.end()) { + if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } seqs[cs_curr[j].seq_id] = seqs.size(); - if (seqs.size() >= 10) { break; } } } - if (seqs.size() >= 10) { break; } + if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } } printf("=== Sequence legend: "); for (const auto & it : seqs) { printf("%zu=%d, ", it.second, it.first); } + printf("'+'=other sequence ids"); c_curr = view.cells; cs_curr = view.cells_sequences; @@ -1441,7 +1445,7 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { for (int j = 0; j < view.n_max_seq; j++) { if (cs_curr[j].seq_id >= 0) { const auto & it = seqs.find(cs_curr[j].seq_id); - putchar(it != seqs.end() ? int('0' + it->second) : '+'); + putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+'); } else { putchar('.'); } diff --git a/llama.cpp b/llama.cpp index c564d636dd694..5d1de9eece6cb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8857,8 +8857,8 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k if (curr_size > 0) { if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) { - max_contig_idx = i; max_contig = i - curr_contig_idx; + max_contig_idx = curr_contig_idx; } curr_contig_idx = -1; } else if (curr_contig_idx < 0) { From bc1c346ae8afd90db846377dff59e4ed603473ad Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 23 Nov 2023 07:44:10 -0700 Subject: [PATCH 5/6] Fix off by one error in dump_kv_cache_view --- common/common.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index c9b13db27b9a1..77f61dbf9f172 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1397,8 +1397,10 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx); + llama_kv_cache_view_cell * c_curr = view.cells; struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences; + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { if (i % row_size == 0) { printf("\n%5d: ", i); @@ -1407,19 +1409,22 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { for (int j = 0; j < view.n_max_seq; j++) { if (cs_curr[j].seq_id >= 0) { seq_count++; } } - putchar(slot_chars[std::min(sizeof(slot_chars) - 1, size_t(seq_count))]); + putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]); } + printf("\n=== Done dumping\n"); } void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + printf("=== Dumping KV cache. 
total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx); std::unordered_map seqs; llama_kv_cache_view_cell * c_curr = view.cells; struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences; + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { for (int j = 0; j < view.n_max_seq; j++) { if (cs_curr[j].seq_id < 0) { continue; } @@ -1430,6 +1435,7 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { } if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } } + printf("=== Sequence legend: "); for (const auto & it : seqs) { printf("%zu=%d, ", it.second, it.first); @@ -1452,5 +1458,6 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { } putchar(' '); } + printf("\n=== Done dumping\n"); } From aa21e6dbc2295908884339f3ab91a773f027c185 Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 23 Nov 2023 08:06:51 -0700 Subject: [PATCH 6/6] Add doc comments for KV cache view functions Eliminate cell sequence struct; use llama_seq_id directly Minor cleanups --- common/common.cpp | 20 ++++++++++---------- common/common.h | 5 ++++- llama.cpp | 30 +++++++++++++++--------------- llama.h | 38 ++++++++++++++++++++++++++++++-------- 4 files changed, 59 insertions(+), 34 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 77f61dbf9f172..636ed35627f4a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1396,10 +1396,10 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", - view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx); + view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); llama_kv_cache_view_cell * c_curr = view.cells; - struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences; + llama_seq_id * cs_curr = view.cells_sequences; for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { if (i % row_size == 0) { @@ -1407,7 +1407,7 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { } int seq_count = 0; for (int j = 0; j < view.n_max_seq; j++) { - if (cs_curr[j].seq_id >= 0) { seq_count++; } + if (cs_curr[j] >= 0) { seq_count++; } } putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]); } @@ -1419,18 +1419,18 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; printf("=== Dumping KV cache. 
total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", - view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx); + view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); std::unordered_map seqs; llama_kv_cache_view_cell * c_curr = view.cells; - struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences; + llama_seq_id * cs_curr = view.cells_sequences; for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { for (int j = 0; j < view.n_max_seq; j++) { - if (cs_curr[j].seq_id < 0) { continue; } - if (seqs.find(cs_curr[j].seq_id) == seqs.end()) { + if (cs_curr[j] < 0) { continue; } + if (seqs.find(cs_curr[j]) == seqs.end()) { if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } - seqs[cs_curr[j].seq_id] = seqs.size(); + seqs[cs_curr[j]] = seqs.size(); } } if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } @@ -1449,8 +1449,8 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { printf("\n%5d: ", i); } for (int j = 0; j < view.n_max_seq; j++) { - if (cs_curr[j].seq_id >= 0) { - const auto & it = seqs.find(cs_curr[j].seq_id); + if (cs_curr[j] >= 0) { + const auto & it = seqs.find(cs_curr[j]); putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+'); } else { putchar('.'); diff --git a/common/common.h b/common/common.h index 45bd0e43dff89..dc57b0ae6e174 100644 --- a/common/common.h +++ b/common/common.h @@ -223,5 +223,8 @@ void dump_non_result_info_yaml( // KV cache utils // +// Dump the KV cache view with the number of sequences per cell. void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); -void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 80); + +// Dump the KV cache view showing individual sequences in each cell (long output). 
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40); diff --git a/llama.cpp b/llama.cpp index 5d1de9eece6cb..9fb7244b41cf5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8807,14 +8807,14 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) { struct llama_kv_cache_view result = { - /*.n_cells*/ 0, - /*.n_max_seq*/ n_max_seq, - /*.token_count*/ 0, - /*.used_cells*/ llama_get_kv_cache_used_cells(ctx), - /*max_contiguous*/ 0, - /*max_contiguous_idx*/ -1, - /*.cells*/ nullptr, - /*.cells_sequences*/ nullptr, + /*.n_cells = */ 0, + /*.n_max_seq = */ n_max_seq, + /*.token_count = */ 0, + /*.used_cells = */ llama_get_kv_cache_used_cells(ctx), + /*.max_contiguous = */ 0, + /*.max_contiguous_idx = */ -1, + /*.cells = */ nullptr, + /*.cells_sequences = */ nullptr, }; return result; } @@ -8836,14 +8836,14 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells); GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells"); view->cells = (struct llama_kv_cache_view_cell *)p; - p = realloc(view->cells_sequences, sizeof(struct llama_kv_cache_view_cell_sequence) * view->n_max_seq * view->n_cells); + p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells); GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences"); - view->cells_sequences = (struct llama_kv_cache_view_cell_sequence *)p; + view->cells_sequences = (llama_seq_id *)p; } const std::vector & kv_cells = ctx->kv_self.cells; llama_kv_cache_view_cell * c_curr = view->cells; - struct llama_kv_cache_view_cell_sequence * cs_curr = view->cells_sequences; + llama_seq_id * cs_curr = view->cells_sequences; int32_t used_cells = 0; int32_t token_count = 0; int32_t curr_contig_idx = -1; @@ -8870,22 +8870,22 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k if (seq_idx >= view->n_max_seq) { break; } - cs_curr[seq_idx].seq_id = it; + cs_curr[seq_idx] = it; seq_idx++; } if (seq_idx != 0) { used_cells++; } for (; seq_idx < view->n_max_seq; seq_idx++) { - cs_curr[seq_idx].seq_id = -1; + cs_curr[seq_idx] = -1; } } if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) { max_contig_idx = curr_contig_idx; max_contig = kv_cells.size() - curr_contig_idx; } - view->max_contiguous_cells = max_contig; - view->max_contiguous_cells_idx = max_contig_idx; + view->max_contiguous = max_contig; + view->max_contiguous_idx = max_contig_idx; view->token_count = token_count; view->used_cells = used_cells; if (uint32_t(used_cells) != ctx->kv_self.used) { diff --git a/llama.h b/llama.h index 3d3b38c12b69f..3208f158cf833 100644 --- a/llama.h +++ b/llama.h @@ -361,30 +361,52 @@ extern "C" { // KV cache // + // Information associated with an individual cell in the KV cache view. struct llama_kv_cache_view_cell { + // The position for this cell. Takes KV cache shifts into account. + // May be negative if the cell is not populated. llama_pos pos; }; - struct llama_kv_cache_view_cell_sequence { - // Would like to have token_id here as well. - llama_seq_id seq_id; - }; - + // An updateable view of the KV cache. struct llama_kv_cache_view { + // Number of KV cache cells. This will be the same as the context size. int32_t n_cells; + + // Maximum number of sequences that can exist in a cell. 
It's not an error + // if there are more sequences in a cell than this value, however they will + // not be visible in the view cells_sequences. int32_t n_max_seq; + + // Number of tokens in the cache. For example, if there are two populated + // cells, the first with 1 sequence id in it and the second with 2 sequence + // ids then you'll have 3 tokens. int32_t token_count; + + // Number of populated cache cells. int32_t used_cells; - int32_t max_contiguous_cells; - int32_t max_contiguous_cells_idx; + + // Maximum contiguous empty slots in the cache. + int32_t max_contiguous; + + // Index to the start of the max_contiguous slot range. Can be negative + // when cache is full. + int32_t max_contiguous_idx; + + // Information for an individual cell. struct llama_kv_cache_view_cell * cells; - struct llama_kv_cache_view_cell_sequence * cells_sequences; + + // The sequences for each cell. There will be n_max_seq items per cell. + llama_seq_id * cells_sequences; }; + // Create an empty KV cache view. LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq); + // Free a KV cache view. LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view); + // Update the KV cache view structure with the current state of the KV cache. LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view); // Returns the number of tokens in the KV cache (slow, use only for debug)
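
Usage sketch (illustrative, not part of the patches above): the snippet below shows how a caller might drive the KV cache view API introduced in this series, mirroring what examples/parallel/parallel.cpp does after these changes. The wrapper name inspect_kv_cache and the assumption that `ctx` is an already-initialized llama_context are hypothetical; the llama_kv_cache_view_* functions and the dump_kv_cache_view* helpers are the ones added earlier in this series.

    // Illustrative usage of the KV cache view API added in this patch series.
    // Assumes: `ctx` points to a valid llama_context; common.h provides the dump helpers.
    #include "common.h"
    #include "llama.h"

    static void inspect_kv_cache(const llama_context * ctx, int32_t n_max_seq) {
        // Create an empty view that tracks up to n_max_seq sequence ids per cell.
        llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_max_seq);

        // Refresh the view from the current KV cache state (buffers are allocated on first update).
        llama_kv_cache_view_update(ctx, &kvc_view);

        // Compact dump: one character per cell encoding how many sequences occupy it.
        dump_kv_cache_view(kvc_view, 80);

        // Verbose dump: one character per (cell, sequence) slot, 40 cells per row.
        dump_kv_cache_view_seqs(kvc_view, 40);

        // The view owns heap allocations (cells, cells_sequences); free it explicitly.
        llama_kv_cache_view_free(&kvc_view);
    }

In a decode loop (as in parallel.cpp), llama_kv_cache_view_update plus one of the dump helpers can be called every iteration; only the free needs to happen once at shutdown.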