From 71fcb7e27e28296bcd614b1802c8eaa1ac6f947e Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 23 Nov 2023 03:30:08 -0700 Subject: [PATCH 1/6] Allow exporting a view of the KV cache --- common/common.cpp | 22 +++++++++++ common/common.h | 6 +++ examples/parallel/parallel.cpp | 5 +++ llama.cpp | 67 ++++++++++++++++++++++++++++++++++ llama.h | 23 ++++++++++++ 5 files changed, 123 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index eec704b99f888..b40a74cf45a3c 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1386,3 +1386,25 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); } + +// +// KV cache utils +// + +void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { + printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d\n", + view.n_cells, view.n_max_seq, view.used_cells, view.token_count); + llama_kv_cache_view_cell * c_curr = view.cells; + struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences; + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { + if (i % row_size == 0) { + printf("\n%5d: ", i); + } + int seq_count = 0; + for (int j = 0; j < view.n_max_seq; j++) { + if (cs_curr[j].seq_id >= 0) { seq_count++; } + } + putchar(int('0' + (std::min(9, seq_count)))); + } + printf("\n=== Done dumping\n"); +} diff --git a/common/common.h b/common/common.h index 88fa13fc067c2..58a153203f7a4 100644 --- a/common/common.h +++ b/common/common.h @@ -218,3 +218,9 @@ std::string get_sortable_timestamp(); void dump_non_result_info_yaml( FILE * stream, const gpt_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); + +// +// KV cache utils +// + +void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index a78df305f415c..439e6d0a70fe4 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -172,6 +172,8 @@ int main(int argc, char ** argv) { int32_t n_total_gen = 0; int32_t n_cache_miss = 0; + struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_seq); + const auto t_main_start = ggml_time_us(); LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__); @@ -201,6 +203,9 @@ int main(int argc, char ** argv) { LOG_TEE("Processing requests ...\n\n"); while (true) { + llama_kv_cache_view_update(ctx, &kvc_view); + dump_kv_cache_view(kvc_view); + llama_batch_clear(batch); // decode any currently ongoing sequences diff --git a/llama.cpp b/llama.cpp index 5679c7050d80b..e23d820ea2e20 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8805,6 +8805,73 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha } } +struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) { + struct llama_kv_cache_view result = { + /*.n_cells*/ 0, + /*.n_max_seq*/ n_max_seq, + /*.token_count*/ 0, + /*.used_cells*/ llama_get_kv_cache_used_cells(ctx), + /*.cells*/ nullptr, + /*.cells_sequences*/ nullptr, + }; + return result; +} + +void llama_kv_cache_view_free(struct llama_kv_cache_view * view) { + if (view->cells != nullptr) { + free(view->cells); + view->cells = nullptr; + } + if 
(view->cells_sequences != nullptr) {
+        free(view->cells_sequences);
+        view->cells_sequences = nullptr;
+    }
+}
+
+void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
+    if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
+        view->n_cells = int32_t(ctx->kv_self.size);
+        void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+        view->cells = (struct llama_kv_cache_view_cell *)p;
+        p = realloc(view->cells_sequences, sizeof(struct llama_kv_cache_view_cell_sequence) * view->n_max_seq * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+        view->cells_sequences = (struct llama_kv_cache_view_cell_sequence *)p;
+    }
+
+    const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
+    llama_kv_cache_view_cell * c_curr = view->cells;
+    struct llama_kv_cache_view_cell_sequence * cs_curr = view->cells_sequences;
+    int32_t used_cells = 0;
+    int32_t token_count = 0;
+
+    for (uint32_t i = 0; i < ctx->kv_self.size; i++, c_curr++, cs_curr += view->n_max_seq) {
+        token_count += ctx->kv_self.cells[i].seq_id.size();
+        c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+        int seq_idx = 0;
+        for (const llama_seq_id it : kv_cells[i].seq_id) {
+            if (seq_idx >= view->n_max_seq) {
+                break;
+            }
+            cs_curr[seq_idx].seq_id = it;
+            seq_idx++;
+        }
+        if (seq_idx != 0) {
+            used_cells++;
+        }
+        for (; seq_idx < view->n_max_seq; seq_idx++) {
+            cs_curr[seq_idx].seq_id = -1;
+        }
+    }
+    view->token_count = token_count;
+    view->used_cells = used_cells;
+    if (uint32_t(used_cells) != ctx->kv_self.used) {
+        LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+            __func__, ctx->kv_self.used, used_cells);
+    }
+}
+
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     int result = 0;
diff --git a/llama.h b/llama.h
index 06b982ee022d6..72a156ac5d940 100644
--- a/llama.h
+++ b/llama.h
@@ -361,6 +361,29 @@ extern "C" {
     // KV cache
     //

+    struct llama_kv_cache_view_cell {
+        llama_pos pos;
+    };
+
+    struct llama_kv_cache_view_cell_sequence {
+        llama_seq_id seq_id;
+    };
+
+    struct llama_kv_cache_view {
+        int32_t n_cells;
+        int32_t n_max_seq;
+        int32_t token_count;
+        int32_t used_cells;
+        struct llama_kv_cache_view_cell *cells;
+        struct llama_kv_cache_view_cell_sequence * cells_sequences;
+    };
+
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);

From cb137d8bfc785f10f66188eb958c507c02e3ba9e Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Thu, 23 Nov 2023 04:11:00 -0700
Subject: [PATCH 2/6] Allow dumping the sequences per cell in common

---
 common/common.cpp              | 42 ++++++++++++++++++++++++++++++
 common/common.h                |  1 +
 examples/parallel/parallel.cpp |  4 ++--
 3 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index b40a74cf45a3c..e9c338028f961 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include <unordered_map>
 #include
 #include
 #include
@@ -1408,3 +1409,44 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
     }
     printf("\n=== Done dumping\n");
 }
+
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
+    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d\n",
+        view.n_cells, view.n_max_seq, view.used_cells, view.token_count);
+
+    std::unordered_map<llama_seq_id, size_t> seqs;
+    llama_kv_cache_view_cell * c_curr = view.cells;
+    struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences;
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+        for (int j = 0; j < view.n_max_seq; j++) {
+            if (cs_curr[j].seq_id < 0) { continue; }
+            if (seqs.find(cs_curr[j].seq_id) == seqs.end()) {
+                seqs[cs_curr[j].seq_id] = seqs.size();
+                if (seqs.size() >= 10) { break; }
+            }
+        }
+        if (seqs.size() >= 10) { break; }
+    }
+    printf("=== Sequence legend: ");
+    for (const auto & it : seqs) {
+        printf("%zu=%d, ", it.second, it.first);
+    }
+
+    c_curr = view.cells;
+    cs_curr = view.cells_sequences;
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+        if (i % row_size == 0) {
+            printf("\n%5d: ", i);
+        }
+        for (int j = 0; j < view.n_max_seq; j++) {
+            if (cs_curr[j].seq_id >= 0) {
+                const auto & it = seqs.find(cs_curr[j].seq_id);
+                putchar(it != seqs.end() ? int('0' + it->second) : '+');
+            } else {
+                putchar('.');
+            }
+        }
+        putchar(' ');
+    }
+    printf("\n=== Done dumping\n");
+}
diff --git a/common/common.h b/common/common.h
index 58a153203f7a4..45bd0e43dff89 100644
--- a/common/common.h
+++ b/common/common.h
@@ -224,3 +224,4 @@ void dump_non_result_info_yaml(
 //

 void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 80);
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 439e6d0a70fe4..8cc20b422d0ce 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -172,7 +172,7 @@ int main(int argc, char ** argv) {
     int32_t n_total_gen  = 0;
     int32_t n_cache_miss = 0;

-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_seq);
+    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);

     const auto t_main_start = ggml_time_us();

@@ -204,7 +204,7 @@ int main(int argc, char ** argv) {

     while (true) {
         llama_kv_cache_view_update(ctx, &kvc_view);
-        dump_kv_cache_view(kvc_view);
+        dump_kv_cache_view_seqs(kvc_view, 40);

         llama_batch_clear(batch);

From 22d0485a7a65573c5d566efc4d38e9e95efbe8df Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Thu, 23 Nov 2023 05:36:41 -0700
Subject: [PATCH 3/6] Track max contiguous cells value and position as well

---
 common/common.cpp | 10 +++++-----
 llama.cpp         | 38 ++++++++++++++++++++++++++++++--------
 llama.h           |  5 ++++-
 3 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index e9c338028f961..1704b08a2826a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1393,8 +1393,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 //

 void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d\n",
-        view.n_cells, view.n_max_seq, view.used_cells, view.token_count);
+    printf("=== Dumping KV cache. 
total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, max contiguous cells=%d @ %d\n", + view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx); llama_kv_cache_view_cell * c_curr = view.cells; struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences; for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { @@ -1405,14 +1405,14 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { for (int j = 0; j < view.n_max_seq; j++) { if (cs_curr[j].seq_id >= 0) { seq_count++; } } - putchar(int('0' + (std::min(9, seq_count)))); + putchar(seq_count == 0 ? '.' : ('0' + (std::min(9, seq_count)))); } printf("\n=== Done dumping\n"); } void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { - printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d\n", - view.n_cells, view.n_max_seq, view.used_cells, view.token_count); + printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, max contiguous cells=%d @ %d\n", + view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx); std::unordered_map seqs; llama_kv_cache_view_cell * c_curr = view.cells; diff --git a/llama.cpp b/llama.cpp index e23d820ea2e20..c564d636dd694 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8807,12 +8807,14 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) { struct llama_kv_cache_view result = { - /*.n_cells*/ 0, - /*.n_max_seq*/ n_max_seq, - /*.token_count*/ 0, - /*.used_cells*/ llama_get_kv_cache_used_cells(ctx), - /*.cells*/ nullptr, - /*.cells_sequences*/ nullptr, + /*.n_cells*/ 0, + /*.n_max_seq*/ n_max_seq, + /*.token_count*/ 0, + /*.used_cells*/ llama_get_kv_cache_used_cells(ctx), + /*max_contiguous*/ 0, + /*max_contiguous_idx*/ -1, + /*.cells*/ nullptr, + /*.cells_sequences*/ nullptr, }; return result; } @@ -8844,11 +8846,25 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k struct llama_kv_cache_view_cell_sequence * cs_curr = view->cells_sequences; int32_t used_cells = 0; int32_t token_count = 0; + int32_t curr_contig_idx = -1; + uint32_t max_contig = 0; + int32_t max_contig_idx = -1; - for (uint32_t i = 0; i < ctx->kv_self.size; i++, c_curr++, cs_curr += view->n_max_seq) { - token_count += ctx->kv_self.cells[i].seq_id.size(); + for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) { + const size_t curr_size = kv_cells[i].seq_id.size(); + token_count += curr_size; c_curr->pos = kv_cells[i].pos + kv_cells[i].delta; + if (curr_size > 0) { + if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) { + max_contig_idx = i; + max_contig = i - curr_contig_idx; + } + curr_contig_idx = -1; + } else if (curr_contig_idx < 0) { + curr_contig_idx = i; + } + int seq_idx = 0; for (const llama_seq_id it : kv_cells[i].seq_id) { if (seq_idx >= view->n_max_seq) { @@ -8864,6 +8880,12 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k cs_curr[seq_idx].seq_id = -1; } } + if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) { + max_contig_idx = curr_contig_idx; + max_contig = kv_cells.size() - curr_contig_idx; + } 
+ view->max_contiguous_cells = max_contig; + view->max_contiguous_cells_idx = max_contig_idx; view->token_count = token_count; view->used_cells = used_cells; if (uint32_t(used_cells) != ctx->kv_self.used) { diff --git a/llama.h b/llama.h index 72a156ac5d940..3d3b38c12b69f 100644 --- a/llama.h +++ b/llama.h @@ -366,6 +366,7 @@ extern "C" { }; struct llama_kv_cache_view_cell_sequence { + // Would like to have token_id here as well. llama_seq_id seq_id; }; @@ -374,7 +375,9 @@ extern "C" { int32_t n_max_seq; int32_t token_count; int32_t used_cells; - struct llama_kv_cache_view_cell *cells; + int32_t max_contiguous_cells; + int32_t max_contiguous_cells_idx; + struct llama_kv_cache_view_cell * cells; struct llama_kv_cache_view_cell_sequence * cells_sequences; }; From 7688d7204ff13648e2c7555c05cd4bae93995a3c Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 23 Nov 2023 07:21:19 -0700 Subject: [PATCH 4/6] Fix max contiguous empty cells index calculation Make dump functions deal with lengths or sequences counts > 10 better --- common/common.cpp | 16 ++++++++++------ llama.cpp | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 1704b08a2826a..c9b13db27b9a1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1393,7 +1393,9 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l // void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { - printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, max contiguous cells=%d @ %d\n", + static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; + + printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx); llama_kv_cache_view_cell * c_curr = view.cells; struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences; @@ -1405,13 +1407,14 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { for (int j = 0; j < view.n_max_seq; j++) { if (cs_curr[j].seq_id >= 0) { seq_count++; } } - putchar(seq_count == 0 ? '.' : ('0' + (std::min(9, seq_count)))); + putchar(slot_chars[std::min(sizeof(slot_chars) - 1, size_t(seq_count))]); } printf("\n=== Done dumping\n"); } void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { - printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, max contiguous cells=%d @ %d\n", + static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + printf("=== Dumping KV cache. 
total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx); std::unordered_map seqs; @@ -1421,16 +1424,17 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { for (int j = 0; j < view.n_max_seq; j++) { if (cs_curr[j].seq_id < 0) { continue; } if (seqs.find(cs_curr[j].seq_id) == seqs.end()) { + if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } seqs[cs_curr[j].seq_id] = seqs.size(); - if (seqs.size() >= 10) { break; } } } - if (seqs.size() >= 10) { break; } + if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } } printf("=== Sequence legend: "); for (const auto & it : seqs) { printf("%zu=%d, ", it.second, it.first); } + printf("'+'=other sequence ids"); c_curr = view.cells; cs_curr = view.cells_sequences; @@ -1441,7 +1445,7 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { for (int j = 0; j < view.n_max_seq; j++) { if (cs_curr[j].seq_id >= 0) { const auto & it = seqs.find(cs_curr[j].seq_id); - putchar(it != seqs.end() ? int('0' + it->second) : '+'); + putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+'); } else { putchar('.'); } diff --git a/llama.cpp b/llama.cpp index c564d636dd694..5d1de9eece6cb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8857,8 +8857,8 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k if (curr_size > 0) { if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) { - max_contig_idx = i; max_contig = i - curr_contig_idx; + max_contig_idx = curr_contig_idx; } curr_contig_idx = -1; } else if (curr_contig_idx < 0) { From bc1c346ae8afd90db846377dff59e4ed603473ad Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 23 Nov 2023 07:44:10 -0700 Subject: [PATCH 5/6] Fix off by one error in dump_kv_cache_view --- common/common.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index c9b13db27b9a1..77f61dbf9f172 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1397,8 +1397,10 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx); + llama_kv_cache_view_cell * c_curr = view.cells; struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences; + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { if (i % row_size == 0) { printf("\n%5d: ", i); @@ -1407,19 +1409,22 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { for (int j = 0; j < view.n_max_seq; j++) { if (cs_curr[j].seq_id >= 0) { seq_count++; } } - putchar(slot_chars[std::min(sizeof(slot_chars) - 1, size_t(seq_count))]); + putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]); } + printf("\n=== Done dumping\n"); } void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + printf("=== Dumping KV cache. 
total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx); std::unordered_map seqs; llama_kv_cache_view_cell * c_curr = view.cells; struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences; + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { for (int j = 0; j < view.n_max_seq; j++) { if (cs_curr[j].seq_id < 0) { continue; } @@ -1430,6 +1435,7 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { } if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } } + printf("=== Sequence legend: "); for (const auto & it : seqs) { printf("%zu=%d, ", it.second, it.first); @@ -1452,5 +1458,6 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { } putchar(' '); } + printf("\n=== Done dumping\n"); } From aa21e6dbc2295908884339f3ab91a773f027c185 Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 23 Nov 2023 08:06:51 -0700 Subject: [PATCH 6/6] Add doc comments for KV cache view functions Eliminate cell sequence struct; use llama_seq_id directly Minor cleanups --- common/common.cpp | 20 ++++++++++---------- common/common.h | 5 ++++- llama.cpp | 30 +++++++++++++++--------------- llama.h | 38 ++++++++++++++++++++++++++++++-------- 4 files changed, 59 insertions(+), 34 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 77f61dbf9f172..636ed35627f4a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1396,10 +1396,10 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", - view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx); + view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); llama_kv_cache_view_cell * c_curr = view.cells; - struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences; + llama_seq_id * cs_curr = view.cells_sequences; for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { if (i % row_size == 0) { @@ -1407,7 +1407,7 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { } int seq_count = 0; for (int j = 0; j < view.n_max_seq; j++) { - if (cs_curr[j].seq_id >= 0) { seq_count++; } + if (cs_curr[j] >= 0) { seq_count++; } } putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]); } @@ -1419,18 +1419,18 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; printf("=== Dumping KV cache. 
total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", - view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx); + view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); std::unordered_map seqs; llama_kv_cache_view_cell * c_curr = view.cells; - struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences; + llama_seq_id * cs_curr = view.cells_sequences; for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { for (int j = 0; j < view.n_max_seq; j++) { - if (cs_curr[j].seq_id < 0) { continue; } - if (seqs.find(cs_curr[j].seq_id) == seqs.end()) { + if (cs_curr[j] < 0) { continue; } + if (seqs.find(cs_curr[j]) == seqs.end()) { if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } - seqs[cs_curr[j].seq_id] = seqs.size(); + seqs[cs_curr[j]] = seqs.size(); } } if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } @@ -1449,8 +1449,8 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { printf("\n%5d: ", i); } for (int j = 0; j < view.n_max_seq; j++) { - if (cs_curr[j].seq_id >= 0) { - const auto & it = seqs.find(cs_curr[j].seq_id); + if (cs_curr[j] >= 0) { + const auto & it = seqs.find(cs_curr[j]); putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+'); } else { putchar('.'); diff --git a/common/common.h b/common/common.h index 45bd0e43dff89..dc57b0ae6e174 100644 --- a/common/common.h +++ b/common/common.h @@ -223,5 +223,8 @@ void dump_non_result_info_yaml( // KV cache utils // +// Dump the KV cache view with the number of sequences per cell. void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); -void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 80); + +// Dump the KV cache view showing individual sequences in each cell (long output). 
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40); diff --git a/llama.cpp b/llama.cpp index 5d1de9eece6cb..9fb7244b41cf5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8807,14 +8807,14 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) { struct llama_kv_cache_view result = { - /*.n_cells*/ 0, - /*.n_max_seq*/ n_max_seq, - /*.token_count*/ 0, - /*.used_cells*/ llama_get_kv_cache_used_cells(ctx), - /*max_contiguous*/ 0, - /*max_contiguous_idx*/ -1, - /*.cells*/ nullptr, - /*.cells_sequences*/ nullptr, + /*.n_cells = */ 0, + /*.n_max_seq = */ n_max_seq, + /*.token_count = */ 0, + /*.used_cells = */ llama_get_kv_cache_used_cells(ctx), + /*.max_contiguous = */ 0, + /*.max_contiguous_idx = */ -1, + /*.cells = */ nullptr, + /*.cells_sequences = */ nullptr, }; return result; } @@ -8836,14 +8836,14 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells); GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells"); view->cells = (struct llama_kv_cache_view_cell *)p; - p = realloc(view->cells_sequences, sizeof(struct llama_kv_cache_view_cell_sequence) * view->n_max_seq * view->n_cells); + p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells); GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences"); - view->cells_sequences = (struct llama_kv_cache_view_cell_sequence *)p; + view->cells_sequences = (llama_seq_id *)p; } const std::vector & kv_cells = ctx->kv_self.cells; llama_kv_cache_view_cell * c_curr = view->cells; - struct llama_kv_cache_view_cell_sequence * cs_curr = view->cells_sequences; + llama_seq_id * cs_curr = view->cells_sequences; int32_t used_cells = 0; int32_t token_count = 0; int32_t curr_contig_idx = -1; @@ -8870,22 +8870,22 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k if (seq_idx >= view->n_max_seq) { break; } - cs_curr[seq_idx].seq_id = it; + cs_curr[seq_idx] = it; seq_idx++; } if (seq_idx != 0) { used_cells++; } for (; seq_idx < view->n_max_seq; seq_idx++) { - cs_curr[seq_idx].seq_id = -1; + cs_curr[seq_idx] = -1; } } if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) { max_contig_idx = curr_contig_idx; max_contig = kv_cells.size() - curr_contig_idx; } - view->max_contiguous_cells = max_contig; - view->max_contiguous_cells_idx = max_contig_idx; + view->max_contiguous = max_contig; + view->max_contiguous_idx = max_contig_idx; view->token_count = token_count; view->used_cells = used_cells; if (uint32_t(used_cells) != ctx->kv_self.used) { diff --git a/llama.h b/llama.h index 3d3b38c12b69f..3208f158cf833 100644 --- a/llama.h +++ b/llama.h @@ -361,30 +361,52 @@ extern "C" { // KV cache // + // Information associated with an individual cell in the KV cache view. struct llama_kv_cache_view_cell { + // The position for this cell. Takes KV cache shifts into account. + // May be negative if the cell is not populated. llama_pos pos; }; - struct llama_kv_cache_view_cell_sequence { - // Would like to have token_id here as well. - llama_seq_id seq_id; - }; - + // An updateable view of the KV cache. struct llama_kv_cache_view { + // Number of KV cache cells. This will be the same as the context size. int32_t n_cells; + + // Maximum number of sequences that can exist in a cell. 
It's not an error + // if there are more sequences in a cell than this value, however they will + // not be visible in the view cells_sequences. int32_t n_max_seq; + + // Number of tokens in the cache. For example, if there are two populated + // cells, the first with 1 sequence id in it and the second with 2 sequence + // ids then you'll have 3 tokens. int32_t token_count; + + // Number of populated cache cells. int32_t used_cells; - int32_t max_contiguous_cells; - int32_t max_contiguous_cells_idx; + + // Maximum contiguous empty slots in the cache. + int32_t max_contiguous; + + // Index to the start of the max_contiguous slot range. Can be negative + // when cache is full. + int32_t max_contiguous_idx; + + // Information for an individual cell. struct llama_kv_cache_view_cell * cells; - struct llama_kv_cache_view_cell_sequence * cells_sequences; + + // The sequences for each cell. There will be n_max_seq items per cell. + llama_seq_id * cells_sequences; }; + // Create an empty KV cache view. LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq); + // Free a KV cache view. LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view); + // Update the KV cache view structure with the current state of the KV cache. LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view); // Returns the number of tokens in the KV cache (slow, use only for debug)
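
Usage sketch (illustrative, not part of the patches above): the snippet below shows how a caller might drive the KV cache view API introduced in this series, mirroring what examples/parallel/parallel.cpp does after these changes. The wrapper name inspect_kv_cache and the assumption that `ctx` is an already-initialized llama_context are hypothetical; the llama_kv_cache_view_* functions and the dump_kv_cache_view* helpers are the ones added earlier in this series.

    // Illustrative usage of the KV cache view API added in this patch series.
    // Assumes: `ctx` points to a valid llama_context; common.h provides the dump helpers.
    #include "common.h"
    #include "llama.h"

    static void inspect_kv_cache(const llama_context * ctx, int32_t n_max_seq) {
        // Create an empty view that tracks up to n_max_seq sequence ids per cell.
        llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_max_seq);

        // Refresh the view from the current KV cache state (buffers are allocated on first update).
        llama_kv_cache_view_update(ctx, &kvc_view);

        // Compact dump: one character per cell encoding how many sequences occupy it.
        dump_kv_cache_view(kvc_view, 80);

        // Verbose dump: one character per (cell, sequence) slot, 40 cells per row.
        dump_kv_cache_view_seqs(kvc_view, 40);

        // The view owns heap allocations (cells, cells_sequences); free it explicitly.
        llama_kv_cache_view_free(&kvc_view);
    }

In a decode loop (as in parallel.cpp), llama_kv_cache_view_update plus one of the dump helpers can be called every iteration; only the free needs to happen once at shutdown.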