8 changes: 8 additions & 0 deletions common/arg.cpp
@@ -1935,6 +1935,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_ctx_checkpoints = value;
         }
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--cache-ram", "-cram"}, "N",
+        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
+        [](common_params & params, int value) {
+            params.cache_ram_mib = value;
+        }
+    ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--kv-unified", "-kvu"},
         string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
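The new option can also be set through the environment, e.g. `LLAMA_ARG_CACHE_RAM=2048 llama-server ...` or `llama-server --cache-ram 2048` (the invocation is illustrative). A minimal sketch of the value's semantics as stated in the help text above: -1 means no limit, 0 disables the cache, anything else is a MiB cap. The helper name and mapping below are assumptions for illustration, not the PR's implementation:

// Hypothetical helper (not from the PR): map cache_ram_mib to a byte budget
// using the semantics from the --cache-ram help text.
#include <cstdint>
#include <cstdio>

static int64_t cache_budget_bytes(int32_t cache_ram_mib) {
    if (cache_ram_mib < 0) {
        return INT64_MAX;                         // -1: no limit
    }
    return (int64_t) cache_ram_mib * 1024 * 1024; // 0 yields a zero budget, i.e. disabled
}

int main() {
    printf("default budget: %lld bytes\n", (long long) cache_budget_bytes(8192));
    return 0;
}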
6 changes: 3 additions & 3 deletions common/chat.h
@@ -33,8 +33,8 @@ struct common_chat_msg_content_part {
 struct common_chat_msg {
     std::string role;
     std::string content;
-    std::vector<common_chat_msg_content_part> content_parts = {};
-    std::vector<common_chat_tool_call> tool_calls = {};
+    std::vector<common_chat_msg_content_part> content_parts;
+    std::vector<common_chat_tool_call> tool_calls;
     std::string reasoning_content;
     std::string tool_name;
     std::string tool_call_id;
@@ -44,7 +44,7 @@ struct common_chat_msg {
     bool empty() const {
         return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
     }
-    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
+    void set_tool_call_ids(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
         for (auto i = 0u; i < tool_calls.size(); i++) {
             if (ids_cache.size() <= i) {
                 auto id = tool_calls[i].id;
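For reference, here is a standalone sketch of the renamed helper's logic. The diff above cuts off inside the loop, so the cache-miss branch (falling back to `gen_tool_call_id` when a call carries no id) is an assumption, and `tool_call` is a stand-in for the real `common_chat_tool_call`:

// Standalone sketch of the set_tool_call_ids pattern; not the library's code.
#include <functional>
#include <string>
#include <vector>

struct tool_call { std::string id; };

void set_tool_call_ids(std::vector<tool_call> & tool_calls,
                       std::vector<std::string> & ids_cache,
                       const std::function<std::string()> & gen_tool_call_id) {
    for (auto i = 0u; i < tool_calls.size(); i++) {
        if (ids_cache.size() <= i) {
            auto id = tool_calls[i].id;
            if (id.empty()) {
                id = gen_tool_call_id(); // mint an id when the call has none (assumed)
            }
            ids_cache.push_back(id);
        }
        tool_calls[i].id = ids_cache[i]; // the cache keeps ids stable across calls
    }
}

int main() {
    std::vector<tool_call> calls = {{""}, {"call_1"}};
    std::vector<std::string> ids_cache;
    int n = 0;
    set_tool_call_ids(calls, ids_cache, [&] { return "gen_" + std::to_string(n++); });
    // calls[0].id == "gen_0" (generated), calls[1].id == "call_1" (kept)
    return 0;
}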
5 changes: 3 additions & 2 deletions common/common.h
@@ -378,7 +378,7 @@ struct common_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool no_perf = false; // disable performance metrics
-    bool ctx_shift = false; // context shift on infinite text generation
+    bool ctx_shift = false;     // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache
 
@@ -425,7 +425,8 @@ struct common_params {
     int32_t timeout_write = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
-    int32_t n_ctx_checkpoints = 3; // max number of context checkpoints per slot
+    int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
+    int32_t cache_ram_mib = 8192; // -1 = no limit, 0 = disable, 1 = 1 MiB, etc.
 
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
7 changes: 2 additions & 5 deletions src/llama-kv-cache.cpp
@@ -123,11 +123,8 @@ llama_kv_cache::llama_kv_cache(
         throw std::runtime_error("failed to create ggml context for kv cache");
     }
 
-    ggml_tensor * k;
-    ggml_tensor * v;
-
-    k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
-    v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
+    ggml_tensor * k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
+    ggml_tensor * v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
 
     ggml_format_name(k, "cache_k_l%d", il);
     ggml_format_name(v, "cache_v_l%d", il);
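This change is a behavior-neutral cleanup: declaration and initialization are fused. As the calls show, each per-layer cache tensor is 3-D — row width (`n_embd_k_gqa` / `n_embd_v_gqa`) by `kv_size` cells by `n_stream` streams — so a back-of-envelope size estimate looks like the sketch below (all numbers illustrative and model-dependent; f16 cells assumed):

// Rough sizing for one layer's K cache, following the 3-D shape used above.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd_k_gqa = 1024; // K row width after GQA head sharing (example)
    const int64_t kv_size      = 4096; // number of cache cells
    const int64_t n_stream     = 1;    // number of KV streams
    const int64_t cell_bytes   = 2;    // assuming f16 cache cells

    const int64_t bytes = n_embd_k_gqa * kv_size * n_stream * cell_bytes;
    printf("one layer's K cache: %lld MiB\n", (long long) (bytes / (1024 * 1024)));
    return 0;
}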