KV cache quantized to q8_0 #2969

Closed · wants to merge 1 commit
31 changes: 26 additions & 5 deletions common/common.cpp
@@ -198,8 +198,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.rope_freq_scale = 1.0f/std::stof(argv[i]);
} else if (arg == "--kv-type" || arg == "-kvt") {
if (++i >= argc) {
invalid_param = true;
break;
}

std::string type_name(argv[i]);
for (char & c : type_name) {
c = std::tolower(c);
}

if (type_name == "q8_0") {
params.kv_type = GGML_TYPE_Q8_0;
} else if (type_name == "f16") {
params.kv_type = GGML_TYPE_F16;
} else if (type_name == "f32") {
params.kv_type = GGML_TYPE_F32;
} else {
fprintf(stderr, "error: unknown KV type: %s. Known types: Q8_0, F16, F32.\n", argv[i]);
invalid_param = true;
break;
}
} else if (arg == "--memory-f32") {
params.memory_f16 = false;
params.kv_type = GGML_TYPE_F32;
} else if (arg == "--top-p") {
if (++i >= argc) {
invalid_param = true;
@@ -652,8 +674,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
printf(" --no-penalize-nl do not penalize newline token\n");
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
printf(" -kvt, --kv-type the type to use for the KV cache (default: q8_0; alternatives: f16, f32)\n");
printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
printf(" --perplexity compute perplexity over each ctx window of the prompt\n");
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
@@ -735,7 +756,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
lparams.low_vram = params.low_vram;
lparams.mul_mat_q = params.mul_mat_q;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.kv_type = params.kv_type;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
lparams.logits_all = params.perplexity;
@@ -1201,6 +1222,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
fprintf(stream, "kv_type: %s # default: false\n", ggml_type_name(params.kv_type));
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());

fprintf(stream, "logit_bias:\n");
@@ -1215,7 +1237,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
3 changes: 2 additions & 1 deletion common/common.h
@@ -94,9 +94,10 @@ struct gpt_params {
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

ggml_type kv_type = GGML_TYPE_Q8_0; // the type to use for the KV cache

bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
68 changes: 49 additions & 19 deletions examples/llama-bench/llama-bench.cpp
@@ -127,7 +127,7 @@ struct cmd_params {
std::vector<int> n_prompt;
std::vector<int> n_gen;
std::vector<int> n_batch;
std::vector<bool> f32_kv;
std::vector<ggml_type> kv_type;
std::vector<int> n_threads;
std::vector<int> n_gpu_layers;
std::vector<int> main_gpu;
@@ -144,7 +144,7 @@ static const cmd_params cmd_params_defaults = {
/* n_prompt */ {512},
/* n_gen */ {128},
/* n_batch */ {512},
/* f32_kv */ {false},
/* kv_type */ {GGML_TYPE_Q8_0},
/* n_threads */ {get_num_physical_cores()},
/* n_gpu_layers */ {99},
/* main_gpu */ {0},
@@ -165,7 +165,16 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());

std::string kv_type_default;
for (unsigned int i = 0; i < cmd_params_defaults.kv_type.size(); ++i) {
if (i > 0) {
kv_type_default += ",";
}
kv_type_default += ggml_type_name(cmd_params_defaults.kv_type[i]);
}
printf(" -kvt, --kv_type <q8_0|f16|f32> (default: %s)\n", kv_type_default.c_str());

printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
@@ -177,7 +186,6 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");

}

static cmd_params parse_cmd_params(int argc, char ** argv) {
@@ -228,13 +236,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<int>(argv[i], split_delim);
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
} else if (arg == "--memory-f32") {
} else if (arg == "-kvt" || arg == "--kv-type") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
auto p = split<std::string>(argv[i], split_delim);

std::vector<ggml_type> kvt;
for (const std::string & type_name : p) {
if (type_name == "q8_0") {
kvt.push_back(GGML_TYPE_Q8_0);
} else if (type_name == "f16") {
kvt.push_back(GGML_TYPE_F16);
} else if (type_name == "f32") {
kvt.push_back(GGML_TYPE_F32);
} else {
invalid_param = true;
break;
}
}
if (invalid_param) {
fprintf(stderr, "error: unknown KV type: %s. Known types: Q8_0, F16, F32.\n", argv[i]);
break;
}

params.kv_type.insert(params.kv_type.end(), kvt.begin(), kvt.end());
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
@@ -332,7 +359,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
if (params.f32_kv.empty()) { params.f32_kv = cmd_params_defaults.f32_kv; }
if (params.kv_type.empty()) { params.kv_type = cmd_params_defaults.kv_type; }
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
@@ -348,7 +375,7 @@ struct cmd_params_instance {
int n_prompt;
int n_gen;
int n_batch;
bool f32_kv;
ggml_type kv_type;
int n_threads;
int n_gpu_layers;
int main_gpu;
@@ -360,7 +387,7 @@ struct cmd_params_instance {
llama_context_params lparams = llama_context_default_params();
lparams.n_ctx = n_prompt + n_gen;
lparams.n_batch = n_batch;
lparams.f16_kv = !f32_kv;
lparams.kv_type = kv_type;
lparams.n_gpu_layers = n_gpu_layers;
lparams.main_gpu = main_gpu;
lparams.mul_mat_q = mul_mat_q;
@@ -376,7 +403,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p

for (const auto & m : params.model)
for (const auto & nb : params.n_batch)
for (const auto & fk : params.f32_kv)
for (const auto & kvt : params.kv_type)
for (const auto & nl : params.n_gpu_layers)
for (const auto & mg : params.main_gpu)
for (const auto & mmq : params.mul_mat_q)
@@ -388,7 +415,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
/* .n_prompt = */ n_prompt,
/* .n_gen = */ n_gen,
/* .n_batch = */ nb,
/* .f32_kv = */ fk,
/* .kv_type = */ kvt,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .main_gpu = */ mg,
@@ -439,7 +466,7 @@ struct test {
uint64_t model_n_params;
int n_batch;
int n_threads;
bool f32_kv;
ggml_type kv_type;
int n_gpu_layers;
int main_gpu;
bool mul_mat_q;
@@ -459,7 +486,7 @@ struct test {
model_n_params = llama_model_n_params(lmodel);
n_batch = inst.n_batch;
n_threads = inst.n_threads;
f32_kv = inst.f32_kv;
kv_type = inst.kv_type;
n_gpu_layers = inst.n_gpu_layers;
main_gpu = inst.main_gpu;
mul_mat_q = inst.mul_mat_q;
@@ -523,7 +550,7 @@ struct test {
"cuda", "opencl", "metal", "gpu_blas", "blas",
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_threads", "f16_kv",
"n_batch", "n_threads", "kv_type",
"n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
@@ -543,7 +570,7 @@ struct test {
return INT;
}
if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") {
field == "mul_mat_q" || field == "low_vram") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
@@ -573,7 +600,7 @@ struct test {
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
std::to_string(n_batch), std::to_string(n_threads), std::string(ggml_type_name(kv_type)),
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
@@ -757,8 +784,8 @@ struct markdown_printer : public printer {
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
fields.push_back("n_batch");
}
if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
fields.push_back("f16_kv");
if (params.kv_type.size() > 1 || params.kv_type != cmd_params_defaults.kv_type) {
fields.push_back("kv_type");
}
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
fields.push_back("main_gpu");
@@ -826,6 +853,9 @@ struct markdown_printer : public printer {
} else if (field == "t/s") {
snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
value = buf;
} else if (field == "kv_type") {
snprintf(buf, sizeof(buf), "%s", ggml_type_name(t.kv_type));
value = buf;
} else if (vmap.find(field) != vmap.end()) {
value = vmap.at(field);
} else {
4 changes: 2 additions & 2 deletions examples/main/README.md
@@ -276,9 +276,9 @@ These options help improve the performance and memory usage of the LLaMA models.

- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.

### Memory Float 32
### KV cache type

- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
- `-kvt, --kv-type`: The data type to use for the KV cache (default: `q8_0`; alternatives: `f16` and `f32`). The higher-precision alternatives increase memory consumption for only a marginal difference in quality. See the usage example below.
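
A minimal usage sketch of the new flag (the model path, prompt, and token count are illustrative, not part of this PR):

```sh
# Default run uses the q8_0 KV cache; pass -kvt to choose f16 or f32 instead.
./main -m models/7B/ggml-model-q4_0.gguf -p "Once upon a time" -n 64
./main -m models/7B/ggml-model-q4_0.gguf -p "Once upon a time" -n 64 -kvt f16
```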

### Batch Size

Expand Down
17 changes: 7 additions & 10 deletions examples/main/main.cpp
@@ -36,15 +36,15 @@
static llama_context ** g_ctx;
static llama_model ** g_model;
static gpt_params * g_params;
static std::vector<llama_token> * g_input_tokens;
static std::vector<llama_token> * g_embd_inp;
static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;


static void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model,
const std::vector<llama_token> & input_tokens, const std::string & output,
const std::vector<llama_token> & embd_inp, const std::string & output,
const std::vector<llama_token> & output_tokens
) {
if (params.logdir.empty()) {
Expand All @@ -71,7 +71,7 @@ static void write_logfile(
fprintf(logfile, "binary: main\n");
char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc));
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
dump_non_result_info_yaml(logfile, params, ctx, timestamp, embd_inp, model_desc);

fprintf(logfile, "\n");
fprintf(logfile, "######################\n");
Expand All @@ -95,7 +95,7 @@ static void sigint_handler(int signo) {
console::cleanup();
printf("\n");
llama_print_timings(*g_ctx);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
write_logfile(*g_ctx, *g_params, *g_model, *g_embd_inp, g_output_ss->str(), *g_output_tokens);
_exit(130);
}
}
@@ -238,7 +238,7 @@ int main(int argc, char ** argv) {
const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
LOG("add_bos: %d\n", add_bos);

std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_inp; g_embd_inp = &embd_inp;

if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
@@ -465,7 +465,6 @@ int main(int argc, char ** argv) {
int n_session_consumed = 0;
int n_past_guidance = 0;

std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
std::ostringstream output_ss; g_output_ss = &output_ss;

@@ -661,9 +660,7 @@ int main(int argc, char ** argv) {
const std::string token_str = llama_token_to_piece(ctx, id);
printf("%s", token_str.c_str());

if (embd.size() > 1) {
input_tokens.push_back(id);
} else {
if (embd.size() == 1) {
output_tokens.push_back(id);
output_ss << token_str;
}
@@ -843,7 +840,7 @@ int main(int argc, char ** argv) {
}

llama_print_timings(ctx);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
write_logfile(ctx, params, model, embd_inp, output_ss.str(), output_tokens);

if (ctx_guidance) { llama_free(ctx_guidance); }
llama_free(ctx);
2 changes: 1 addition & 1 deletion examples/quantize-stats/quantize-stats.cpp
@@ -312,7 +312,7 @@ int main(int argc, char ** argv) {

lparams.n_ctx = 256;
lparams.seed = 1;
lparams.f16_kv = false;
lparams.kv_type = GGML_TYPE_F32;
lparams.use_mlock = false;

model = llama_load_model_from_file(params.model.c_str(), lparams);
2 changes: 1 addition & 1 deletion examples/save-load-state/save-load-state.cpp
@@ -26,7 +26,7 @@ int main(int argc, char ** argv) {

lparams.n_ctx = params.n_ctx;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.kv_type = params.kv_type;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
