From 4ef5e792e378a2a0ed42969de89d5cb161fb9530 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 15 Aug 2023 16:30:07 +0300
Subject: [PATCH] llama : replace gguf_file_saver with new gguf write API

---
 examples/gguf/gguf.cpp |   3 +-
 ggml.c                 |  29 ++++---
 ggml.h                 |   8 ++
 gguf-llama.cpp         | 180 +++++------------------------------------
 4 files changed, 47 insertions(+), 173 deletions(-)

diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index 934e774056463..f67af1416a7c8 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -193,8 +193,7 @@ bool gguf_ex_read_1(const std::string & fname) {
 
         struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
 
-        fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n",
-                __func__, i, cur->n_dims, cur->name, cur->data);
+        fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
 
         // check data
         {
diff --git a/ggml.c b/ggml.c
index c69a183e85a17..ead9ab526328e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -16903,7 +16903,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
     // compute size of intermediate results
     // TODO: does not take into account scratch buffers !!!!
     for (int i = 0; i < cgraph->n_nodes; ++i) {
-        size_eval += ggml_nbytes(cgraph->nodes[i]);
+        size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
     }
 
     // print
@@ -18629,8 +18629,9 @@ struct gguf_tensor_info {
 
     uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
 
-    // for writing
-    const struct ggml_tensor * tensor;
+    // for writing API
+    const void * data;
+    size_t size;
 };
 
 struct gguf_context {
@@ -19268,7 +19269,12 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
     }
 }
 
-void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor) {
+void gguf_add_tensor_ex(
+        struct gguf_context * ctx,
+        const struct ggml_tensor * tensor,
+        enum ggml_type type,
+        const void * data,
+        size_t size) {
     const int idx = ctx->header.n_tensors;
     ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
 
@@ -19284,17 +19290,22 @@ void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tenso
         ctx->infos[idx].ne[i] = tensor->ne[i];
     }
 
-    ctx->infos[idx].type   = tensor->type;
+    ctx->infos[idx].type   = type;
     ctx->infos[idx].offset = 0;
-    ctx->infos[idx].tensor = tensor;
+    ctx->infos[idx].data   = data;
+    ctx->infos[idx].size   = size;
 
     if (ctx->header.n_tensors > 0) {
-        ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ggml_nbytes(ctx->infos[idx - 1].tensor), ctx->alignment);
+        ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
     }
 
     ctx->header.n_tensors++;
 }
 
+void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor) {
+    gguf_add_tensor_ex(ctx, tensor, tensor->type, tensor->data, ggml_nbytes(tensor));
+}
+
 static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
     fwrite(&val->n, sizeof(val->n), 1, file);
     fwrite(val->data, sizeof(char), val->n, file);
@@ -19396,10 +19407,10 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
     for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
         struct gguf_tensor_info * info = &ctx->infos[i];
 
-        const size_t size     = ggml_nbytes(info->tensor);
+        const size_t size     = info->size;
         const size_t size_pad = GGML_PAD(size, ctx->alignment);
 
-        gguf_fwrite_el(file, info->tensor->data, size);
+        gguf_fwrite_el(file, info->data, size);
 
         if (size_pad != size) {
             uint8_t pad = 0;
diff --git a/ggml.h b/ggml.h
index 5984d433da3bd..368cb00cb00c3 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1791,6 +1791,14 @@ extern "C" {
 
     GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
 
+    // same as gguf_add_tensor, but allows to override tensor data
+    GGML_API void gguf_add_tensor_ex(
+            struct gguf_context * ctx,
+            const struct ggml_tensor * tensor,
+            enum ggml_type type,
+            const void * data,
+            size_t size);
+
     GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname);
 
     //
diff --git a/gguf-llama.cpp b/gguf-llama.cpp
index 684b30936c1cc..e738060448e27 100644
--- a/gguf-llama.cpp
+++ b/gguf-llama.cpp
@@ -695,6 +695,7 @@ struct gguf_file_loader {
 
             tensor.name = name;
             tensor.size = ggml_nbytes(cur);
+            tensor.ggml_tensor = cur;
 
             tensors_map.tensors.push_back(tensor);
             tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
@@ -702,165 +703,6 @@ struct gguf_file_loader {
         }
     }
 };
 
-struct gguf_file_saver {
-    // TODO
-    // this implementation now assumes that the data section is of the same length as the unquantized model.
-    // this is needed to write tensor metadata and weights in a single pass by seeking to appropriate positions in the file.
-    // this may not be true when we add quantization version and change ftype description (currently it's string according to the specs,
-    // but better to have it as uint32).
-    // we need to calculate the delta in number of bytes written with a counter as a struct member.
-
-    gguf_context * ctx; // loaded gguf context (used to re-write the KV section (good enough for now))
-
-    gguf_file file;
-    size_t info_offset;
-    size_t tensor_offset;
-
-    gguf_file_saver(const char * fname, gguf_context * ctx) : ctx(ctx), file(fname, "wb") {
-        LLAMA_LOG_INFO("%s: saving model to %s\n", __func__, fname);
-
-        write_header();
-        write_kv();
-    }
-
-    void write_header() {
-        file.write_i32(GGUF_MAGIC);
-        file.write_i32(GGUF_VERSION);
-        file.write_i32(gguf_get_n_tensors(ctx));
-        file.write_i32(gguf_get_n_kv     (ctx));
-    }
-
-    void write_kv_arr_i32(const std::string & key, enum gguf_type type, int i, int n_arr) {
-        std::vector<int32_t> data(n_arr);
-
-        for (int j = 0; j < n_arr; ++j) {
-            int32_t val = gguf_get_arr_i32(ctx, i, j);
-            data[j] = val;
-        }
-
-        file.write_arr(key, type, data);
-    }
-
-    void write_kv_arr_f32(const std::string & key, enum gguf_type type, int i, int n_arr) {
-        std::vector<float> data(n_arr);
-
-        for (int j = 0; j < n_arr; ++j) {
-            float val = gguf_get_arr_f32(ctx, i, j);
-            data[j] = val;
-        }
-
-        file.write_arr(key, type, data);
-    }
-
-    void write_kv_arr_str(const std::string & key, enum gguf_type type, int i, int n_arr) {
-        std::vector<std::string> data(n_arr);
-
-        for (int j = 0; j < n_arr; ++j) {
-            std::string val = gguf_get_arr_str(ctx, i, j);
-            data[j] = val;
-        }
-
-        file.write_arr(key, type, data);
-    }
-
-    // re-write the key-value section from the loaded file
-    void write_kv() {
-        const int32_t n_kv = gguf_get_n_kv(ctx);
-        for (int i = 0; i < n_kv; ++i) {
-            const char * key = gguf_get_key(ctx, i);
-            LLAMA_LOG_INFO("%s: writing key '%s'\n", __func__, key);
-
-            if (strcmp(key, "general.quantization_version") == 0) {
-                file.write_val<uint32_t>("general.quantization_version", GGUF_TYPE_UINT32, GGML_QNT_VERSION);
-            } else {
-                const gguf_type vtype = gguf_get_kv_type(ctx, i);
-
-                switch (vtype) {
-                    case GGUF_TYPE_BOOL:    file.write_val<bool>    (key, GGUF_TYPE_BOOL,    gguf_get_val_bool(ctx, i)); break;
-                    case GGUF_TYPE_FLOAT32: file.write_val<float>   (key, GGUF_TYPE_FLOAT32, gguf_get_val_f32 (ctx, i)); break;
-                    case GGUF_TYPE_INT16:   file.write_val<int16_t> (key, GGUF_TYPE_INT16,   gguf_get_val_i16 (ctx, i)); break;
-                    case GGUF_TYPE_INT32:   file.write_val<int32_t> (key, GGUF_TYPE_INT32,   gguf_get_val_i32 (ctx, i)); break;
-                    case GGUF_TYPE_INT8:    file.write_val<int8_t>  (key, GGUF_TYPE_INT8,    gguf_get_val_i8  (ctx, i)); break;
-                    case GGUF_TYPE_STRING:  file.write_str          (key, GGUF_TYPE_STRING,  gguf_get_val_str (ctx, i)); break;
-                    case GGUF_TYPE_UINT16:  file.write_val<uint16_t>(key, GGUF_TYPE_UINT16,  gguf_get_val_u16 (ctx, i)); break;
-                    case GGUF_TYPE_UINT32:  file.write_val<uint32_t>(key, GGUF_TYPE_UINT32,  gguf_get_val_u32 (ctx, i)); break;
-                    case GGUF_TYPE_UINT8:   file.write_val<uint8_t> (key, GGUF_TYPE_UINT8,   gguf_get_val_u8  (ctx, i)); break;
-                    case GGUF_TYPE_ARRAY:
-                        {
-                            const gguf_type arr_type = gguf_get_arr_type(ctx, i);
-                            const int       n_arr    = gguf_get_arr_n   (ctx, i);
-
-                            switch (arr_type) {
-                                case GGUF_TYPE_FLOAT32: write_kv_arr_f32(key, arr_type, i, n_arr); break;
-                                case GGUF_TYPE_INT32:   write_kv_arr_i32(key, arr_type, i, n_arr); break;
-                                case GGUF_TYPE_STRING:  write_kv_arr_str(key, arr_type, i, n_arr); break;
-                                default:
-                                    throw std::runtime_error(format("cannot recognize array type for key %s\n", key));
-                            }
-                        } break;
-                    default:
-                        throw std::runtime_error(format("cannot recognize value type for key %s\n", key));
-                }
-            }
-        }
-
-        info_offset = file.tell();
-
-        GGML_ASSERT(gguf_get_data_offset(ctx) >= info_offset);
-
-        const size_t count = gguf_get_data_offset(ctx) - info_offset;
-
-        file.write_zeros(count);
-        file.seek(info_offset, SEEK_SET);
-    }
-
-    size_t write_tensor_info(gguf_load_tensor & tensor, enum ggml_type type) {
-        size_t total_written = 0;
-        file.seek(info_offset, SEEK_SET);
-        total_written += file.write_str(tensor.name);
-
-        int32_t n_dims = tensor.ne.size();
-        total_written += file.write_i32(n_dims);
-        for (int32_t i = 0; i < n_dims; ++i) {
-            total_written += file.write_i32(tensor.ne[i]);
-        }
-
-        total_written += file.write_i32(type);
-        total_written += file.write_u64(tensor_offset);
-        info_offset += total_written; // position to write info of the next tensor
-
-        file.seek(0, SEEK_END);
-
-        return total_written;
-    }
-
-    void write_tensor(gguf_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
-        switch (new_type) {
-            case GGML_TYPE_F32:
-            case GGML_TYPE_F16:
-            case GGML_TYPE_Q4_0:
-            case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q5_0:
-            case GGML_TYPE_Q5_1:
-            case GGML_TYPE_Q8_0:
-            case GGML_TYPE_Q2_K:
-            case GGML_TYPE_Q3_K:
-            case GGML_TYPE_Q4_K:
-            case GGML_TYPE_Q5_K:
-            case GGML_TYPE_Q6_K:
-                break;
-            default: GGML_ASSERT(false);
-        }
-
-        write_tensor_info(tensor, new_type);
-        file.write_raw(new_data, new_size);
-        size_t padded_size = GGML_PAD(new_size, GGUF_DEFAULT_ALIGNMENT); // TODO: handle custom alignment
-        size_t pad = padded_size - new_size;
-        file.write_zeros(pad);
-        tensor_offset += padded_size; // offset of the next tensor
-    }
-};
-
 struct llama_model_loader {
     std::unique_ptr<gguf_file_loader> file_loader;
     gguf_load_tensors_map tensors_map;
@@ -897,7 +739,6 @@ struct llama_model_loader {
             tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
         }
         ggml_set_name(tensor, lt.name.c_str());
-        GGML_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
 
         if (backend != GGML_BACKEND_CPU) {
             ggml_set_no_alloc(ggml_ctx, use_mmap);
@@ -3245,7 +3086,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
-    gguf_file_saver file_saver(fname_out.c_str(), model_loader->file_loader->gguf_ctx);
+
+    struct gguf_context * ctx_out = gguf_init_empty();
+
+    // copy the KV pairs from the input file
+    gguf_set_kv     (ctx_out, model_loader->file_loader->gguf_ctx);
+    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -3279,6 +3125,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<uint8_t> read_data;
     std::vector<uint8_t> work;
 
+    std::vector<std::vector<uint8_t>> work_map(model_loader->tensors_map.tensors.size());
+
     for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
         read_data.resize(tensor.size);
         tensor.data = read_data.data();
@@ -3437,12 +3285,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
         total_size_org += tensor.size;
        total_size_new += new_size;
-        file_saver.write_tensor(tensor, new_type, new_data, new_size);
+
+        // TODO: temp fix until we have stream support in gguf
+        work_map[idx - 1] = std::vector<uint8_t>((char *) new_data, (char *) new_data + new_size);
+
+        gguf_add_tensor_ex(ctx_out, tensor.ggml_tensor, new_type, work_map[idx - 1].data(), new_size);
     }
 
+    gguf_write_to_file(ctx_out, fname_out.c_str());
+    gguf_free(ctx_out);
+
     LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
     LLAMA_LOG_INFO("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
 
+    // print histogram for all tensors
     {
         int64_t sum_all = 0;
         for (size_t i = 0; i < hist_all.size(); i++) {
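
For reference, a minimal sketch of the write path that this patch switches to, using only the API added or called above (gguf_init_empty, gguf_set_kv, gguf_set_val_u32, gguf_add_tensor_ex, gguf_write_to_file, gguf_free). It is not part of the patch: the helper name and the src/q_data/q_size parameters are hypothetical stand-ins for the real call sites in llama_model_quantize_internal.

#include "ggml.h"

// usage sketch only; names are illustrative, not taken from the patch
static void gguf_write_quantized_sketch(
        const char * fname_out,
        struct gguf_context * src,             // gguf context loaded from the input file
        const struct ggml_tensor * tensor,     // tensor read from the input file
        enum ggml_type new_type,               // quantized type, e.g. GGML_TYPE_Q4_K
        const void * q_data, size_t q_size) {  // quantized buffer for this tensor
    struct gguf_context * ctx_out = gguf_init_empty();

    // carry over the KV section from the input file and stamp the quantization version
    gguf_set_kv     (ctx_out, src);
    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);

    // gguf_add_tensor(ctx_out, tensor) would keep the tensor's type/data/size as-is;
    // the _ex variant overrides them with the quantized buffer
    gguf_add_tensor_ex(ctx_out, tensor, new_type, q_data, q_size);

    // writes the header, KV pairs, tensor infos and the padded data section in one pass
    gguf_write_to_file(ctx_out, fname_out);
    gguf_free(ctx_out);
}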