From 224656755309ed86ff820f3451199f3c8ed0d5f8 Mon Sep 17 00:00:00 2001 From: zhentaoyu Date: Fri, 10 Nov 2023 10:11:46 +0800 Subject: [PATCH] [LLM Runtime] Unify KV_cache and Support Batch-dim Process in Beam Search (#583) --- .../llm/runtime/graph/__init__.py | 13 +- .../runtime/graph/application/main_pybind.cpp | 111 ++- .../runtime/graph/application/main_run.cpp | 39 +- .../runtime/graph/application/pybind_gptj.cpp | 37 +- .../llm/runtime/graph/core/ne.h | 2 +- .../llm/runtime/graph/core/ne_layers.c | 45 +- .../llm/runtime/graph/core/ne_layers.h | 10 +- .../graph/models/baichuan/baichuan.cpp | 25 +- .../llm/runtime/graph/models/bloom/bloom.cpp | 25 +- .../runtime/graph/models/chatglm/chatglm.cpp | 26 +- .../runtime/graph/models/chatglm/chatglm2.cpp | 25 +- .../runtime/graph/models/falcon/falcon.cpp | 25 +- .../llm/runtime/graph/models/gptj/gptj.cpp | 100 +-- .../runtime/graph/models/gptneox/gptneox.cpp | 69 +- .../graph/models/gptneox/gptneox_utils.cpp | 123 +-- .../llm/runtime/graph/models/llama/llama.cpp | 29 +- .../graph/models/model_utils/model_types.h | 48 ++ .../graph/models/model_utils/model_utils.cpp | 739 +++++++++++------- .../graph/models/model_utils/model_utils.h | 126 ++- .../llm/runtime/graph/models/mpt/mpt.cpp | 25 +- .../llm/runtime/graph/models/opt/opt.cpp | 25 +- .../graph/models/starcoder/starcoder.cpp | 24 +- tests/test_llm_runtime.py | 42 +- 23 files changed, 1057 insertions(+), 676 deletions(-) diff --git a/intel_extension_for_transformers/llm/runtime/graph/__init__.py b/intel_extension_for_transformers/llm/runtime/graph/__init__.py index aaeab8d16a7..0265809bde9 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/__init__.py +++ b/intel_extension_for_transformers/llm/runtime/graph/__init__.py @@ -97,7 +97,8 @@ def quant_model(self, model_name, model_path, out_path, **kwargs): def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=False, **kwargs): if self.model is None: - self.init_from_bin(self.model_type, self.bin_file, **kwargs) + self.init_from_bin(self.model_type, self.bin_file, batch_size=input_ids.shape[0], + **kwargs) self.generate_round = 0 elif not interactive: self.model.reinit() @@ -107,12 +108,13 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa if self.generate_round == 0 and not ignore_prompt: ret = input_ids.tolist() - # TODO support multi batch - assert input_ids.shape[0] == 1, "Unsupport multi-batch input ids." beam_search = False if ("num_beams" in kwargs and kwargs["num_beams"] > 1) and not \ kwargs.get("do_sample", False): beam_search = True + if not beam_search: + # TODO support multi batch + assert input_ids.shape[0] == 1, "Unsupport multi-batch input ids." 
if streamer: if beam_search: print("ERROR, can not use streamer when use beam search for generation!") @@ -130,7 +132,10 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa ret[0].extend(out) streamer.end() else: - ret[0].extend(self.model.generate_tokens(input_ids = input_ids.tolist()[0])) + response = self.model.generate_tokens(input_ids = input_ids.tolist()) + assert (len(ret) == len(response)) + for i in range(len(response)): + ret[i].extend(response[i]) self.generate_round += 1 return ret diff --git a/intel_extension_for_transformers/llm/runtime/graph/application/main_pybind.cpp b/intel_extension_for_transformers/llm/runtime/graph/application/main_pybind.cpp index f5e41098c80..b6e4d7378d7 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/application/main_pybind.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/application/main_pybind.cpp @@ -55,13 +55,13 @@ class Model { ~Model() { if (ctx) model_free(ctx); } - void init_model(const std::string& model_path, int n_predict, int batch_size, int ctx_size, int seed, int threads, + void init_model(const std::string& model_path, int n_predict, int n_batch, int ctx_size, int seed, int threads, float repetition_penalty, int num_beams, bool do_sample, int top_k, float top_p, float temperature, int min_new_tokens, float length_penalty, bool early_stopping, int n_keep, int n_discard, - bool shift_roped_k); + bool shift_roped_k, int batch_size, model_vocab::id pad_token); void reinit(); std::vector generate(const std::vector& input_ids); - std::vector generate_tokens(const std::vector& input_ids); + std::vector> generate_tokens(const std::vector>& input_ids); bool is_token_end() { return token_eos; } static int quant_model(const std::string& model_path, const std::string& out_path, const std::string& weight_dtype, const std::string& alg, int group_size, const std::string& scale_dtype, @@ -86,28 +86,29 @@ class Model { model_token post_process(float* logits); model_token post_greedy_search(float* logits); - std::vector post_beam_search(model_context* lctx, const int& n_predict, const model_token* tokens_inp, - const int& n_tokens, const int& n_threads); + std::vector> post_beam_search(model_context* lctx, const int& n_predict, + const std::vector& inputs, const int& n_threads); model_token post_sample_top_k_top_p_repeat(float* logits); }; -void Model::init_model(const std::string& model_path, int max_new_tokens, int batch_size, int ctx_size, int seed, +void Model::init_model(const std::string& model_path, int max_new_tokens, int n_batch, int ctx_size, int seed, int threads, float repetition_penalty, int num_beams, bool do_sample, int top_k, float top_p, float temperature, int min_new_tokens, float length_penalty, bool early_stopping, int n_keep, - int n_discard, bool shift_roped_k) { + int n_discard, bool shift_roped_k, int batch_size, model_vocab::id pad_token) { #ifdef MODEL_NAME params.model_name = MODEL_NAME; #endif params.model_arch = model_name_to_arch::init().find(params.model_name); params.model = model_path; params.n_predict = max_new_tokens; - params.n_batch = batch_size; + params.n_batch = n_batch; params.n_ctx = ctx_size; params.seed = seed; params.n_threads = threads; params.repeat_penalty = repetition_penalty; params.beam_size = num_beams; params.do_sample = do_sample; + params.batch_size = batch_size; params.beam_search = (num_beams > 1 && !do_sample) ? 
true : false; if (params.beam_search) { params.memory_type = KV_MEM_TYPE_F16; // TODO NO MHA IN BEAM SEARCH @@ -133,6 +134,7 @@ void Model::init_model(const std::string& model_path, int max_new_tokens, int ba ctx->generation_conf.min_new_tokens = min_new_tokens; ctx->generation_conf.length_penalty = length_penalty; ctx->generation_conf.do_early_stopping = early_stopping; + if (pad_token != -1) ctx->vocab.pad_token_id = pad_token; } void Model::reinit() { @@ -177,7 +179,18 @@ std::vector Model::generate(const std::vector& input_i NE_ASSERT(("n_discard cannot be used with shift_roped_k!", n_discard == -1 || n_discard == 1)); } } - model_eval(ctx, &curr_input_ids[0], curr_input_ids.size(), n_past, n_total, params.n_threads); + std::vector inputs = {model_input{ + /*.tokens =*/curr_input_ids.data(), + /*.n_tokens =*/(uint32_t)curr_input_ids.size(), + /*.n_prompt_tokens =*/0, + /*.n_past =*/(uint32_t)n_past, + /*.n_total =*/(uint32_t)n_total, + /*.request_idx =*/0, + /*.beam_idx =*/0, + /*.padding_side =*/0, + /*n_padding =*/0, + }}; + model_eval(ctx, inputs.data(), inputs.size(), params.n_threads); n_past += curr_input_ids.size(); n_total += curr_input_ids.size(); @@ -196,18 +209,52 @@ std::vector Model::generate(const std::vector& input_i return {next_token_id}; } -std::vector Model::generate_tokens(const std::vector& input_ids) { +std::vector> Model::generate_tokens(const std::vector>& input_ids) { int n_remain = params.n_predict; std::vector output_ids; + std::vector> rets; + + if (ctx->beam_search) { + MODEL_ASSERT(input_ids.size() == ctx->batch_size); + if (ctx->batch_size > 1 && ctx->vocab.pad_token_id == -1) { + fprintf(stderr, "\nERROR: please set pad_token for beam search multi-batch generation!\n"); + return rets; + } + std::vector inputs; + for (int bs = 0; bs < input_ids.size(); ++bs) { + uint32_t count = 0; + model_vocab::id pad_token_id = ctx->vocab.pad_token_id; + auto iter = std::find_if(input_ids[bs].begin(), input_ids[bs].end(), + [&pad_token_id](model_token t) { return (t != pad_token_id); }); + if (iter == input_ids[bs].end()) fprintf(stderr, "\nERROR: there are all pad tokens in batch %d!\n", bs); + count = std::distance(input_ids[bs].begin(), iter); + inputs.push_back(model_input{ + /*.tokens =*/input_ids[bs].data(), + /*.n_tokens =*/(uint32_t)input_ids[bs].size(), + /*.n_prompt_tokens =*/0, + /*.n_past =*/0, + /*.n_total =*/0, + /*.request_idx =*/bs, + /*.beam_idx =*/0, + /*.padding_side =*/0, + /*n_padding =*/count, + }); + } + return post_beam_search(ctx, n_remain, inputs, params.n_threads); + } + if (input_ids.size() > 1) { + fprintf(stderr, "\nERROR: Only beam search supports multi-batch generation!\n"); + return rets; + } if (curr_input_ids.empty()) { - if (input_ids.size() > n_ctx - 4) { + if (input_ids[0].size() > n_ctx - 4) { fprintf(stderr, "\n%s: Warning: prompt is too long (%d tokens, max %d), will be truncated\n", __func__, - input_ids.size(), n_ctx - 4); + input_ids[0].size(), n_ctx - 4); curr_input_ids.resize(n_ctx - 4); - std::copy(input_ids.end() - n_ctx - 4, input_ids.end(), curr_input_ids.begin()); + std::copy(input_ids[0].end() - n_ctx - 4, input_ids[0].end(), curr_input_ids.begin()); } else { - curr_input_ids = input_ids; + curr_input_ids = input_ids[0]; } } @@ -231,11 +278,18 @@ std::vector Model::generate_tokens(const std::vector& NE_ASSERT(("n_discard cannot be used with shift_roped_k!", n_discard == -1 || n_discard == 1)); } } - if (ctx->beam_search) { - output_ids = post_beam_search(ctx, n_remain, curr_input_ids.data(), curr_input_ids.size(), 
params.n_threads); - break; - } - model_eval(ctx, &curr_input_ids[0], curr_input_ids.size(), n_past, n_total, params.n_threads); + std::vector inputs = {model_input{ + /*.tokens =*/curr_input_ids.data(), + /*.n_tokens =*/(uint32_t)curr_input_ids.size(), + /*.n_prompt_tokens =*/0, + /*.n_past =*/(uint32_t)n_past, + /*.n_total =*/(uint32_t)n_total, + /*.request_idx =*/0, + /*.beam_idx =*/0, + /*.padding_side =*/0, + /*n_padding =*/0, + }}; + model_eval(ctx, inputs.data(), inputs.size(), params.n_threads); n_past += curr_input_ids.size(); n_total += curr_input_ids.size(); @@ -253,8 +307,8 @@ std::vector Model::generate_tokens(const std::vector& break; } } - - return output_ids; + rets.push_back(output_ids); + return rets; } model_token Model::post_greedy_search(float* logits) { @@ -262,16 +316,16 @@ model_token Model::post_greedy_search(float* logits) { return id; } -std::vector Model::post_beam_search(model_context* lctx, const int& n_predict, - const model_token* tokens_inp, const int& n_tokens, - const int& n_threads) { +std::vector> Model::post_beam_search(model_context* lctx, const int& n_predict, + const std::vector& inputs, + const int& n_threads) { // TODO: to implement static std::set supported_archs = {MODEL_GPTJ, MODEL_GPTNEOX}; if (supported_archs.count(params.model_arch) != 0) { - return beam_search(lctx, n_predict, tokens_inp, n_tokens, n_threads); + return beam_search(lctx, n_predict, inputs, n_threads); } else { fprintf(stderr, "\nERROR: this model does not support beam search generation!\n"); - return std::vector(); + return std::vector>(); } } @@ -416,11 +470,12 @@ PYBIND11_MODULE(mistral_cpp, m) py::class_(m, "Model", py::module_local()) .def(py::init()) .def("init_model", &Model::init_model, "initial model with model path and parameters", py::arg("model_path"), - py::arg("max_new_tokens") = -1, py::arg("batch_size") = 512, py::arg("ctx_size") = 512, py::arg("seed") = -1, + py::arg("max_new_tokens") = -1, py::arg("n_batch") = 512, py::arg("ctx_size") = 512, py::arg("seed") = -1, py::arg("threads") = 8, py::arg("repetition_penalty") = 1.1f, py::arg("num_beams") = 1, py::arg("do_sample") = false, py::arg("top_k") = 40, py::arg("top_p") = 0.95, py::arg("temperature") = 0.8, py::arg("min_new_tokens") = 0, py::arg("length_penalty") = 1.0, py::arg("early_stopping") = false, - py::arg("n_keep") = 0, py::arg("n_discard") = -1, py::arg("shift_roped_k") = false) + py::arg("n_keep") = 0, py::arg("n_discard") = -1, py::arg("shift_roped_k") = false, + py::arg("batch_size") = 1, py::arg("pad_token") = -1) .def("generate", &Model::generate, "Generate token with input ids", py::arg("input_ids")) .def("generate_tokens", &Model::generate_tokens, "Generate tokens with input ids", py::arg("input_ids")) .def_static("quant_model", &Model::quant_model, "Quantize model", py::arg("model_path"), py::arg("out_path"), diff --git a/intel_extension_for_transformers/llm/runtime/graph/application/main_run.cpp b/intel_extension_for_transformers/llm/runtime/graph/application/main_run.cpp index ef43a7dbb34..8bb1cabee8d 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/application/main_run.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/application/main_run.cpp @@ -149,14 +149,36 @@ int main(int argc, char** argv) { if (params.mem_test) { { const std::vector tmp(params.n_batch, ctx->vocab.bos_token_id); - model_eval(ctx, tmp.data(), tmp.size(), 0, 0, params.n_threads); + std::vector inputs = {model_input{ + /*.tokens =*/tmp.data(), + /*.n_tokens =*/(uint32_t)tmp.size(), + 
/*.n_prompt_tokens =*/0, + /*.n_past =*/0, + /*.n_total =*/0, + /*.request_idx =*/0, + /*.beam_idx =*/0, + /*.padding_side =*/0, + /*n_padding =*/0, + }}; + model_eval(ctx, inputs.data(), inputs.size(), params.n_threads); } { const std::vector tmp = { 0, }; - model_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_predict - 1, params.n_threads); + std::vector inputs = {model_input{ + /*.tokens =*/tmp.data(), + /*.n_tokens =*/(uint32_t)tmp.size(), + /*.n_prompt_tokens =*/0, + /*.n_past =*/(uint32_t)(params.n_predict - 1), + /*.n_total =*/(uint32_t)(params.n_predict - 1), + /*.request_idx =*/0, + /*.beam_idx =*/0, + /*.padding_side =*/0, + /*n_padding =*/0, + }}; + model_eval(ctx, inputs.data(), inputs.size(), params.n_threads); } model_print_timings(ctx); @@ -436,7 +458,18 @@ int main(int argc, char** argv) { if (n_eval > params.n_batch) { n_eval = params.n_batch; } - if (model_eval(ctx, &embd[i], n_eval, n_past, n_total, params.n_threads)) { + std::vector inputs = {model_input{ + /*.tokens =*/&embd[i], + /*.n_tokens =*/(uint32_t)n_eval, + /*.n_prompt_tokens =*/0, + /*.n_past =*/(uint32_t)n_past, + /*.n_total =*/(uint32_t)n_total, + /*.request_idx =*/0, + /*.beam_idx =*/0, + /*.padding_side =*/0, + /*n_padding =*/0, + }}; + if (model_eval(ctx, inputs.data(), inputs.size(), params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp b/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp index 5e62505be75..93f0e7489c2 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp @@ -40,7 +40,16 @@ bool gptj_model_eval_ids(model_context* ctx, model_token* tokens, size_t n_eval, return 1; } - if (model_eval(ctx, tokens, n_eval, n_past, n_past, n_threads)) { + std::vector inputs = {model_input{ + /*.tokens =*/tokens, + /*.n_tokens =*/static_cast(n_eval), + /*.n_prompt_tokens =*/0, + /*.n_past =*/static_cast(n_past), + /*.n_total =*/static_cast(n_past), + /*.request_idx =*/0, + /*.beam_idx =*/0, + }}; + if (model_eval(ctx, inputs.data(), inputs.size(), n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); return 1; } @@ -98,7 +107,16 @@ int32_t* eval_gptj_ids(void* ctx, int32_t* embd_inp_ptr, int ind_size, int n_pre bool do_beam_search = lctx->beam_search; if (do_beam_search) { - res = beam_search(lctx, n_predict, embd_inp_ptr, ind_size, n_threads); + std::vector inputs = {model_input{ + /*.tokens =*/embd_inp_ptr, + /*.n_tokens =*/static_cast(ind_size), + /*.n_prompt_tokens =*/0, + /*.n_past =*/0, + /*.n_total =*/0, + /*.request_idx =*/0, + /*.beam_idx =*/0, + }}; + res = beam_search(lctx, n_predict, inputs, n_threads)[0]; } else { std::vector embd_inp(embd_inp_ptr, embd_inp_ptr + ind_size); std::vector embd; @@ -157,7 +175,18 @@ char* eval_gptj_char(void* ctx, const char* prom, int n_predict, int top_k, floa bool do_beam_search = lctx->beam_search; if (do_beam_search) { - embd = beam_search(lctx, n_predict, embd_inp.data(), embd_inp.size(), N_threads); + std::vector inputs = {model_input{ + /*.tokens =*/embd_inp.data(), + /*.n_tokens =*/static_cast(embd_inp.size()), + /*.n_prompt_tokens =*/0, + /*.n_past =*/0, + /*.n_total =*/0, + /*.request_idx =*/0, + /*.beam_idx =*/0, + /*.padding_side =*/0, + /*n_padding =*/0, + }}; + embd = beam_search(lctx, n_predict, inputs, N_threads)[0]; for (auto id : embd_inp) { res += 
model_token_to_str(lctx, id); } @@ -229,7 +258,7 @@ int main(int argc, char* argv[]) { for (auto gptj_in_all : ctxs) { auto res = eval_gptj_char( gptj_in_all, - // "she opened the door and see", + //"she opened the door and see", // "Once upon a time", // "Tell me 10 things about jazz music", // "A spaceship lands on the moon", diff --git a/intel_extension_for_transformers/llm/runtime/graph/core/ne.h b/intel_extension_for_transformers/llm/runtime/graph/core/ne.h index 4f1d9c21294..6af00376d02 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/core/ne.h +++ b/intel_extension_for_transformers/llm/runtime/graph/core/ne.h @@ -40,7 +40,7 @@ #define NE_FILE_VERSION 1 #define NE_MAX_DIMS 4 -#define NE_MAX_NODES 4096 +#define NE_MAX_NODES 8192 #define NE_MAX_PARAMS 256 #define NE_MAX_CONTEXTS 64 #define NE_MAX_OPT 4 diff --git a/intel_extension_for_transformers/llm/runtime/graph/core/ne_layers.c b/intel_extension_for_transformers/llm/runtime/graph/core/ne_layers.c index 7e4511314df..f4e296b99d5 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/core/ne_layers.c +++ b/intel_extension_for_transformers/llm/runtime/graph/core/ne_layers.c @@ -2832,7 +2832,9 @@ struct ne_tensor* ne_diag(struct ne_context* ctx, struct ne_tensor* a) { // ne_diag_mask_inf -struct ne_tensor* ne_diag_mask_inf_impl(struct ne_context* ctx, struct ne_tensor* a, int n_past, bool inplace) { +struct ne_tensor* ne_diag_mask_inf_impl(struct ne_context* ctx, struct ne_tensor* a, int n_past, bool inplace, + int* n_padding, bool padding_left) { + NE_ASSERT(padding_left); bool is_node = false; if (a->grad) { @@ -2843,10 +2845,18 @@ struct ne_tensor* ne_diag_mask_inf_impl(struct ne_context* ctx, struct ne_tensor ne_scratch_save(ctx); - struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, 2, NE_SIZE_CALC); + const int bs = a->ne[3]; + struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, 2 + bs, NE_SIZE_CALC); ((int32_t*)b->data)[0] = n_past; ((int32_t*)b->data)[1] = inplace ? 
1 : 0; + for (int i = 0; i < bs; ++i) { + if (n_padding == NULL) { + ((int32_t*)b->data)[2 + i] = 0; + } else { + ((int32_t*)b->data)[2 + i] = *(n_padding + i); + } + } ne_scratch_load(ctx); @@ -2859,11 +2869,21 @@ struct ne_tensor* ne_diag_mask_inf_impl(struct ne_context* ctx, struct ne_tensor } struct ne_tensor* ne_diag_mask_inf(struct ne_context* ctx, struct ne_tensor* a, int n_past) { - return ne_diag_mask_inf_impl(ctx, a, n_past, false); + return ne_diag_mask_inf_impl(ctx, a, n_past, false, NULL, true); } struct ne_tensor* ne_diag_mask_inf_inplace(struct ne_context* ctx, struct ne_tensor* a, int n_past) { - return ne_diag_mask_inf_impl(ctx, a, n_past, true); + return ne_diag_mask_inf_impl(ctx, a, n_past, true, NULL, true); +} + +struct ne_tensor* ne_diag_mask_inf_with_padding(struct ne_context* ctx, struct ne_tensor* a, int n_past, + int* n_padding) { + return ne_diag_mask_inf_impl(ctx, a, n_past, false, n_padding, true); +} + +struct ne_tensor* ne_diag_mask_inf_with_padding_inplace(struct ne_context* ctx, struct ne_tensor* a, int n_past, + int* n_padding) { + return ne_diag_mask_inf_impl(ctx, a, n_past, true, n_padding, true); } // ne_diag_mask_zero @@ -7302,7 +7322,8 @@ static void ne_compute_forward_diag(const struct ne_compute_params* params, cons static void ne_compute_forward_diag_mask_f32(const struct ne_compute_params* params, const struct ne_tensor* src0, const struct ne_tensor* src1, struct ne_tensor* dst, const float value) { assert(src1->type == NE_TYPE_I32); - assert(ne_nelements(src1) == 2); + const int bs = src0->ne[3]; + assert(ne_nelements(src1) == (2 + bs)); const int ith = params->ith; const int nth = params->nth; @@ -7334,6 +7355,18 @@ static void ne_compute_forward_diag_mask_f32(const struct ne_compute_params* par assert(dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); + // mask padding token (padding left) + for (int b = 0; b < bs; b++) { + const int n_padding = ((int32_t*)src1->data)[2 + b]; + if (n_padding == 0) continue; + for (int k = 0; k < (nz / bs); k++) { + for (int j = ith; j < nr; j += nth) { + // it will not affect next token if don't mask the pad_token row + ne_vec_set_f32(n_padding, (float*)((char*)dst->data + b * dst->nb[3] + k * dst->nb[2] + j * dst->nb[1]), value); + } + } + } + for (int k = 0; k < nz; k++) { for (int j = ith; j < nr; j += nth) { for (int i = n_past; i < nc; i++) { @@ -7410,7 +7443,7 @@ static void ne_compute_forward_soft_max_f32(const struct ne_compute_params* para float max = -INFINITY; ne_vec_max_f32(nc, &max, sp); - ne_float sum = 0.0; + ne_float sum = (max == -INFINITY) ? 
(1.0 * nc) : 0.0; uint16_t scvt; for (int i = 0; i < nc; i++) { diff --git a/intel_extension_for_transformers/llm/runtime/graph/core/ne_layers.h b/intel_extension_for_transformers/llm/runtime/graph/core/ne_layers.h index 5bc869abc5e..d82874b5539 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/core/ne_layers.h +++ b/intel_extension_for_transformers/llm/runtime/graph/core/ne_layers.h @@ -44,7 +44,7 @@ #define NE_QNT_VERSION_FACTOR 1000 // do not change this #define NE_MAX_DIMS 4 -#define NE_MAX_NODES 4096 +#define NE_MAX_NODES 8192 #define NE_MAX_PARAMS 256 #define NE_MAX_CONTEXTS 64 #define NE_MAX_OPT 4 @@ -379,6 +379,14 @@ NE_API struct ne_tensor* ne_diag_mask_inf(struct ne_context* ctx, struct ne_tens // in-place, returns view(a) NE_API struct ne_tensor* ne_diag_mask_inf_inplace(struct ne_context* ctx, struct ne_tensor* a, int n_past); +// set elements above the diagonal and padding tokens to -INF +NE_API struct ne_tensor* ne_diag_mask_inf_with_padding(struct ne_context* ctx, struct ne_tensor* a, int n_past, + int* n_padding); + +// in-place, returns view(a) +NE_API struct ne_tensor* ne_diag_mask_inf_with_padding_inplace(struct ne_context* ctx, struct ne_tensor* a, int n_past, + int* n_padding); + // set elements above the diagonal to 0 NE_API struct ne_tensor* ne_diag_mask_zero(struct ne_context* ctx, struct ne_tensor* a, int n_past); diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/baichuan/baichuan.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/baichuan/baichuan.cpp index 1ab7182d483..8f07e5c7db4 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/baichuan/baichuan.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/baichuan/baichuan.cpp @@ -41,18 +41,20 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // -static bool baichuan_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool baichuan_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { const int64_t t_start_us = ne_time_us(); - - const int N = n_tokens; + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; const int batch_size = lctx.batch_size; + MODEL_ASSERT(batch_size == n_input); const auto& model = lctx.model; const auto& hparams = model.hparams; @@ -114,7 +116,9 @@ static bool baichuan_model_eval_internal(model_context& lctx, const model_token* } struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N); - memcpy(embd->data, tokens, N * ne_element_size(embd)); + for (int i = 0; i < batch_size; ++i) { + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); + } struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); int hidden_size = inpL->ne[0]; @@ -329,9 +333,8 @@ static bool baichuan_model_eval_internal(model_context& lctx, const model_token* return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!baichuan_model_eval_internal(*ctx, 
tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!baichuan_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/bloom/bloom.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/bloom/bloom.cpp index bca8e2e2faa..7829647d089 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/bloom/bloom.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/bloom/bloom.cpp @@ -40,19 +40,21 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // -static bool bloom_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool bloom_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { const int64_t t_start_us = ne_time_us(); - - const int N = n_tokens; + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; const int batch_size = lctx.batch_size; + MODEL_ASSERT(batch_size == n_input); const auto& model = lctx.model; const auto& hparams = model.hparams; @@ -88,7 +90,9 @@ static bool bloom_model_eval_internal(model_context& lctx, const model_token* to struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N); ne_set_name(embd, "embd"); - memcpy(embd->data, tokens, N * ne_element_size(embd)); + for (int i = 0; i < batch_size; ++i) { + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); + } // wte struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); @@ -307,9 +311,8 @@ static bool bloom_model_eval_internal(model_context& lctx, const model_token* to return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!bloom_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!bloom_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm.cpp index 69d2e6c7b9b..131311eac9d 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm.cpp @@ -40,21 +40,24 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // static int flag = 0; static int first_tokens_size = 0; -static bool 
chatglm_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool chatglm_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { const int64_t t_start_us = ne_time_us(); - const int N = n_tokens; + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; const int batch_size = lctx.batch_size; + MODEL_ASSERT(batch_size == n_input); const auto& model = lctx.model; const auto& hparams = model.hparams; @@ -68,7 +71,7 @@ static bool chatglm_model_eval_internal(model_context& lctx, const model_token* const int n_keep = lctx.n_keep; if (flag == 0) { - first_tokens_size = n_tokens; + first_tokens_size = N; flag++; } @@ -95,7 +98,9 @@ static bool chatglm_model_eval_internal(model_context& lctx, const model_token* struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N); ne_set_name(embd, "embd"); - memcpy(embd->data, tokens, N * ne_element_size(embd)); + for (int i = 0; i < batch_size; ++i) { + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); + } struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); @@ -304,9 +309,8 @@ static bool chatglm_model_eval_internal(model_context& lctx, const model_token* return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!chatglm_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!chatglm_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm2.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm2.cpp index 840b23c6c6b..e1240d5e84d 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm2.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm2.cpp @@ -41,20 +41,24 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // -static bool chatglm_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool chatglm_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { const int64_t t_start_us = ne_time_us(); - const int N = n_tokens; + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; const auto& model = lctx.model; const auto& hparams = model.hparams; + const int batch_size = lctx.batch_size; + MODEL_ASSERT(batch_size == n_input); const auto& kv_self = model.kv_self; @@ -118,7 +122,9 @@ static bool chatglm_model_eval_internal(model_context& lctx, const model_token* struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N); ne_set_name(embd, "embd"); - 
memcpy(embd->data, tokens, N * ne_element_size(embd)); + for (int i = 0; i < batch_size; ++i) { + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); + } struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); NE_ASSERT(N == inpL->ne[1]); @@ -359,9 +365,8 @@ static bool chatglm_model_eval_internal(model_context& lctx, const model_token* return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!chatglm_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!chatglm_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/falcon/falcon.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/falcon/falcon.cpp index b5a18717c00..d7d88fe215d 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/falcon/falcon.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/falcon/falcon.cpp @@ -40,19 +40,21 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // -static bool falcon_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool falcon_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { const int64_t t_start_us = ne_time_us(); - - const int N = n_tokens; + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; const int batch_size = lctx.batch_size; + MODEL_ASSERT(batch_size == n_input); const auto& model = lctx.model; const auto& hparams = model.hparams; @@ -115,7 +117,9 @@ static bool falcon_model_eval_internal(model_context& lctx, const model_token* t struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N); ne_set_name(embd, "embd"); - memcpy(embd->data, tokens, N * ne_element_size(embd)); + for (int i = 0; i < batch_size; ++i) { + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); + } // wte struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); @@ -369,9 +373,8 @@ static bool falcon_model_eval_internal(model_context& lctx, const model_token* t return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!falcon_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!falcon_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/gptj/gptj.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/gptj/gptj.cpp index 847dc8c4b59..0e1750b6c45 100644 --- 
a/intel_extension_for_transformers/llm/runtime/graph/models/gptj/gptj.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/gptj/gptj.cpp @@ -45,18 +45,29 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // -static bool gptj_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool gptj_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { const int64_t t_start_us = ne_time_us(); const int batch_size = lctx.batch_size; // num of beams of all batches - const int N = n_tokens; - + MODEL_ASSERT(batch_size == n_input); + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; + const int beam_size = lctx.beam_search ? lctx.beam_size : 1; + std::vector block_ids; + std::vector n_padding; + bool no_padding = true; + for (int i = 0; i < batch_size; ++i) { + block_ids.push_back((inputs + i)->request_idx * beam_size + (inputs + i)->beam_idx); + n_padding.push_back((inputs + i)->n_padding); + if (no_padding && (inputs + i)->n_padding != 0) no_padding = false; + } const auto& model = lctx.model; const auto& hparams = model.hparams; @@ -132,7 +143,7 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N * batch_size); ne_set_name(embd, "embd"); for (int i = 0; i < batch_size; ++i) { - memcpy(static_cast(embd->data) + i * N, tokens + i * N, N * ne_element_size(embd)); + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); } #ifdef NE_TP_MODEL @@ -189,7 +200,10 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok std::vector Vcur_bs(batch_size); std::vector k_bs(batch_size); std::vector v_bs(batch_size); + // cache = [tokens, beams, requests, layers], + // tokens = [head_dim, head_num, n_ctx] (may different orders) for (int i = 0; i < batch_size; ++i) { + const int block_idx = block_ids[i]; if (run_mha_fp16) { // batch V Vcur_bs[i] = @@ -199,7 +213,7 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok v_bs[i] = ne_view_1d(ctx0, kv_self.v, head_size * n_head * N * 1, (ne_element_size(kv_self.v) * head_size * n_head) * (il * n_ctx * kv_n_ctx_block + n_past) + - i * n_ctx * head_size * n_head * ne_element_size(kv_self.v)); + block_idx * n_ctx * head_size * n_head * ne_element_size(kv_self.v)); // batch K Kcur_bs[i] = ne_permute( ctx0, @@ -208,21 +222,25 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok i * ne_element_size(Kcur) * head_size * n_head * N), head_size, n_head, N, 1), 1, 2, 0, 3); - k_bs[i] = ne_view_4d( - ctx0, kv_self.k, N, head_size, n_head, 1, n_ctx * ne_element_size(kv_self.k), - n_ctx * ne_element_size(kv_self.k) * head_size, n_ctx * ne_element_size(kv_self.k) * head_size * n_head, - ((il * n_ctx) * ne_element_size(kv_self.k) * head_size * n_head * kv_n_ctx_block + - i * n_ctx * head_size * n_head * ne_element_size(kv_self.k) + n_past * ne_element_size(kv_self.k))); + k_bs[i] = ne_view_4d(ctx0, 
kv_self.k, N, head_size, n_head, 1, n_ctx * ne_element_size(kv_self.k), + n_ctx * ne_element_size(kv_self.k) * head_size, + n_ctx * ne_element_size(kv_self.k) * head_size * n_head, + ((il * n_ctx) * ne_element_size(kv_self.k) * head_size * n_head * kv_n_ctx_block + + block_idx * n_ctx * head_size * n_head * ne_element_size(kv_self.k) + + n_past * ne_element_size(kv_self.k))); } else { // batch K - Kcur_bs[i] = - ne_view_4d(ctx0, Kcur, head_size, n_head, N, 1, ne_element_size(Kcur) * head_size, - ne_element_size(Kcur) * head_size * n_head, ne_element_size(Kcur) * head_size * n_head * N, - i * ne_element_size(Kcur) * head_size * n_head * N); + Kcur_bs[i] = ne_permute(ctx0, + ne_view_4d(ctx0, Kcur, head_size, n_head, N, 1, ne_element_size(Kcur) * head_size, + ne_element_size(Kcur) * n_embd, ne_element_size(Kcur) * n_embd * N, + i * ne_element_size(Kcur) * n_embd * N), + 0, 2, 1, 3); k_bs[i] = - ne_view_1d(ctx0, kv_self.k, head_size * n_head * N * 1, - (ne_element_size(kv_self.k) * head_size * n_head) * (il * n_ctx * kv_n_ctx_block + n_past) + - i * n_ctx * head_size * n_head * ne_element_size(kv_self.k)); + ne_view_4d(ctx0, kv_self.k, head_size, N, n_head, 1, ne_element_size(kv_self.k) * head_size, + ne_element_size(kv_self.k) * head_size * n_ctx, ne_element_size(kv_self.k) * n_embd * n_ctx, + ((il * n_ctx) * ne_element_size(kv_self.k) * n_embd * kv_n_ctx_block + + block_idx * n_ctx * n_embd * ne_element_size(kv_self.k) + + head_size * n_past * ne_element_size(kv_self.k))); // batch V Vcur_bs[i] = ne_permute( @@ -232,11 +250,12 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok i * ne_element_size(Vcur) * head_size * n_head * N), head_size, n_head, N, 1), 1, 2, 0, 3); - v_bs[i] = ne_view_4d( - ctx0, kv_self.v, N, head_size, n_head, 1, n_ctx * ne_element_size(kv_self.v), - n_ctx * ne_element_size(kv_self.v) * head_size, n_ctx * ne_element_size(kv_self.v) * head_size * n_head, - ((il * n_ctx) * ne_element_size(kv_self.v) * head_size * n_head * kv_n_ctx_block + - i * n_ctx * head_size * n_head * ne_element_size(kv_self.v) + n_past * ne_element_size(kv_self.v))); + v_bs[i] = ne_view_4d(ctx0, kv_self.v, N, head_size, n_head, 1, n_ctx * ne_element_size(kv_self.v), + n_ctx * ne_element_size(kv_self.v) * head_size, + n_ctx * ne_element_size(kv_self.v) * head_size * n_head, + ((il * n_ctx) * ne_element_size(kv_self.v) * head_size * n_head * kv_n_ctx_block + + block_idx * n_ctx * head_size * n_head * ne_element_size(kv_self.v) + + n_past * ne_element_size(kv_self.v))); } ne_build_forward_expand(&gf, ne_cpy(ctx0, Kcur_bs[i], k_bs[i])); ne_build_forward_expand(&gf, ne_cpy(ctx0, Vcur_bs[i], v_bs[i])); @@ -305,24 +324,19 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok K = ne_permute(ctx0, K, 0, 2, 1, 3); } } else { - K = ne_view_4d(ctx0, kv_self.k, head_size, n_head, n_cached, batch_size, ne_element_size(kv_self.k) * head_size, - ne_element_size(kv_self.k) * head_size * n_head, - ne_element_size(kv_self.k) * head_size * n_head * n_ctx, - il * n_ctx * ne_element_size(kv_self.k) * head_size * n_head * kv_n_ctx_block); + K = model_kv_cache_seq_concat(&gf, &lctx, ctx0, head_size, n_cached, n_head, batch_size, block_ids, il); if (is_ring_full) { + K = ne_permute(ctx0, K, 0, 2, 1, 3); struct ne_tensor* cossin_cache = nullptr; // Currently we only cache cossin for N == 1 in model-wide; It may be worthwhile to cache cossin for other N in // a single eval execution if (N == 1) cossin_cache = kv_self.cossin; K = ne_rope_shift_inplace(ctx0, K, -N, 
n_rot, 0, 0, n_keep, cossin_cache); + K = ne_permute(ctx0, K, 0, 2, 1, 3); } - K = ne_permute(ctx0, K, 0, 2, 1, 3); // split cached V into n_head heads - V = ne_view_4d(ctx0, kv_self.v, n_cached, head_size, n_head, batch_size, n_ctx * ne_element_size(kv_self.v), - n_ctx * ne_element_size(kv_self.v) * head_size, - n_ctx * ne_element_size(kv_self.v) * head_size * n_head, - il * n_ctx * ne_element_size(kv_self.v) * head_size * n_head * kv_n_ctx_block); + V = model_kv_cache_seq_concat(&gf, &lctx, ctx0, n_cached, head_size, n_head, batch_size, block_ids, il, false); } ne_set_name(K, "K"); ne_set_name(V, "V"); @@ -365,8 +379,8 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok ne_set_name(KQ_scaled, "KQ_scaled"); // KQ_scaled = mask_past(KQ_scaled) - if (n_total == 0 || !shift_roped_k) { - KQ_scaled = ne_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + if (n_total == 0 || !shift_roped_k || !no_padding) { + KQ_scaled = ne_diag_mask_inf_with_padding_inplace(ctx0, KQ_scaled, n_past, n_padding.data()); ne_set_name(KQ_scaled, "KQ_masked"); } @@ -483,14 +497,11 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok size_t bs_stride = n_vocab * N; if (lctx.logits_all) { logits_out.resize(n_vocab * N * batch_size); - - for (int i = 0; i < batch_size; ++i) { - memcpy(logits_out.data() + i * bs_stride, (float*)ne_get_data(inpL) + (i * bs_stride), - sizeof(float) * n_vocab * N); - } + memcpy(logits_out.data(), (float*)ne_get_data(inpL), sizeof(float) * n_vocab * N * batch_size); } else { // return result for just the last token logits_out.resize(n_vocab * batch_size); +#pragma omp parallel for for (int i = 0; i < batch_size; ++i) { memcpy(logits_out.data() + (i * n_vocab), (float*)ne_get_data(inpL) + (i * bs_stride) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); @@ -526,9 +537,8 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!gptj_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!gptj_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/gptneox/gptneox.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/gptneox/gptneox.cpp index f7e54a9bc83..9727cf6aabb 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/gptneox/gptneox.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/gptneox/gptneox.cpp @@ -66,21 +66,33 @@ struct ne_tensor* gpt_neox_ff(const model_layer& layer, const int batch_size, co // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // -static bool gptneox_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool gptneox_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { 
const int64_t t_start_us = ne_time_us(); - const int N = n_tokens; + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; const int batch_size = lctx.batch_size; const int kv_n_ctx_block = lctx.kv_n_ctx_block; + const int beam_size = lctx.beam_search ? lctx.beam_size : 1; + std::vector block_ids; + std::vector n_padding; + bool no_padding = true; + for (int i = 0; i < batch_size; ++i) { + block_ids.push_back((inputs + i)->request_idx * beam_size + (inputs + i)->beam_idx); + n_padding.push_back((inputs + i)->n_padding); + if (no_padding && (inputs + i)->n_padding != 0) no_padding = false; + } + const auto& model = lctx.model; const auto& hparams = model.hparams; @@ -141,7 +153,7 @@ static bool gptneox_model_eval_internal(model_context& lctx, const model_token* struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N * batch_size); ne_set_name(embd, "embd"); for (int i = 0; i < batch_size; ++i) { - memcpy(static_cast(embd->data) + i * N, tokens + i * N, N * ne_element_size(embd)); + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); } struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); @@ -185,17 +197,19 @@ static bool gptneox_model_eval_internal(model_context& lctx, const model_token* std::vector k_bs(batch_size); std::vector v_bs(batch_size); for (int i = 0; i < batch_size; ++i) { + const int block_idx = block_ids[i]; // batch K Kcur_bs[i] = ne_permute(ctx0, ne_view_4d(ctx0, Kcur, head_dim, n_head, N, 1, ne_element_size(Kcur) * head_dim, ne_element_size(Kcur) * n_embd, ne_element_size(Kcur) * n_embd * N, i * ne_element_size(Kcur) * n_embd * N), 0, 2, 1, 3); - k_bs[i] = ne_view_4d( - ctx0, kv_self.k, head_dim, N, n_head, 1, ne_element_size(kv_self.k) * head_dim, - ne_element_size(kv_self.k) * head_dim * n_ctx, ne_element_size(kv_self.k) * n_embd * n_ctx, - ((il * n_ctx) * ne_element_size(kv_self.k) * n_embd * kv_n_ctx_block + - i * n_ctx * n_embd * ne_element_size(kv_self.k) + head_dim * n_past * ne_element_size(kv_self.k))); + k_bs[i] = + ne_view_4d(ctx0, kv_self.k, head_dim, N, n_head, 1, ne_element_size(kv_self.k) * head_dim, + ne_element_size(kv_self.k) * head_dim * n_ctx, ne_element_size(kv_self.k) * n_embd * n_ctx, + ((il * n_ctx) * ne_element_size(kv_self.k) * n_embd * kv_n_ctx_block + + block_idx * n_ctx * n_embd * ne_element_size(kv_self.k) + + head_dim * n_past * ne_element_size(kv_self.k))); // batch V Vcur_bs[i] = ne_permute(ctx0, @@ -204,11 +218,11 @@ static bool gptneox_model_eval_internal(model_context& lctx, const model_token* i * ne_element_size(Vcur) * n_embd * N), head_dim, n_head, N, 1), 1, 2, 0, 3); - v_bs[i] = - ne_view_4d(ctx0, kv_self.v, N, head_dim, n_head, 1, n_ctx * ne_element_size(kv_self.v), - n_ctx * ne_element_size(kv_self.v) * head_dim, n_ctx * ne_element_size(kv_self.v) * n_embd, - ((il * n_ctx) * ne_element_size(kv_self.v) * n_embd * kv_n_ctx_block + - i * n_ctx * n_embd * ne_element_size(kv_self.v) + n_past * ne_element_size(kv_self.v))); + v_bs[i] = ne_view_4d( + ctx0, kv_self.v, N, head_dim, n_head, 1, n_ctx * ne_element_size(kv_self.v), + n_ctx * ne_element_size(kv_self.v) * head_dim, n_ctx * ne_element_size(kv_self.v) * n_embd, + ((il * n_ctx) * ne_element_size(kv_self.v) * n_embd * kv_n_ctx_block + + block_idx * n_ctx * n_embd * ne_element_size(kv_self.v) + n_past * ne_element_size(kv_self.v))); ne_build_forward_expand(&gf, ne_cpy(ctx0, Kcur_bs[i], k_bs[i])); ne_build_forward_expand(&gf, ne_cpy(ctx0, 
Vcur_bs[i], v_bs[i])); } @@ -218,9 +232,7 @@ static bool gptneox_model_eval_internal(model_context& lctx, const model_token* // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) struct ne_tensor* K = - ne_view_4d(ctx0, kv_self.k, head_dim, n_past + N, n_head, batch_size, ne_element_size(kv_self.k) * head_dim, - ne_element_size(kv_self.k) * head_dim * n_ctx, ne_element_size(kv_self.k) * n_embd * n_ctx, - il * n_ctx * ne_element_size(kv_self.k) * n_embd * kv_n_ctx_block); + model_kv_cache_seq_concat(&gf, &lctx, ctx0, head_dim, n_past + N, n_head, batch_size, block_ids, il); // K * Q struct ne_tensor* KQ = ne_mul_mat(ctx0, K, Q); @@ -229,16 +241,14 @@ static bool gptneox_model_eval_internal(model_context& lctx, const model_token* struct ne_tensor* KQ_scaled = ne_scale_inplace(ctx0, KQ, ne_new_f32(ctx0, 1.0f / sqrt(float(n_embd) / n_head))); // KQ_masked = mask_past(KQ_scaled) - struct ne_tensor* KQ_masked = ne_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + struct ne_tensor* KQ_masked = ne_diag_mask_inf_with_padding_inplace(ctx0, KQ_scaled, n_past, n_padding.data()); // KQ = soft_max(KQ_masked) struct ne_tensor* KQ_soft_max = ne_soft_max_inplace(ctx0, KQ_masked); // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() struct ne_tensor* V = - ne_view_4d(ctx0, kv_self.v, n_past + N, head_dim, n_head, batch_size, n_ctx * ne_element_size(kv_self.v), - n_ctx * ne_element_size(kv_self.v) * head_dim, n_ctx * ne_element_size(kv_self.v) * n_embd, - il * n_ctx * ne_element_size(kv_self.v) * n_embd * kv_n_ctx_block); + model_kv_cache_seq_concat(&gf, &lctx, ctx0, n_past + N, head_dim, n_head, batch_size, block_ids, il, false); // KQV = transpose(V) * KQ_soft_max struct ne_tensor* KQV = ne_mul_mat(ctx0, V, KQ_soft_max); @@ -366,13 +376,11 @@ static bool gptneox_model_eval_internal(model_context& lctx, const model_token* size_t bs_stride = n_vocab * N; if (lctx.logits_all) { logits_out.resize(n_vocab * N * batch_size); - for (int i = 0; i < batch_size; ++i) { - memcpy(logits_out.data() + i * bs_stride, (float*)ne_get_data(inpL) + (i * bs_stride), - sizeof(float) * n_vocab * N); - } + memcpy(logits_out.data(), (float*)ne_get_data(inpL), sizeof(float) * n_vocab * N * batch_size); } else { // return result for just the last token logits_out.resize(n_vocab * batch_size); +#pragma omp parallel for for (int i = 0; i < batch_size; ++i) { memcpy(logits_out.data() + (i * n_vocab), (float*)ne_get_data(inpL) + (i * bs_stride) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); @@ -408,9 +416,8 @@ static bool gptneox_model_eval_internal(model_context& lctx, const model_token* return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!gptneox_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!gptneox_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/gptneox/gptneox_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/gptneox/gptneox_utils.cpp index af78fd26f2c..c2d8527b634 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/gptneox/gptneox_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/gptneox/gptneox_utils.cpp @@ -39,6 +39,14 @@ #include 
"models/model_utils/util.h" #include "models/models.h" +void model_load_internal(const std::string& fname, model_archs arch, model_context& lctx, int n_gpu_layers, + bool use_mmap, bool use_mlock, bool vocab_only, model_progress_callback progress_callback, + void* progress_callback_user_data) { + std::unique_ptr ms(new GPTNEOX()); + ms->init(fname.c_str(), lctx, n_gpu_layers, use_mmap, use_mlock, vocab_only); + ms->load(lctx, progress_callback, progress_callback_user_data); +} + void GPTNEOX::init(const char* path_model, model_context& lctx, int n_gpu_layer_, bool use_mmap_, bool use_mlock_, bool vocab_only_) { n_gpu_layer = n_gpu_layer_; @@ -178,118 +186,3 @@ class gptneox_quant_layer : public quant_layer_base { } }; REGISTER_QUANT_LAYER_CLASS(gptneox); - -class gptneox_beam_search_kv_cache_reorder : public beam_search_kv_cache_reorder { - public: - explicit gptneox_beam_search_kv_cache_reorder(model_context* lctx) : beam_search_kv_cache_reorder(lctx) {} - ~gptneox_beam_search_kv_cache_reorder() {} - - virtual void update(const uint32_t& n_past, const uint32_t& n_prompt_tokens, - const std::vector>& kv_reorder_indices = {}, - const std::vector& next_beams = {}) override { - // TODO(Yi): use get_batch_kv_elements_from_gpt_params; - NE_ASSERT(ctx->model.kv_self.k->type != NE_TYPE_JBLAS); - // first step - if (n_past == n_prompt_tokens) { - // cpy batch 1 to all batches -#pragma omp parallel for collapse(3) - for (int i = 0; i < ctx->model.layers.size(); ++i) { // K - for (int j = 1; j < kv_n_ctx_block; ++j) { - // [head_dim, N, n_head] - for (int nh = 0; nh < n_head; ++nh) { - memcpy(static_cast(ctx->model.kv_self.k->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block + - j * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd) + - ne_element_size(ctx->model.kv_self.k) * nh * head_dim * n_ctx, - static_cast(ctx->model.kv_self.k->data) + - i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block + - ne_element_size(ctx->model.kv_self.k) * nh * head_dim * n_ctx, - ne_element_size(ctx->model.kv_self.k) * head_dim * n_prompt_tokens); - } - } - } -#pragma omp parallel for collapse(3) - for (int i = 0; i < ctx->model.layers.size(); ++i) { // V - for (int j = 1; j < kv_n_ctx_block; ++j) { - // [N, head_dim, n_head] or [N, n_embd] - for (int k = 0; k < n_embd; ++k) { - memcpy(static_cast(ctx->model.kv_self.v->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - j * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd + - n_ctx * k * ne_element_size(ctx->model.kv_self.v)), - static_cast(ctx->model.kv_self.v->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - n_ctx * k * ne_element_size(ctx->model.kv_self.v)), - ne_element_size(ctx->model.kv_self.v) * n_prompt_tokens); - } - } - } - } else if (n_past > n_prompt_tokens) { - // next setp - for (auto t : kv_reorder_indices) { - int cur_id = std::get<0>(t); - int cpy_id = std::get<1>(t); - if (cur_id != cpy_id) { - uint32_t len = next_beams[cur_id].token_ids.size() - 1; - // last token in beam is for next step inference - MODEL_ASSERT(len == n_past - n_prompt_tokens); - size_t input_token_offset_k = n_prompt_tokens * ne_element_size(ctx->model.kv_self.k) * head_dim; - size_t input_token_offset_v = n_prompt_tokens * ne_element_size(ctx->model.kv_self.v); - if (len + n_prompt_tokens > n_ctx) { - // all token hidden states cache should be updated - input_token_offset_k = 0; - input_token_offset_v = 0; - len = n_ctx; - } 
-#pragma omp parallel for collapse(2) - for (int i = 0; i < ctx->model.layers.size(); ++i) { // K - // [head_dim, N, n_head] - for (int nh = 0; nh < n_head; ++nh) { - memcpy(static_cast(ctx->model.kv_self.k->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block + - cur_id * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd) + - ne_element_size(ctx->model.kv_self.k) * nh * head_dim * n_ctx + input_token_offset_k, - static_cast(ctx->model.kv_self.k->data) + - i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block + - cpy_id * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd + - ne_element_size(ctx->model.kv_self.k) * nh * head_dim * n_ctx + input_token_offset_k, - ne_element_size(ctx->model.kv_self.k) * head_dim * len); - } - } -#pragma omp parallel for collapse(2) - for (int i = 0; i < ctx->model.layers.size(); ++i) { // V - // [N, head_dim, n_head] or [N, n_embd] - for (int k = 0; k < n_embd; ++k) { - memcpy(static_cast(ctx->model.kv_self.v->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - cur_id * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd + - n_ctx * ne_element_size(ctx->model.kv_self.v) * k + input_token_offset_v), - static_cast(ctx->model.kv_self.v->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - cpy_id * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd + - n_ctx * ne_element_size(ctx->model.kv_self.v) * k + input_token_offset_v), - ne_element_size(ctx->model.kv_self.v) * len); - } - } - } - } - } else { - return; - } - } -}; - -void model_load_internal(const std::string& fname, model_archs arch, model_context& lctx, int n_gpu_layers, - bool use_mmap, bool use_mlock, bool vocab_only, model_progress_callback progress_callback, - void* progress_callback_user_data) { - std::unique_ptr ms(new GPTNEOX()); - ms->init(fname.c_str(), lctx, n_gpu_layers, use_mmap, use_mlock, vocab_only); - ms->load(lctx, progress_callback, progress_callback_user_data); - lctx.support_jblas_kv = true; - if (lctx.beam_search) { - lctx.bs_kv_reorder = std::make_shared(&lctx); -#ifdef NE_BEAM_SEARCH_VERBOSE_ON - printf("get GPTNEOX beam search kv cache update function. 
\n"); -#endif - } -} diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/llama/llama.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/llama/llama.cpp index 40ac7e9cd58..0a54d34703a 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/llama/llama.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/llama/llama.cpp @@ -45,22 +45,26 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // -static bool llama_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool llama_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; // enforce that the first token is BOS - if (n_total == 0 && tokens[0] != lctx.vocab.bos_token_id) { + if (n_total == 0 && inputs->tokens[0] != lctx.vocab.bos_token_id) { fprintf(stderr, "%s: first token must be BOS\n", __func__); return false; } - const int64_t t_start_us = ne_time_us(); + const int batch_size = lctx.batch_size; + MODEL_ASSERT(batch_size == n_input); - const int N = n_tokens; + const int64_t t_start_us = ne_time_us(); const auto& model = lctx.model; const auto& hparams = model.hparams; @@ -139,7 +143,9 @@ static bool llama_model_eval_internal(model_context& lctx, const model_token* to struct ne_tensor* embd = ne_new_tensor_1d(ctx0, NE_TYPE_I32, N, NE_SIZE_CALC); ne_set_name(embd, "embd"); - memcpy(embd->data, tokens, N * ne_element_size(embd)); + for (int i = 0; i < batch_size; ++i) { + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); + } #ifdef NE_TP_MODEL if (enable_tp) { @@ -439,9 +445,8 @@ static bool llama_model_eval_internal(model_context& lctx, const model_token* to return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!llama_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!llama_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h index 2c0672dddd4..51a645d2e88 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h @@ -151,6 +151,20 @@ struct model_layer { struct ne_tensor* v_cache; }; +typedef int32_t model_pos; +typedef int32_t model_seq_id; + +struct kv_token_cell { + model_pos pos = -1; // token idx (for rope) + model_pos delta = 0; // token shift delta (pos += delta) +}; + +struct kv_seq_cell { + std::vector token_cells; + model_seq_id seq_id = -1; + bool empty = true; +}; + struct model_kv_cache { struct ne_tensor* k = NULL; struct 
ne_tensor* v = NULL; @@ -162,6 +176,9 @@ struct model_kv_cache { int n; // number of tokens currently in the cache + bool has_shift = false; // ring-buffer (for too long text generation like streaming-llm) + std::vector seq_cells; + ~model_kv_cache() { if (ctx) { ne_free(ctx); @@ -258,6 +275,13 @@ struct model_context { model_struct model; model_vocab vocab; + // maximum num of bearable requests in current env + int max_request_bs = 32; // TODO + // num of current execution prompts + int request_running_bs = 1; + // length of current execution tokens list + // first token (prefill) generation is equal to `request_running_bs` + // next tokens (decoding) generation may be larger than `request_running_bs`(for example, beam search) int batch_size = 1; bool beam_search = false; bool shift_roped_k = false; // whether to store non-RoPEd K cache @@ -341,6 +365,30 @@ typedef struct model_token_data_array { typedef void (*model_progress_callback)(float progress, void* ctx); +struct model_input { + // embd or next token + const model_token* tokens = nullptr; + // tokens length + uint32_t n_tokens = 0; + // prompt length + uint32_t n_prompt_tokens = 0; + // kv cache n_past + uint32_t n_past = 0; + // text tokens length (prompt + all next tokens) + // the number of tokens evaluated so far (including evicted tokens if there is any) + uint32_t n_total = 0; + // request id + int request_idx = -1; + // beam id in beam search + int beam_idx = 0; + // padding related, attention mask + // (0: left, 1: right) + // only support padding left in decoder only model + int padding_side = 0; + // padding length + uint32_t n_padding = 0; +}; + struct model_context_params { model_archs arch; // arch of models (GPT-J, LLAMA) int n_ctx; // text context diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp index ac920f0bc2a..268f9dcb06c 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp @@ -77,6 +77,10 @@ static bool kv_cache_init(const struct model_hparams& hparams, struct model_kv_c #endif cache.buf.resize(n_layer * (layer_ne_k + layer_ne_v) * wsize + 2u * MB); + cache.seq_cells.resize(batch_size * beam_size); + for (int i = 0; i < cache.seq_cells.size(); ++i) { + cache.seq_cells[i].token_cells.resize(n_ctx); + } struct ne_init_params params; params.mem_size = cache.buf.size; @@ -2039,11 +2043,127 @@ std::vector>& model_internal_get_tenso return ctx->model.tensors_by_name; } +static void ne_model_kv_cache_seq_cpy(struct model_context* ctx, const model_seq_id& seq_id_src, + const model_seq_id& seq_id_dst, const model_pos& p0, const model_pos& p1) { + const uint32_t kv_n_ctx_block = ctx->kv_n_ctx_block; + const uint32_t n_head = ctx->model.hparams.n_head_kv > 0 ? 
ctx->model.hparams.n_head_kv : ctx->model.hparams.n_head; + const uint32_t head_dim = ctx->model.hparams.n_embd / ctx->model.hparams.n_head; + const uint32_t n_embd = n_head * head_dim; + const uint32_t n_ctx = ctx->n_ctx; + const size_t k_elem_size = ne_element_size(ctx->model.kv_self.k); + const size_t v_elem_size = ne_element_size(ctx->model.kv_self.v); +#pragma omp parallel for collapse(2) + for (int i = 0; i < ctx->model.layers.size(); ++i) { // K + // [head_dim, N, n_head] + for (int nh = 0; nh < n_head; ++nh) { + memcpy(static_cast(ctx->model.kv_self.k->data) + i * n_ctx * k_elem_size * n_embd * kv_n_ctx_block + + seq_id_dst * n_ctx * k_elem_size * n_embd + k_elem_size * nh * head_dim * n_ctx + + p0 * k_elem_size * head_dim, + static_cast(ctx->model.kv_self.k->data) + i * n_ctx * k_elem_size * n_embd * kv_n_ctx_block + + seq_id_src * n_ctx * k_elem_size * n_embd + k_elem_size * nh * head_dim * n_ctx + + p0 * k_elem_size * head_dim, + k_elem_size * head_dim * (p1 - p0 + 1)); + } + } +#pragma omp parallel for collapse(2) + for (int i = 0; i < ctx->model.layers.size(); ++i) { // V + // [N, head_dim, n_head] or [N, n_embd] + for (int nm = 0; nm < n_embd; ++nm) { + memcpy(static_cast(ctx->model.kv_self.v->data) + i * n_ctx * v_elem_size * n_embd * kv_n_ctx_block + + seq_id_dst * n_ctx * v_elem_size * n_embd + n_ctx * nm * v_elem_size + p0 * v_elem_size, + static_cast(ctx->model.kv_self.v->data) + i * n_ctx * v_elem_size * n_embd * kv_n_ctx_block + + seq_id_src * n_ctx * v_elem_size * n_embd + n_ctx * nm * v_elem_size + p0 * v_elem_size, + v_elem_size * (p1 - p0 + 1)); + } + } +} + +void model_kv_cache_seq_cpy(struct model_context* ctx, const model_seq_id& seq_id_src, const model_seq_id& seq_id_dst, + const model_pos& p0, const model_pos& p1) { + if (ctx->model.kv_self.k->type != NE_TYPE_JBLAS) { + ne_model_kv_cache_seq_cpy(ctx, seq_id_src, seq_id_dst, p0, p1); + } else { + return; + // jblas_model_kv_cache_seq_cpy(ctx, seq_id_src, seq_id_dst, p0, p1); + } +} + +static ne_tensor* ne_model_kv_cache_seq_concat(struct ne_cgraph* cgraph, struct model_context* moctx, + struct ne_context* nectx, const int64_t& ne0, const int64_t& ne1, + const int64_t& ne2, const int64_t& ne3, + const std::vector& block_ids, const int& layer_idx, + const bool& concat_k) { + MODEL_ASSERT(ne3 == block_ids.size()); // moctx->batch_size + struct ne_tensor* cache = concat_k ? moctx->model.kv_self.k : moctx->model.kv_self.v; + // K = [head_dim, n_past+N, n_head, batch_size] + // V = [N_past+N, head_dim, n_head, batch_size] + const uint32_t n_embd_kv = concat_k ? ne0 * ne2 : ne1 * ne2; + struct ne_tensor* dst = nullptr; + if (concat_k) { + MODEL_ASSERT(ne1 <= moctx->n_ctx); + } else { + MODEL_ASSERT(ne0 <= moctx->n_ctx); + } + const size_t elem_size = ne_element_size(cache); + const size_t nb1 = concat_k ? elem_size * ne0 : elem_size * moctx->n_ctx; + const size_t nb2 = concat_k ? 
nb1 * moctx->n_ctx : nb1 * ne1; + const size_t nb3 = nb2 * ne2; + int cont_bs = 1; + int start_idx = block_ids[0]; + int id = 1; + size_t dst_off = 0; + while (id < block_ids.size()) { + if (block_ids[id] - block_ids[id - 1] <= 1) { + cont_bs++; + id++; + continue; + } else { + if (dst == nullptr) { + dst = ne_new_tensor_4d(nectx, cache->type, ne0, ne1, ne2, ne3, NE_SIZE_CALC); + } + struct ne_tensor* dst_i = ne_view_4d(nectx, dst, ne0, ne1, ne2, cont_bs, elem_size * ne0, elem_size * ne0 * ne1, + elem_size * ne0 * ne1 * ne2, dst_off); + dst_off += elem_size * ne0 * ne1 * ne2 * cont_bs; + size_t off = layer_idx * moctx->n_ctx * elem_size * n_embd_kv * moctx->kv_n_ctx_block + + start_idx * moctx->n_ctx * elem_size * n_embd_kv; + ne_build_forward_expand( + cgraph, ne_cpy(nectx, ne_view_4d(nectx, cache, ne0, ne1, ne2, cont_bs, nb1, nb2, nb3, off), dst_i)); + start_idx = block_ids[id]; + cont_bs = 1; + id++; + } + } + + size_t off = layer_idx * moctx->n_ctx * elem_size * n_embd_kv * moctx->kv_n_ctx_block + + start_idx * moctx->n_ctx * elem_size * n_embd_kv; + if (start_idx == block_ids[0]) { + // continuous among all batch tokens + return ne_view_4d(nectx, cache, ne0, ne1, ne2, ne3, nb1, nb2, nb3, off); + } else { + // last cont batch + struct ne_tensor* dst_i = ne_view_4d(nectx, dst, ne0, ne1, ne2, cont_bs, elem_size * ne0, elem_size * ne0 * ne1, + elem_size * ne0 * ne1 * ne2, dst_off); + ne_build_forward_expand(cgraph, + ne_cpy(nectx, ne_view_4d(nectx, cache, ne0, ne1, ne2, cont_bs, nb1, nb2, nb3, off), dst_i)); + return dst; + } +} + +ne_tensor* model_kv_cache_seq_concat(struct ne_cgraph* cgraph, struct model_context* moctx, struct ne_context* nectx, + const int64_t& ne0, const int64_t& ne1, const int64_t& ne2, const int64_t& ne3, + const std::vector& block_ids, const int& layer_idx, const bool& concat_k) { + if (moctx->model.kv_self.k->type != NE_TYPE_JBLAS) { + return ne_model_kv_cache_seq_concat(cgraph, moctx, nectx, ne0, ne1, ne2, ne3, block_ids, layer_idx, concat_k); + } else { + return nullptr; // jblas + } +} + // beam search // A struct for calculating logits-related info. 
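A minimal sketch of the flat-buffer indexing that ne_model_kv_cache_seq_cpy relies on, assuming the K-cache layout stated above (per layer, kv_n_ctx_block sequence blocks of shape [head_dim, n_ctx, n_head]). The helper name k_cache_offset and its parameter list are illustrative only; they are not part of the runtime API.

#include <cstddef>
#include <cstdint>

size_t k_cache_offset(uint32_t layer, uint32_t seq_id, uint32_t head, uint32_t pos,
                      uint32_t n_ctx, uint32_t n_embd, uint32_t head_dim,
                      uint32_t kv_n_ctx_block, size_t elem_size) {
  const size_t layer_stride = (size_t)n_ctx * n_embd * kv_n_ctx_block * elem_size;  // all sequence blocks of one layer
  const size_t seq_stride   = (size_t)n_ctx * n_embd * elem_size;                   // one sequence block
  const size_t head_stride  = (size_t)head_dim * n_ctx * elem_size;                 // one head inside a block
  const size_t pos_stride   = (size_t)head_dim * elem_size;                         // one token of one head
  return layer * layer_stride + seq_id * seq_stride + head * head_stride + pos * pos_stride;
}

Copying tokens p0 through p1 of one head from seq_id_src to seq_id_dst is then a single memcpy of (p1 - p0 + 1) * head_dim * elem_size bytes per (layer, head) pair, which is what the OpenMP loops above do; the V cache follows the same scheme except that each sequence block stores n_embd rows of length n_ctx and is copied row by row.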
struct logits_info { const model_context* const ctx = nullptr; - // (batch, seq_len * vocab_size) batch = input_prompt_bs* beam_size + // [vocab_size * seq_len * batch] batch = input_prompt_bs* beam_size const float* const logits = nullptr; const int batch_size; const int32_t n_vocab; @@ -2116,117 +2236,96 @@ struct logits_info { } }; -void logits_processor::min_new_tokens_logits_process(const uint32_t& cur_len, const model_vocab::id& eos_token_id) { +void logits_processor::min_new_tokens_logits_process(const std::vector& cur_lens, + const model_vocab::id& eos_token_id) { MODEL_ASSERT(ctx->generation_conf.min_new_tokens >= 0); - if (ctx->generation_conf.min_new_tokens == 0 || ctx->generation_conf.min_new_tokens <= cur_len) { + if (ctx->generation_conf.min_new_tokens == 0) { return; - } else { - int batch_size = ctx->batch_size; - size_t offset = ctx->logits.size() / ctx->batch_size - ctx->model.hparams.n_vocab; - size_t bs_stride = ctx->logits.size() / ctx->batch_size; - for (int i = 0; i < batch_size; ++i) { - // forbidden to choose eos_token if cur_len < min_new_tokens - *(model_get_logits(ctx) + i * bs_stride + offset + eos_token_id) = NEG_INF; + } + int batch_size = ctx->batch_size; + MODEL_ASSERT(batch_size == cur_lens.size()); + size_t offset = ctx->logits.size() / ctx->batch_size - ctx->model.hparams.n_vocab; + size_t bs_stride = ctx->logits.size() / ctx->batch_size; + for (int i = 0; i < batch_size; ++i) { + if (ctx->generation_conf.min_new_tokens <= cur_lens[i]) { + continue; } + // forbidden to choose eos_token if cur_len < min_new_tokens + *(model_get_logits(ctx) + i * bs_stride + offset + eos_token_id) = NEG_INF; } } -void logits_processor::process(const uint32_t& cur_len, const model_vocab::id& eos_token_id) { +void logits_processor::process(const std::vector& cur_lens, const model_vocab::id& eos_token_id) { MODEL_ASSERT(model_get_logits(ctx) != nullptr); if (min_new_tokens > 0) { - min_new_tokens_logits_process(cur_len, eos_token_id); + min_new_tokens_logits_process(cur_lens, eos_token_id); } } -// TODO dispatch JBLAS kv cache manager -void beam_search_kv_cache_reorder::update(const uint32_t& n_past, const uint32_t& n_prompt_tokens, +void beam_search_kv_cache_reorder::update(const std::vector& n_past, + const std::vector& n_prompt_tokens, + const std::vector request_running_indices, const std::vector>& kv_reorder_indices, const std::vector& next_beams) { - // TODO(Yi): use get_batch_kv_elements_from_gpt_params; - NE_ASSERT(ctx->model.kv_self.k->type != NE_TYPE_JBLAS); - // first step - if (n_past == n_prompt_tokens) { - // cpy batch 1 to all batches -#pragma omp parallel for collapse(2) - for (int i = 0; i < ctx->model.layers.size(); ++i) { // K - for (int j = 1; j < kv_n_ctx_block; ++j) { - // [n_embd, N] - memcpy(static_cast(ctx->model.kv_self.k->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block + - j * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd), - static_cast(ctx->model.kv_self.k->data) + - i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block, - ne_element_size(ctx->model.kv_self.k) * n_embd * n_prompt_tokens); - } - } -#pragma omp parallel for collapse(3) - for (int i = 0; i < ctx->model.layers.size(); ++i) { // V - for (int j = 1; j < kv_n_ctx_block; ++j) { - // [N, n_embd] - for (int k = 0; k < n_embd; ++k) { - memcpy(static_cast(ctx->model.kv_self.v->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - j * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd 
+ - n_ctx * k * ne_element_size(ctx->model.kv_self.v)), - static_cast(ctx->model.kv_self.v->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - n_ctx * k * ne_element_size(ctx->model.kv_self.v)), - ne_element_size(ctx->model.kv_self.v) * n_prompt_tokens); - } + // TODO beam search unsupport shift kv cache when prompt + new_tokens > nctx + if (ctx->model.kv_self.has_shift) { + fprintf(stderr, "%s: error: unimplement shifted kv cache update\n", __func__); + return; + } +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("start to update kv cache for next step...\n"); +#endif + MODEL_ASSERT(request_running_indices.size() == ctx->request_running_bs); + if (!kv_reorder_indices.empty()) MODEL_ASSERT(kv_reorder_indices.size() == ctx->request_running_bs * ctx->beam_size); + int count = 0; + for (int rb = 0; rb < ctx->request_running_bs; ++rb) { + const int request_idx = request_running_indices[rb]; + const uint32_t off = ctx->beam_size * request_idx; + const uint32_t cur_n_past = n_past[request_idx]; + const uint32_t cur_n_prompt_tokens = n_prompt_tokens[request_idx]; +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("------request_idx: %d, n_past: %d, n_prompt_tokens: %d, offset: %d------ \n", request_idx, cur_n_past, + cur_n_prompt_tokens, off); +#endif + // first step + if (cur_n_past == cur_n_prompt_tokens) { +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("copy beam 1 first token to the left beams\n"); +#endif + for (int b = 1; b < ctx->beam_size; ++b) { + model_kv_cache_seq_cpy(ctx, 0 + off, b + off, 0, cur_n_prompt_tokens); } - } - } else if (n_past > n_prompt_tokens) { - // next setp - for (auto t : kv_reorder_indices) { - int cur_id = std::get<0>(t); - int cpy_id = std::get<1>(t); - if (cur_id != cpy_id) { - uint32_t len = next_beams[cur_id].token_ids.size() - 1; - // last token in beam is for next step inference - MODEL_ASSERT(len == n_past - n_prompt_tokens); - size_t input_token_offset_k = n_prompt_tokens * ne_element_size(ctx->model.kv_self.k) * n_embd; - size_t input_token_offset_v = n_prompt_tokens * ne_element_size(ctx->model.kv_self.v); - if (len + n_prompt_tokens > n_ctx) { + } else if (cur_n_past > cur_n_prompt_tokens) { + // next setp + for (int t = 0; t < ctx->beam_size; ++t) { + int cur_id = std::get<0>(kv_reorder_indices[count]); + int cpy_id = std::get<1>(kv_reorder_indices[count]); + count++; + if (cur_id == cpy_id) continue; + model_pos p0 = cur_n_prompt_tokens; + model_pos p1 = cur_n_past; + // TODO too long text + if (cur_n_past > n_ctx) { // all token hidden states cache should be updated - input_token_offset_k = 0; - input_token_offset_v = 0; - len = n_ctx; - } -#pragma omp parallel for - for (int i = 0; i < ctx->model.layers.size(); ++i) { // K - // [n_embd, N] - memcpy(static_cast(ctx->model.kv_self.k->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block + - cur_id * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd) + - input_token_offset_k, - static_cast(ctx->model.kv_self.k->data) + - i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block + - cpy_id * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd + input_token_offset_k, - ne_element_size(ctx->model.kv_self.k) * n_embd * len); - } -#pragma omp parallel for collapse(2) - for (int i = 0; i < ctx->model.layers.size(); ++i) { // V - // [N, n_embd] - for (int k = 0; k < n_embd; ++k) { - memcpy(static_cast(ctx->model.kv_self.v->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - cur_id * n_ctx * 
ne_element_size(ctx->model.kv_self.v) * n_embd + - n_ctx * ne_element_size(ctx->model.kv_self.v) * k + input_token_offset_v), - static_cast(ctx->model.kv_self.v->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - cpy_id * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd + - n_ctx * ne_element_size(ctx->model.kv_self.v) * k + input_token_offset_v), - ne_element_size(ctx->model.kv_self.v) * len); - } + p0 = 0; + p1 = n_ctx; } +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("copy beam %d to beam %d in pos [%d, %d] \n", cpy_id, cur_id, p0, p1); +#endif + model_kv_cache_seq_cpy(ctx, cpy_id + off, cur_id + off, p0, p1); } + } else { + fprintf(stderr, "%s: error: unable to update kv cache\n", __func__); + return; } - } else { - return; } +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); +#endif } -// Return top k token_data by score. (prompt_bs * sample_scale * num_beam) +// Return top k token_data by score. (request_running_bs * sample_scale * num_beam) // each beam gives top_k results --> + prev_scores --> from (num_beam * top_k) sort num_beam // for example, huggingface transformers repo implements like this: // log_softmax(num_beam*n_vocab) -- > + prev_scores --> sort num_beam @@ -2234,19 +2333,24 @@ void beam_search_kv_cache_reorder::update(const uint32_t& n_past, const uint32_t // we sample top_k logits for each beam, than compute scores in these logits positions // then we sample top_k results among all beams. // this approach will accelerate sampling speed by log_softmax times reduction -std::vector beam_search_flow::beam_top_k_next_tokens(model_context* ctx, const uint32_t& cur_len, +std::vector beam_search_flow::beam_top_k_next_tokens(model_context* ctx, const std::vector& beams_score, const std::vector& num_beams, const std::vector beam_indices, const int& sample_scale, const int& dim) { - MODEL_ASSERT(dim == -1); // raise unimplemented error - const int request_bs = 1; // TODO ctx->request_running_num + MODEL_ASSERT(dim == -1); // raise unimplemented error + // TODO different requests may have different num_beams (ctx->beam_size >= num_beams[i])? 
+ const int request_running_bs = ctx->request_running_bs; logits_info li(ctx); - lp.process(cur_len, ctx->vocab.eos_token_id); - const int raw_k = sample_scale * beam_size; + std::vector cur_lens; + for (int i = 0; i < ctx->batch_size; ++i) { + cur_lens.push_back(cur_beams[i].token_ids.size()); + } + lp.process(cur_lens, ctx->vocab.eos_token_id); + const int raw_k = sample_scale * (*std::max_element(num_beams.begin(), num_beams.end())); // raw logits top_k std::vector> raw_top_k = li.vocab_top_k(raw_k); - MODEL_ASSERT(raw_top_k.size() == ctx->batch_size); // request_bs * num_beam + MODEL_ASSERT(raw_top_k.size() == ctx->batch_size); // request_running_bs * num_beam MODEL_ASSERT(raw_top_k[0].size() == raw_k); MODEL_ASSERT(beams_score.size() == ctx->batch_size); // compute score: log_softmax + prev_score @@ -2255,18 +2359,17 @@ std::vector beam_search_flow::beam_top_k_next_tokens(model_cont std::for_each(raw_top_k[i].begin(), raw_top_k[i].end(), [&](beam_next_token& r) { r.score = li.log_probability_from_logit(i, r.score) + beams_score[i]; }); } - MODEL_ASSERT(num_beams.size() == request_bs); + MODEL_ASSERT(num_beams.size() == request_running_bs); std::vector res; res.reserve(sample_scale * std::accumulate(num_beams.begin(), num_beams.end(), 0)); - std::vector min_heap; const uint32_t n_vocab = ctx->model.hparams.n_vocab; size_t row_off = 0; auto comp = [](const beam_next_token& a, const beam_next_token& b) { return a.score > b.score; }; - for (int i = 0; i < request_bs; ++i) { + for (int i = 0; i < request_running_bs; ++i) { const int num_beam = num_beams[i]; const int sample_k = sample_scale * num_beam; MODEL_ASSERT(raw_k >= sample_k); - min_heap.clear(); + std::vector min_heap; min_heap.reserve(sample_k); for (int j = 0; j < num_beam; ++j) { int n = 0; @@ -2289,7 +2392,7 @@ std::vector beam_search_flow::beam_top_k_next_tokens(model_cont } } } - row_off += i * num_beam; + row_off += num_beam; std::sort(min_heap.begin(), min_heap.end(), [](const beam_next_token& a, const beam_next_token& b) { return a.score > b.score; }); for (const auto b : min_heap) { @@ -2299,84 +2402,108 @@ std::vector beam_search_flow::beam_top_k_next_tokens(model_cont return res; } -// TODO debug info unify (function ptr?) 
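For readers following beam_top_k_next_tokens, here is a self-contained sketch of the two-stage top-k it implements: per-beam top-k on the raw logits, log-softmax plus the accumulated beam score computed only at those positions, then a global top-k per request. The names top_k_two_stage and Candidate are illustrative; the sketch operates on a plain [num_beam x n_vocab] logits array rather than the model context.

#include <algorithm>
#include <cmath>
#include <vector>

struct Candidate {
  int token_id;
  int beam_idx;
  float score;
};

// logits: [num_beam x n_vocab]; beam_scores: accumulated log-prob per beam.
std::vector<Candidate> top_k_two_stage(const float* logits, int num_beam, int n_vocab,
                                       const std::vector<float>& beam_scores, int sample_scale = 2) {
  const int k = sample_scale * num_beam;  // assumes k <= n_vocab
  std::vector<Candidate> merged;
  merged.reserve(static_cast<size_t>(k) * num_beam);
  for (int b = 0; b < num_beam; ++b) {
    const float* row = logits + static_cast<size_t>(b) * n_vocab;
    // stage 1: indices of the k largest raw logits in this beam's row
    std::vector<int> ids(n_vocab);
    for (int v = 0; v < n_vocab; ++v) ids[v] = v;
    std::partial_sort(ids.begin(), ids.begin() + k, ids.end(),
                      [&](int a, int c) { return row[a] > row[c]; });
    // log_softmax needs only the row max and log-sum-exp, computed once per beam
    const float mx = *std::max_element(row, row + n_vocab);
    float sum = 0.0f;
    for (int v = 0; v < n_vocab; ++v) sum += std::exp(row[v] - mx);
    const float lse = mx + std::log(sum);
    for (int j = 0; j < k; ++j) {
      merged.push_back({ids[j], b, row[ids[j]] - lse + beam_scores[b]});
    }
  }
  // stage 2: keep the best k candidates across all beams of this request
  std::partial_sort(merged.begin(), merged.begin() + k, merged.end(),
                    [](const Candidate& a, const Candidate& c) { return a.score > c.score; });
  merged.resize(k);
  return merged;
}

This mirrors the "each beam gives top_k results, add previous scores, then sort num_beam" flow described in the comment above while only evaluating log-softmax terms at the sampled positions.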
void beam_search_flow::fill_next_beams_by_top_scores() { auto const comp = [](const beam& a, const beam& b) { return a.score > b.score; }; - std::vector embd_inp; - int record = 0; + std::vector next_inputs; int batch_size = 0; - uint32_t cur_len = 0; + request_running_indices.clear(); std::vector beam_indices; std::vector beams_score; - for (int i = 0; i < beam_size; ++i) { - MODEL_ASSERT(!cur_beams[i].eos()); - if (cur_len != 0) { - MODEL_ASSERT(cur_len == cur_beams[i].token_ids.size()); + // filter cur_beams + for (int i = 0; i < cur_beams.size(); ++i) { + if (cur_beams[i].done) continue; + if (request_running_indices.empty()) { + request_running_indices.push_back(cur_beams[i].request_idx); } else { - cur_len = cur_beams[i].token_ids.size(); + if (request_running_indices.back() != cur_beams[i].request_idx) { + request_running_indices.push_back(cur_beams[i].request_idx); + } } // (batch, 1) - // ordered by infer_bs_id - embd_inp.push_back(cur_beams[i].token_ids.back()); + // ordered by request_idx + next_inputs.push_back(model_input{ + /*.tokens =*/&cur_beams[i].token_ids.back(), + /*.n_tokens =*/1, + /*.n_prompt_tokens =*/n_prompt_tokens[request_running_indices.back()], + /*.n_past =*/n_past[request_running_indices.back()], + /*.n_total =*/n_total[request_running_indices.back()], + /*.request_idx =*/request_running_indices.back(), + /*.beam_idx =*/cur_beams[i].beam_idx, + /*.padding_side =*/padding_side[request_running_indices.back()], + /*n_padding =*/n_padding[request_running_indices.back()], + }); batch_size++; - beam_indices.push_back(i); + beam_indices.push_back(cur_beams[i].beam_idx); beams_score.push_back(cur_beams[i].score); } + MODEL_ASSERT(request_running_indices.size() * beam_size == batch_size); + ctx->batch_size = batch_size; + ctx->request_running_bs = request_running_indices.size(); // DEBUG #ifdef NE_BEAM_SEARCH_VERBOSE_ON printf("========================================================================================= \n"); printf("next_tokens for inference: \n"); - for (auto kk : embd_inp) { + printf("request_running_bs: %d, batch_size for inference: %d\n", ctx->request_running_bs, ctx->batch_size); + for (int k = 0; k < next_inputs.size(); ++k) { + model_token kk = *(next_inputs[k].tokens); printf("%d: %s \n", kk, (ctx->vocab.id_to_token.at(kk).tok).c_str()); } printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); #endif - ctx->batch_size = batch_size; - int n_tokens = 1; - model_eval(ctx, embd_inp.data(), n_tokens, n_past, n_total, num_threads); + model_eval(ctx, next_inputs.data(), next_inputs.size(), num_threads); const int sample_scale = 2; + std::vector num_beams(ctx->request_running_bs, beam_size); std::vector next_tokens = - beam_top_k_next_tokens(ctx, cur_len, beams_score, {batch_size}, beam_indices, sample_scale); - + beam_top_k_next_tokens(ctx, beams_score, num_beams, beam_indices, sample_scale); + MODEL_ASSERT(next_tokens.size() == batch_size * sample_scale); // request_running_bs * beam_size * sample_scale // DEBUG #ifdef NE_BEAM_SEARCH_VERBOSE_ON printf("top_k next_tokens: \n"); - for (auto kk : next_tokens) { - printf("%d: %s, score: %10.6f, beam_idx: %d \n", kk.id, (ctx->vocab.id_to_token.at(kk.id).tok).c_str(), kk.score, - kk.beam_idx); + int bb = 0; + for (int kk = 0; kk < next_tokens.size(); ++kk) { + if (kk % (beam_size * sample_scale) == 0) printf("------batch_%d------\n", bb++); + printf("%d: %s, score: %10.6f, beam_idx: %d \n", next_tokens[kk].id, + 
(ctx->vocab.id_to_token.at(next_tokens[kk].id).tok).c_str(), next_tokens[kk].score, + next_tokens[kk].beam_idx); } printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); #endif - MODEL_ASSERT(next_tokens.size() == batch_size * sample_scale); - MODEL_ASSERT(next_beams.empty()); - for (int i = 0; i < next_tokens.size(); ++i) { - if (next_tokens[i].id == ctx->vocab.eos_token_id) { - // if beam_token does not belong to top num_beams tokens, it should not be added - bool is_beam_token_worse_than_top_num_beams = i >= beam_size ? true : false; - if (is_beam_token_worse_than_top_num_beams) { - continue; + + const int rb_off = beam_size * sample_scale; + for (int rb = 0; rb < request_running_indices.size(); ++rb) { + int record_push = 0; + for (int nt = 0; nt < rb_off; ++nt) { + int i = rb * rb_off + nt; + int cb_off = next_tokens[i].beam_idx + request_running_indices[rb] * beam_size; + if (next_tokens[i].id == ctx->vocab.eos_token_id) { + // if beam_token does not belong to top num_beams tokens, it should not be added + bool is_beam_token_worse_than_top_num_beams = nt >= beam_size ? true : false; + if (is_beam_token_worse_than_top_num_beams) continue; + // update score with eos next token + cur_beams[cb_off].score = next_tokens[i].score; + beam_hypos[request_running_indices[rb]].add(cur_beams[cb_off], n_prompt_tokens[rb]); + } else { + beam next_beam = cur_beams[cb_off]; + next_beam.token_ids.push_back(next_tokens[i].id); + next_beam.score = next_tokens[i].score; + next_beams[request_running_indices[rb] * beam_size + record_push] = std::move(next_beam); + record_push++; + } + if (record_push == beam_size) { + // sort by beam_idx rather than top-k for kv cache updating + std::sort(next_beams.begin() + request_running_indices[rb] * beam_size, + next_beams.begin() + (request_running_indices[rb] + 1) * beam_size, + [](beam& a, beam& b) { return a.beam_idx < b.beam_idx; }); + break; } - // update score with eos next token - cur_beams[next_tokens[i].beam_idx].score = next_tokens[i].score; - beam_hypos[0].add(cur_beams[next_tokens[i].beam_idx], n_prompt_tokens); - } else { - beam next_beam = cur_beams[next_tokens[i].beam_idx]; - next_beam.token_ids.push_back(next_tokens[i].id); - next_beam.score = next_tokens[i].score; - next_beams.push_back(std::move(next_beam)); - } - if (next_beams.size() == beam_size) { - break; } } - - std::sort(next_beams.begin(), next_beams.end(), [](beam& a, beam& b) { return a.infer_bs_id < b.infer_bs_id; }); } // get kv cache reorder indices, -// idx_0: dst_beam batch idx, idx_1: src_beam batch idx +// idx_0: dst_beam idx, idx_1: src_beam idx // for copy predicted past token kv cache // for example: // - c @@ -2389,145 +2516,164 @@ void beam_search_flow::fill_next_beams_by_top_scores() { // kv_cache_reorder_indices = {{0,0}, {1,0}} // if kv_cache_reorder_indices = {0:0, 1:1}, then do not need reorder (cpy) std::vector> beam_search_flow::update_kv_cache_reorder_indices() { - MODEL_ASSERT(next_beams.size() == beam_size); - MODEL_ASSERT(cur_beams.size() == beam_size); // DEBUG #ifdef NE_BEAM_SEARCH_VERBOSE_ON printf("kv cache update indices info: \n"); - printf("cur_beams: "); - for (int i = 0; i < beam_size; ++i) { - printf("%d, ", cur_beams[i].infer_bs_id); + printf("cur_beams:\n"); + for (int bb = 0; bb < request_running_indices.size(); ++bb) { + printf("------batch_%d------\n", bb); + for (int i = 0; i < beam_size; ++i) { + printf("%d, ", cur_beams[i + request_running_indices[bb] * beam_size].beam_idx); + } + printf("\n"); } - 
printf("\n"); - printf("next_beams: "); - for (int i = 0; i < beam_size; ++i) { - printf("%d, ", next_beams[i].infer_bs_id); + printf("next_beams:\n"); + for (int bb = 0; bb < request_running_indices.size(); ++bb) { + printf("------batch_%d------\n", bb); + for (int i = 0; i < beam_size; ++i) { + printf("%d, ", next_beams[i + request_running_indices[bb] * beam_size].beam_idx); + } + printf("\n"); } - printf("\n"); #endif std::vector> kv_reorder_indices; - kv_reorder_indices.reserve(beam_size); - // shuffle beams which are early stopped (eos) - // keep them behind beams which have non-eos - // next_beams infer_bs_id: [0, 1(eos), 2(eos), 3] - > [0, 3, 1(eos), 2(eos)] - std::vector cpy_eos_bs_ids; - std::vector cpy_final_bs_ids; - std::vector nb_eos_ids; + kv_reorder_indices.resize(ctx->request_running_bs * beam_size); std::vector nb_shuffle_ids; - cpy_final_bs_ids.reserve(beam_size); - for (int i = 0; i < beam_size; ++i) { - MODEL_ASSERT(cur_beams[i].infer_bs_id == i); - if (next_beams[i].eos()) { - cpy_eos_bs_ids.push_back(next_beams[i].infer_bs_id); - nb_eos_ids.push_back(i); + for (int rb = 0; rb < request_running_indices.size(); ++rb) { + std::vector cpy_final_bs_ids; + const int rb_off = request_running_indices[rb] * beam_size; + for (int i = 0; i < beam_size; ++i) { + MODEL_ASSERT(cur_beams[i + rb_off].beam_idx == i); + cpy_final_bs_ids.push_back(next_beams[i + rb_off].beam_idx); + // update beam_idx for next token generation + next_beams[i + rb_off].beam_idx = i; + } + // we arrange beams by inference batch indice rather score for memcpy time reduction + // so there will be 2 circumstances (ignore no memcpy : 0,1,2,3 --> 0,1,2,3) + // 1. cpoy former beams into latter beams, like: 0,1,2,3 --> 0,0,0,1 + // 2. copy latter beams into former beams, like: 0,1,2,3 -- > 1,2,2,3 + // kv cache memcpy happens in itself which would cause memory dislocation if follows wrong order + // so we give the contrary order to beams vector indice, which is: + // if 1, copy order is from tail to head + // if 2, copy order is from head to tail + bool cpy_from_head = true; + int dst_idx_sum = 0; + int src_idx_sum = 0; + for (int i = 0; i < cpy_final_bs_ids.size(); ++i) { + dst_idx_sum += i; + src_idx_sum += cpy_final_bs_ids[i]; + if (src_idx_sum < dst_idx_sum) { + cpy_from_head = false; + break; + } + } + if (cpy_from_head) { + int insert_idx = 0; + for (int i = 0; i < cpy_final_bs_ids.size(); ++i) { + kv_reorder_indices[insert_idx + rb_off] = std::move(std::make_tuple(i, cpy_final_bs_ids[i])); + insert_idx++; + } } else { - cpy_final_bs_ids.push_back(next_beams[i].infer_bs_id); - nb_shuffle_ids.push_back(i); - } - } - cpy_final_bs_ids.insert(cpy_final_bs_ids.end(), cpy_eos_bs_ids.begin(), cpy_eos_bs_ids.end()); - nb_shuffle_ids.insert(nb_shuffle_ids.end(), nb_eos_ids.begin(), nb_eos_ids.end()); - - // update indices and batch ids - for (int i = 0; i < beam_size; ++i) { - // update infer_bs_id before next beam generation - next_beams[nb_shuffle_ids[i]].infer_bs_id = i; - } - // beams should be ordered by batch id - std::sort(next_beams.begin(), next_beams.end(), [](beam& a, beam& b) { return a.infer_bs_id < b.infer_bs_id; }); - - // we arrange beams by inference batch indice rather score for memcpy time reduction - // so there will be 2 circumstances (ignore no memcpy : 0,1,2,3 --> 0,1,2,3) - // 1. cpoy former beams into latter beams, like: 0,1,2,3 --> 0,0,0,1 - // 2. 
copy latter beams into former beams, like: 0,1,2,3 -- > 1,2,2,3 - // kv cache memcpy happens in itself which would cause memory dislocation if follows wrong order - // so we give the contrary order to beams vector indice, which is: - // if 1, copy order is from tail to head - // if 2, copy order is from head to tail - bool cpy_from_head = true; - int dst_idx_sum = 0; - int src_idx_sum = 0; - for (int i = 0; i < cpy_final_bs_ids.size(); ++i) { - dst_idx_sum += i; - src_idx_sum += cpy_final_bs_ids[i]; - if (src_idx_sum < dst_idx_sum) { - cpy_from_head = false; - break; + int insert_idx = 0; + for (int i = cpy_final_bs_ids.size() - 1; i >= 0; --i) { + kv_reorder_indices[insert_idx + rb_off] = std::move(std::make_tuple(i, cpy_final_bs_ids[i])); + insert_idx++; + } } - } - if (cpy_from_head) { - for (int i = 0; i < cpy_final_bs_ids.size(); ++i) { - kv_reorder_indices.push_back({i, cpy_final_bs_ids[i]}); + // DEBUG +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("batch_%d cpy_final_bs_ids: ", rb); + for (int i = 0; i < beam_size; ++i) { + printf("%d, ", cpy_final_bs_ids[i]); + } + printf("\n"); + printf("copy order: "); + if (cpy_from_head) { + printf("copy_from_head ---> \n"); + } else { + printf("copy_from_tail <--- \n"); } - } else { - for (int i = cpy_final_bs_ids.size() - 1; i >= 0; --i) { - kv_reorder_indices.push_back({i, cpy_final_bs_ids[i]}); + if (rb == request_running_indices.size() - 1) { + printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); } +#endif } + return kv_reorder_indices; +} - // DEBUG -#ifdef NE_BEAM_SEARCH_VERBOSE_ON - printf("cpy_final_bs_ids: "); - for (int i = 0; i < beam_size; ++i) { - printf("%d, ", cpy_final_bs_ids[i]); - } - printf("\n"); - printf("nb_shuffle_ids: "); - for (int i = 0; i < beam_size; ++i) { - printf("%d, ", nb_shuffle_ids[i]); - } - printf("\n"); - printf("next_beams after: "); - for (int i = 0; i < beam_size; ++i) { - printf("%d, ", next_beams[i].infer_bs_id); +void beam_search_flow::update_status() { + // check if done + next_done_request_ids.clear(); + for (int h = 0; h < beam_hypos.size(); ++h) { + if (requests_done[h]) continue; + const bool enough_new_tokens = (cur_beams[h * beam_size].token_ids.size() == ctx->generation_conf.max_new_tokens); + if (beam_hypos[h].is_done() || enough_new_tokens) { + requests_done[h] = true; + next_done_request_ids.push_back(h); + // mark done beams of current request_idx + // batch reduction + std::for_each(cur_beams.begin() + h * beam_size, cur_beams.begin() + (h + 1) * beam_size, + [&](auto& b) { b.done = true; }); + } } - printf("\n"); - printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); -#endif - return kv_reorder_indices; } // Return beam with highest probability. 
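The copy-direction rule used by update_kv_cache_reorder_indices can be seen in isolation with a small in-place reorder over plain slots; the heuristic below mirrors the dst/src index-sum check above. reorder_in_place and its arguments are illustrative only, standing in for the per-beam KV blocks of a single request.

#include <cassert>
#include <vector>

// slots: one request's per-beam KV blocks (stand-in); src[i]: which slot destination slot i is copied from.
template <typename T>
void reorder_in_place(std::vector<T>& slots, const std::vector<int>& src) {
  assert(slots.size() == src.size());
  bool from_head = true;
  int dst_sum = 0, src_sum = 0;
  for (size_t i = 0; i < src.size(); ++i) {
    dst_sum += static_cast<int>(i);
    src_sum += src[i];
    if (src_sum < dst_sum) {  // some source sits before its destination -> copy from the tail
      from_head = false;
      break;
    }
  }
  if (from_head) {
    for (size_t i = 0; i < src.size(); ++i) slots[i] = slots[src[i]];
  } else {
    for (size_t i = src.size(); i-- > 0;) slots[i] = slots[src[i]];
  }
}
// src = {0, 0, 0, 1} triggers tail-first copies, src = {1, 2, 2, 3} head-first copies,
// matching the order of the (dst, src) tuples that update_kv_cache_reorder_indices emits per request.

Copying in the wrong direction would overwrite a source block before it has been read, which is exactly the memory-dislocation hazard the comment above warns about.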
-const beam& beam_search_flow::finalize() { +const beam& beam_search_flow::finalize(const int& request_idx) { #ifdef NE_BEAM_SEARCH_VERBOSE_ON printf("========================================================================================= \n"); - printf("finalize: \n"); + printf("request_idx_%d finalize:\n", request_idx); printf("before: \n"); - for (auto b : beam_hypos[0].beams) { + for (auto b : beam_hypos[request_idx].beams) { b.print(); } printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); #endif - if (!requests_done[0]) { - for (const auto b : cur_beams) { - beam_hypos[0].add(b, n_prompt_tokens); + if (!beam_hypos[request_idx].is_done()) { + for (int i = 0; i < beam_size; ++i) { + beam b = cur_beams[request_idx * beam_size + i]; + beam_hypos[request_idx].add(b, n_prompt_tokens[request_idx]); } #ifdef NE_BEAM_SEARCH_VERBOSE_ON printf("after (adding more beams from outside): \n"); - for (auto b : beam_hypos[0].beams) { + for (auto b : beam_hypos[request_idx].beams) { b.print(); } printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); - printf("========================================================================================= \n"); #endif } - return beam_hypos[0].top1(); + const beam& top_b = beam_hypos[request_idx].top1(); +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("final beam of request_idx %d:\n", request_idx); + top_b.print(); + printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); + printf("========================================================================================= \n"); +#endif + return top_b; } -// TODO batch_size = 1 only -// TODO batch prompt processing -std::vector beam_search_flow::loop(const model_token* tokens_inp, const int& n_tokens, - const int& n_threads) { - if (n_tokens > model_n_ctx(ctx)) { - fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, n_tokens, model_n_ctx(ctx) - 4); - return std::vector(); +const std::vector>& beam_search_flow::loop(const std::vector& inputs, + const int& n_threads) { + // n_past, n_tokens, n_prompt_tokens should be same among batches in static batching inference + n_tokens.assign(request_bs, inputs[0].n_tokens); + if (n_tokens[0] > model_n_ctx(ctx)) { + fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, n_tokens[0], model_n_ctx(ctx) - 4); + return response; } num_threads = n_threads; - n_prompt_tokens = n_tokens; - std::vector beam_search_response; - std::vector embd(tokens_inp, tokens_inp + n_tokens); + n_past.assign(request_bs, 0); + n_prompt_tokens.assign(request_bs, n_tokens[0]); + n_total.assign(request_bs, 0); + for (const auto& input : inputs) { + padding_side.push_back(input.padding_side); + n_padding.push_back(input.n_padding); + } - ctx->batch_size = 1; + ctx->batch_size = request_bs; + ctx->request_running_bs = request_bs; + for (int i = 0; i < request_bs; ++i) { + request_running_indices.push_back(i); + } const uint32_t max_new_tokens = ctx->generation_conf.max_new_tokens; // Loop ends in: 1. all requests done; or 2. reach max_new_tokens length @@ -2536,47 +2682,55 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c if (kv_reorder == nullptr) { kv_reorder = std::make_shared(ctx); #ifdef NE_BEAM_SEARCH_VERBOSE_ON - printf("WARNING: using default kv cache update function. \n"); + printf( + "WARNING: Using default kv cache update function. 
Ignore this warning if your K shape = [head_dim, N, n_head], " + "V shape = [N, head_dim, n_head]\n"); #endif } - beam_hypos.push_back(beam_hypotheses(ctx)); // TODO ctx->request_running_bs; - requests_done.push_back(false); for (int n = 0; n < max_new_tokens; ++n) { // first step - if (n_past == 0) { - model_eval(ctx, embd.data(), n_tokens, n_past, n_total, num_threads); - n_past += n_tokens; - n_total += n_tokens; - kv_reorder->update(n_past, n_tokens); - std::vector next_tokens = beam_top_k_next_tokens(ctx, 0, {0.0f}, {1}, {0}, beam_size); - MODEL_ASSERT(next_tokens.size() == beam_size); - cur_beams.clear(); + if (n_past[0] == 0) { + model_eval(ctx, inputs.data(), inputs.size(), num_threads); + std::for_each(n_past.begin(), n_past.end(), [&](auto& n) { n += n_tokens[0]; }); + std::for_each(n_total.begin(), n_total.end(), [&](auto& n) { n += n_tokens[0]; }); + kv_reorder->update(n_past, n_prompt_tokens, request_running_indices); + std::vector beam_scores(ctx->batch_size, 0.0f); + std::vector num_beams(ctx->request_running_bs, 1); + std::vector beam_indices(ctx->batch_size, 0); + std::vector next_tokens = + beam_top_k_next_tokens(ctx, beam_scores, num_beams, beam_indices, beam_size); + MODEL_ASSERT(next_tokens.size() == ctx->request_running_bs * beam_size); // DEBUG #ifdef NE_BEAM_SEARCH_VERBOSE_ON printf("========================================================================================== \n"); printf("top_k next_tokens: \n"); - for (auto kk : next_tokens) { - printf("%d: %s, score: %12.6f, beam_idx: %d \n", kk.id, (ctx->vocab.id_to_token.at(kk.id).tok).c_str(), - kk.score, kk.beam_idx); + int bb = 0; + for (int kk = 0; kk < next_tokens.size(); ++kk) { + if (kk % beam_size == 0) printf("------batch_%d------\n", bb++); + printf("%d: %s, score: %10.6f, beam_idx: %d \n", next_tokens[kk].id, + (ctx->vocab.id_to_token.at(next_tokens[kk].id).tok).c_str(), next_tokens[kk].score, + next_tokens[kk].beam_idx); } printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); #endif - for (int i = 0; i < beam_size; ++i) { - beam b; - b.ctx = ctx; - b.token_ids.push_back(next_tokens[i].id); - b.score = next_tokens[i].score; - b.infer_bs_id = i; - cur_beams.push_back(b); + for (int rb = 0; rb < request_running_indices.size(); ++rb) { + for (int i = 0; i < beam_size; ++i) { + beam b; + b.ctx = ctx; + b.token_ids.push_back(next_tokens[i + rb * beam_size].id); + b.score = next_tokens[i + rb * beam_size].score; + b.beam_idx = i; + b.request_idx = request_running_indices[rb]; + cur_beams[request_running_indices[rb] * beam_size + i] = std::move(b); + } } } else { fill_next_beams_by_top_scores(); std::vector> kv_reorder_indices = update_kv_cache_reorder_indices(); - n_past += 1; - n_total += 1; - kv_reorder->update(n_past, n_tokens, kv_reorder_indices, next_beams); + std::for_each(n_past.begin(), n_past.end(), [&](auto& n) { n++; }); + std::for_each(n_total.begin(), n_total.end(), [&](auto& n) { n++; }); + kv_reorder->update(n_past, n_prompt_tokens, request_running_indices, kv_reorder_indices, next_beams); cur_beams.swap(next_beams); - next_beams.clear(); } // DEBUG: print current beams for this iteration @@ -2590,41 +2744,22 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); #endif - // check if done - for (int h = 0; h < beam_hypos.size(); ++h) { - if (requests_done[h]) { - continue; - } - if (beam_hypos[h].is_done()) { - requests_done[h] 
= true; - } - } - auto const done_or_not = [](const bool& flag) { return flag; }; - if (std::all_of(requests_done.begin(), requests_done.end(), done_or_not)) { - break; + update_status(); + // collect request final generation result if done + for (const auto& didx : next_done_request_ids) { + const beam& top_b = finalize(didx); + response[didx] = top_b.token_ids; } + // return if all requests done in static batching + if (std::find(requests_done.begin(), requests_done.end(), false) == requests_done.end()) break; } - const beam& top_b = finalize(); - -#ifdef NE_BEAM_SEARCH_VERBOSE_ON // DEBUG: print final beam result - printf("========================================================================================= \n"); - printf("final beam:\n"); - top_b.print(); - printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); - printf("========================================================================================= \n"); -#endif - - beam_search_response.clear(); - for (const auto& id : top_b.token_ids) { - beam_search_response.push_back(id); - } - return beam_search_response; + return response; } -std::vector beam_search(model_context* lctx, const int& n_predict, const model_token* tokens_inp, - const int& n_tokens, const int& n_threads) { +std::vector> beam_search(model_context* lctx, const int& n_predict, + const std::vector& inputs, const int& n_threads) { lctx->generation_conf.max_new_tokens = n_predict; - beam_search_flow bsf(lctx); - return bsf.loop(tokens_inp, n_tokens, n_threads); + beam_search_flow bsf(lctx, inputs.size()); + return bsf.loop(inputs, n_threads); } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h index bb053b48477..dd87e1941bc 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h @@ -119,12 +119,12 @@ MODEL_API bool model_save_session_file(struct model_context* ctx, const char* pa size_t n_token_total); // Run the model inference to obtain the logits and probabilities for the next +// model_input has some necessary members for inference (more details please see model_types.h): // token. tokens + n_tokens is the provided batch of new tokens to process // n_past is the offset to which the kv is cached to // n_total is the number of tokens evaluated in previous eval calls // Returns 0 on success -MODEL_API int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads); +MODEL_API int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads); // Convert the provided text into tokens. // The tokens pointer must be large enough to hold the resulting tokens. 
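As a usage sketch of the new model_eval entry point: one model_input per sequence, with all rows already left-padded to a common length for static batching. model_input and model_token are declared in model_types.h; build_prefill_inputs is a hypothetical helper, not part of the runtime, and the pointed-to token rows must outlive the call.

#include <cstdint>
#include <vector>
// model_input / model_token come from models/model_utils/model_types.h (include path depends on the build).

std::vector<model_input> build_prefill_inputs(const std::vector<std::vector<model_token>>& padded_rows,
                                              const std::vector<uint32_t>& n_padding) {
  std::vector<model_input> inputs;
  inputs.reserve(padded_rows.size());
  for (size_t r = 0; r < padded_rows.size(); ++r) {
    inputs.push_back(model_input{
        /*.tokens          =*/padded_rows[r].data(),
        /*.n_tokens        =*/static_cast<uint32_t>(padded_rows[r].size()),
        /*.n_prompt_tokens =*/static_cast<uint32_t>(padded_rows[r].size()),
        /*.n_past          =*/0,
        /*.n_total         =*/0,
        /*.request_idx     =*/static_cast<int>(r),
        /*.beam_idx        =*/0,
        /*.padding_side    =*/0,  // left padding, the only side supported by the decoder-only path
        /*.n_padding       =*/n_padding[r],
    });
  }
  return inputs;
}
// A batched prefill is then: model_eval(ctx, inputs.data(), inputs.size(), n_threads);

The decoding steps reuse the same struct with n_tokens = 1 and a growing n_past, as fill_next_beams_by_top_scores does above.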
@@ -259,31 +259,64 @@ MODEL_API const char* model_print_system_info(void); } #endif +/* kv cache utils */ +// kv cache both stores permuted tensor +// k shape is [head_dim, N, n_head] +// v shape is [N, head_dim, n_head] or [N, n_embd] +/* kv cache utils */ + +// copy consecutive tokens from one seq to another +MODEL_API void model_kv_cache_seq_cpy(struct model_context* ctx, const model_seq_id& seq_id_src, + const model_seq_id& seq_id_dst, const model_pos& p0, const model_pos& p1); + +// concat several seqs into a continuous batch from kv cache +MODEL_API ne_tensor* model_kv_cache_seq_concat(struct ne_cgraph* cgraph, struct model_context* moctx, + struct ne_context* nectx, const int64_t& ne0, const int64_t& ne1, + const int64_t& ne2, const int64_t& ne3, + const std::vector& block_ids, const int& layer_idx, + const bool& concat_k = true); + /* beam search utils */ #define NEG_INF -std::numeric_limits::max() typedef struct beam_next_token { - model_token id; // token id - float score; // score of the token - int beam_idx; // token in which beam (-1 means unknown) + model_token id = -1; // token id + float score = 0.0f; // score of the token + int beam_idx = -1; // token in which beam (-1 means unknown) } beam_next_token; struct beam { const model_context* ctx = nullptr; std::vector token_ids; // Cumulative beam score (log-softmax here) - float score; - // record inference batch indice - int infer_bs_id; + float score = 0.0f; + // record related indices + // 0 - request_bs-1 + int request_idx = -1; + // 0 - num_beams-1 + int beam_idx = -1; + // if stop generation (append new token_id) + bool done = false; + // end-of-text const bool eos() const { return !token_ids.empty() && token_ids.back() == ctx->vocab.eos_token_id; } + void print() const { - printf("length: %d, score: %12.6f, eos: %d, tokens:\n", token_ids.size(), score, eos()); + printf("length: %d, score: %12.6f, eos: %d, request_idx: %d, beam_idx: %d, done: %d, tokens:\n", token_ids.size(), + score, eos(), request_idx, beam_idx, done); for (const auto& id : token_ids) { printf("%d: %s, ", id, model_token_to_str(ctx, id)); } printf("\n"); } + + void clear() { + token_ids.clear(); + score = 0.0f; + request_idx = -1; + beam_idx = -1; + done = false; + } }; struct beam_hypotheses { @@ -345,6 +378,8 @@ struct beam_hypotheses { auto const by_score = [](beam const& a, beam const& b) { return a.score < b.score; }; return *std::max_element(beams.begin(), beams.end(), by_score); } + + void clear() { beams.clear(); } }; struct logits_info; @@ -354,8 +389,8 @@ class logits_processor { explicit logits_processor(model_context* lctx) : ctx(lctx), min_new_tokens(lctx->generation_conf.min_new_tokens) {} ~logits_processor() {} - void process(const uint32_t& cur_len, const model_vocab::id& eos_token_id); - void min_new_tokens_logits_process(const uint32_t& cur_len, const model_vocab::id& eos_token_id); + void process(const std::vector& cur_lens, const model_vocab::id& eos_token_id); + void min_new_tokens_logits_process(const std::vector& cur_lens, const model_vocab::id& eos_token_id); private: model_context* ctx = nullptr; @@ -370,66 +405,83 @@ class logits_processor { class beam_search_kv_cache_reorder { public: explicit beam_search_kv_cache_reorder(model_context* lctx) - : ctx(lctx), - n_ctx(lctx->n_ctx), - n_embd(lctx->model.hparams.n_embd), - head_dim(lctx->model.hparams.n_embd / lctx->model.hparams.n_head), - n_head(lctx->model.hparams.n_head), - kv_n_ctx_block(lctx->kv_n_ctx_block) {} - ~beam_search_kv_cache_reorder() {} - - virtual void 
update(const uint32_t& n_past, const uint32_t& n_prompt_tokens, + : ctx(lctx), n_ctx(lctx->n_ctx), kv_n_ctx_block(lctx->kv_n_ctx_block) {} + virtual ~beam_search_kv_cache_reorder() {} + + virtual void update(const std::vector& n_past, const std::vector& n_prompt_tokens, + const std::vector request_running_indices, const std::vector>& kv_reorder_indices = {}, const std::vector& next_beams = {}); protected: model_context* ctx = nullptr; const uint32_t n_ctx; - const uint32_t n_embd; - // const uint32_t n_head_kv; - const uint32_t head_dim; - const uint32_t n_head; const uint32_t kv_n_ctx_block; }; class beam_search_flow { public: - explicit beam_search_flow(model_context* lctx) : ctx(lctx), beam_size(lctx->beam_size), lp(logits_processor(lctx)) { - cur_beams.reserve(beam_size); - next_beams.reserve(beam_size); - cur_beams.push_back({ctx, {}, 0.0f}); + explicit beam_search_flow(model_context* lctx, const int batch_size = 1) + : ctx(lctx), beam_size(lctx->beam_size), request_bs(batch_size), lp(logits_processor(lctx)) { + cur_beams.resize(batch_size * beam_size); + next_beams.resize(batch_size * beam_size); + for (int i = 0; i < batch_size; ++i) { + beam_hypos.push_back(std::move(beam_hypotheses(lctx))); + } + response.resize(batch_size); + requests_done.assign(batch_size, false); + request_running_indices.reserve(batch_size); + next_done_request_ids.reserve(batch_size); + n_tokens.reserve(batch_size); + n_past.reserve(batch_size); + n_prompt_tokens.reserve(batch_size); + n_total.reserve(batch_size); + padding_side.reserve(batch_size); + n_padding.reserve(batch_size); } ~beam_search_flow() {} // public interface - std::vector loop(const model_token* tokens_inp, const int& n_tokens, const int& n_threads); + // static batching (padding inputs or batch = 1) + const std::vector>& loop(const std::vector& inputs, const int& n_threads); + // continuous batching (scheduling from the outside) + void step(model_token* dst); // TODO one step private: - std::vector beam_top_k_next_tokens(model_context* ctx, const uint32_t& cur_len, - const std::vector& beams_score, + std::vector beam_top_k_next_tokens(model_context* ctx, const std::vector& beams_score, const std::vector& num_beams, const std::vector beam_indices, const int& sample_scale = 2, const int& dim = -1); void fill_next_beams_by_top_scores(); std::vector> update_kv_cache_reorder_indices(); - const beam& finalize(); + void update_status(); + const beam& finalize(const int& request_idx); model_context* ctx = nullptr; const int beam_size; + const int request_bs; // could be the max bs in continuous batching mechanism std::vector cur_beams; std::vector next_beams; std::vector beam_hypos; std::vector requests_done; - uint32_t n_past = 0; - uint32_t n_total = 0; - uint32_t n_prompt_tokens = 0; + std::vector request_running_indices; + std::vector next_done_request_ids; + std::vector n_tokens; + std::vector n_past; + std::vector n_prompt_tokens; + std::vector n_total; + std::vector padding_side; + std::vector n_padding; int num_threads = 4; // default by 4 logits_processor lp; std::shared_ptr kv_reorder; + std::vector> response; }; -MODEL_API std::vector beam_search(model_context* lctx, const int& n_predict, const model_token* tokens_inp, - const int& n_tokens, const int& n_threads); +// static batching generation +MODEL_API std::vector> beam_search(model_context* lctx, const int& n_predict, + const std::vector& inputs, + const int& n_threads); // Internal API to be implemented by model.cpp and used by tests/benchmarks only #ifdef MODEL_API_INTERNAL diff 
--git a/intel_extension_for_transformers/llm/runtime/graph/models/mpt/mpt.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/mpt/mpt.cpp index 48987264152..11f651d7031 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/mpt/mpt.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/mpt/mpt.cpp @@ -40,18 +40,20 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // -static bool mpt_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool mpt_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { const int64_t t_start_us = ne_time_us(); - const int N = n_tokens; - + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; const int batch_size = lctx.batch_size; + MODEL_ASSERT(batch_size == n_input); const auto& model = lctx.model; const auto& hparams = model.hparams; @@ -112,7 +114,9 @@ static bool mpt_model_eval_internal(model_context& lctx, const model_token* toke struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N); ne_set_name(embd, "embd"); - memcpy(embd->data, tokens, N * ne_element_size(embd)); + for (int i = 0; i < batch_size; ++i) { + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); + } struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); @@ -352,9 +356,8 @@ static bool mpt_model_eval_internal(model_context& lctx, const model_token* toke return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!mpt_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!mpt_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/opt/opt.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/opt/opt.cpp index f9122aa5bc8..c2c17d1d1ed 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/opt/opt.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/opt/opt.cpp @@ -37,18 +37,22 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // #define OPT_POS_EMBD_OFFS 2 -static bool opt_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool opt_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { const int64_t t_start_us = ne_time_us(); - const int N = n_tokens; + // TODO static batching for now + 
diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/opt/opt.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/opt/opt.cpp
index f9122aa5bc8..c2c17d1d1ed 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/models/opt/opt.cpp
+++ b/intel_extension_for_transformers/llm/runtime/graph/models/opt/opt.cpp
@@ -37,18 +37,22 @@
 // evaluate the transformer
 //
 //   - lctx:      model context
-//   - tokens:    new batch of tokens to process
-//   - n_past:    the offset to which the kv is cached to
-//   - n_total:   the number of tokens evaluated so far (including evicted tokens if there is any)
+//   - inputs:    model_input array
+//   - n_input:   num of model_input
 //   - n_threads: number of threads to use
 //
 #define OPT_POS_EMBD_OFFS 2
-static bool opt_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens,
-                                    const int n_past, const int n_total, const int n_threads) {
+static bool opt_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input,
+                                    const int n_threads) {
   const int64_t t_start_us = ne_time_us();
-  const int N = n_tokens;
+  // TODO static batching for now
+  const int N = inputs->n_tokens;
+  const int n_past = inputs->n_past;
+  const int n_total = inputs->n_total;
+  const int batch_size = lctx.batch_size;
+  MODEL_ASSERT(batch_size == n_input);
 
   const auto& model = lctx.model;
   const auto& hparams = model.hparams;
@@ -84,7 +88,9 @@ static bool opt_model_eval_internal(model_context& lctx, const model_token* toke
   struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
   ne_set_name(embd, "embd");
-  memcpy(embd->data, tokens, N * ne_element_size(embd));
+  for (int i = 0; i < batch_size; ++i) {
+    memcpy(static_cast<model_token*>(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd));
+  }
 
   /* class OPTLearnedPositionalEmbedding(nn.Embedding)
      attention_mask = attention_mask.long()
@@ -366,9 +372,8 @@ static bool opt_model_eval_internal(model_context& lctx, const model_token* toke
   return true;
 }
 
-int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total,
-               int n_threads) {
-  if (!opt_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) {
+int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) {
+  if (!opt_model_eval_internal(*ctx, inputs, n_input, n_threads)) {
     fprintf(stderr, "%s: failed to eval\n", __func__);
     return 1;
   }
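The same model_eval signature change is applied to opt.cpp above and starcoder.cpp below: callers now hand over an array of model_input whose length must equal the batch_size the context was created with (enforced by MODEL_ASSERT). A hedged caller-side sketch; the function name eval_two_prompts and the zero-initialization of model_input are assumptions rather than code from this patch:

    // Sketch only: evaluate the prompt tokens of two requests in one call.
    void eval_two_prompts(model_context* ctx, const model_token* ids_a, const model_token* ids_b,
                          const int n_tokens, const int n_threads) {
      model_input inputs[2] = {};     // assumes model_input is default/zero-initializable
      inputs[0].tokens = ids_a;
      inputs[0].n_tokens = n_tokens;  // both requests share N under static batching
      inputs[1].tokens = ids_b;
      inputs[1].n_tokens = n_tokens;
      // n_past / n_total stay 0 for the first (prompt) evaluation
      model_eval(ctx, inputs, /*n_input=*/2, n_threads);
    }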
diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/starcoder/starcoder.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/starcoder/starcoder.cpp
index 36ca2545598..d98f6fb3373 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/models/starcoder/starcoder.cpp
+++ b/intel_extension_for_transformers/llm/runtime/graph/models/starcoder/starcoder.cpp
@@ -40,18 +40,21 @@
 // evaluate the transformer
 //
 //   - lctx:      model context
-//   - tokens:    new batch of tokens to process
-//   - n_past:    the offset to which the kv is cached to
-//   - n_total:   the number of tokens evaluated so far (including evicted tokens if there is any)
+//   - inputs:    model_input array
+//   - n_input:   num of model_input
 //   - n_threads: number of threads to use
 //
-static bool starcoder_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens,
-                                          const int n_past, const int n_total, const int n_threads) {
+static bool starcoder_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input,
+                                          const int n_threads) {
   const int64_t t_start_us = ne_time_us();
-  const int N = n_tokens;
+  // TODO static batching for now
+  const int N = inputs->n_tokens;
+  const int n_past = inputs->n_past;
+  const int n_total = inputs->n_total;
   const int batch_size = lctx.batch_size;
+  MODEL_ASSERT(batch_size == n_input);
 
   const auto& model = lctx.model;
   const auto& hparams = model.hparams;
@@ -113,7 +116,9 @@ static bool starcoder_model_eval_internal(model_context& lctx, const model_token
   struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
   ne_set_name(embd, "embd");
-  memcpy(embd->data, tokens, N * ne_element_size(embd));
+  for (int i = 0; i < batch_size; ++i) {
+    memcpy(static_cast<model_token*>(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd));
+  }
 
   struct ne_tensor* position = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
   for (int i = 0; i < N; ++i) {
@@ -438,9 +443,8 @@ static bool starcoder_model_eval_internal(model_context& lctx, const model_token
   return true;
 }
 
-int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total,
-               int n_threads) {
-  if (!starcoder_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) {
+int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) {
+  if (!starcoder_model_eval_internal(*ctx, inputs, n_input, n_threads)) {
     fprintf(stderr, "%s: failed to eval\n", __func__);
     return 1;
   }
diff --git a/tests/test_llm_runtime.py b/tests/test_llm_runtime.py
index 4b147bf7f50..315c848224c 100644
--- a/tests/test_llm_runtime.py
+++ b/tests/test_llm_runtime.py
@@ -4,7 +4,9 @@
 import unittest
 
 from transformers import AutoTokenizer, TextStreamer
-from intel_extension_for_transformers.transformers import AutoModel, WeightOnlyQuantConfig
+from intel_extension_for_transformers.transformers import AutoModel, WeightOnlyQuantConfig, AutoModelForCausalLM
+from intel_extension_for_transformers.llm.runtime.graph.scripts.convert import convert_model
+from intel_extension_for_transformers.llm.runtime.graph import Model
 
 
 class TestLLMRUNTIME(unittest.TestCase):
@@ -16,6 +18,7 @@ def setUpClass(cls):
 
     @classmethod
     def tearDownClass(cls) -> None:
        shutil.rmtree("./ne_chatglm_q.bin", ignore_errors=True)
+        shutil.rmtree("./gptj_fp32.bin", ignore_errors=True)
 
     def test_llm_runtime(self):
@@ -28,4 +31,39 @@ def test_llm_runtime(self):
 
         streamer = TextStreamer(tokenizer)
         model = AutoModel.from_pretrained(model_name, quantization_config=woq_config, use_llm_runtime=True, trust_remote_code=True)
-        gen_tokens = model.generate(input_ids, streamer=streamer, max_new_tokens=300)
\ No newline at end of file
+        gen_tokens = model.generate(input_ids, streamer=streamer, max_new_tokens=300)
+
+    def test_beam_search(self):
+        model_name = "/tf_dataset2/models/pytorch/gpt-j-6B"  # or local path to model
+        prompts = [
+           "she opened the door and see",
+           "tell me 10 things about jazz music",
+           "What is the meaning of life?",
+           "To be, or not to be, that is the question: Whether 'tis nobler in the mind to suffer"\
+           " The slings and arrows of outrageous fortune, "\
+           "Or to take arms against a sea of troubles."\
+           "And by opposing end them. To die—to sleep,"
+        ]
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,
+                                                  padding_side="left")
+        tokenizer.pad_token = tokenizer.eos_token
+        pad_token = tokenizer(tokenizer.pad_token)['input_ids'][0]
+        inputs = tokenizer(prompts, padding=True, return_tensors='pt')
+
+        # pytorch fp32
+        pt_model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+        pt_model.eval()
+        pt_generate_ids = pt_model.generate(**inputs, max_new_tokens=128, min_new_tokens=30,
+                                            early_stopping=True, num_beams=4).tolist()
+        # llm runtime fp32
+        convert_model(model_name, "gptj_fp32.bin", "f32")
+        itrex_model = Model()
+        itrex_model.init_from_bin("gptj", "gptj_fp32.bin", batch_size=4, num_beams=4,
+                                  max_new_tokens=128, min_new_tokens=30, early_stopping=True,
+                                  pad_token=pad_token)
+        itrex_generate_ids = itrex_model.generate(inputs.input_ids, batch_size=4, num_beams=4,
+                                                  max_new_tokens=128, min_new_tokens=30, early_stopping=True,
+                                                  pad_token=pad_token)
+        for i in range(len(itrex_generate_ids)):
+            self.assertListEqual(pt_generate_ids[i], itrex_generate_ids[i])