From 224656755309ed86ff820f3451199f3c8ed0d5f8 Mon Sep 17 00:00:00 2001 From: zhentaoyu Date: Fri, 10 Nov 2023 10:11:46 +0800 Subject: [PATCH] [LLM Runtime] Unify KV_cache and Support Batch-dim Process in Beam Search (#583) --- .../llm/runtime/graph/__init__.py | 13 +- .../runtime/graph/application/main_pybind.cpp | 111 ++- .../runtime/graph/application/main_run.cpp | 39 +- .../runtime/graph/application/pybind_gptj.cpp | 37 +- .../llm/runtime/graph/core/ne.h | 2 +- .../llm/runtime/graph/core/ne_layers.c | 45 +- .../llm/runtime/graph/core/ne_layers.h | 10 +- .../graph/models/baichuan/baichuan.cpp | 25 +- .../llm/runtime/graph/models/bloom/bloom.cpp | 25 +- .../runtime/graph/models/chatglm/chatglm.cpp | 26 +- .../runtime/graph/models/chatglm/chatglm2.cpp | 25 +- .../runtime/graph/models/falcon/falcon.cpp | 25 +- .../llm/runtime/graph/models/gptj/gptj.cpp | 100 +-- .../runtime/graph/models/gptneox/gptneox.cpp | 69 +- .../graph/models/gptneox/gptneox_utils.cpp | 123 +-- .../llm/runtime/graph/models/llama/llama.cpp | 29 +- .../graph/models/model_utils/model_types.h | 48 ++ .../graph/models/model_utils/model_utils.cpp | 739 +++++++++++------- .../graph/models/model_utils/model_utils.h | 126 ++- .../llm/runtime/graph/models/mpt/mpt.cpp | 25 +- .../llm/runtime/graph/models/opt/opt.cpp | 25 +- .../graph/models/starcoder/starcoder.cpp | 24 +- tests/test_llm_runtime.py | 42 +- 23 files changed, 1057 insertions(+), 676 deletions(-) diff --git a/intel_extension_for_transformers/llm/runtime/graph/__init__.py b/intel_extension_for_transformers/llm/runtime/graph/__init__.py index aaeab8d16a7..0265809bde9 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/__init__.py +++ b/intel_extension_for_transformers/llm/runtime/graph/__init__.py @@ -97,7 +97,8 @@ def quant_model(self, model_name, model_path, out_path, **kwargs): def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=False, **kwargs): if self.model is None: - self.init_from_bin(self.model_type, self.bin_file, **kwargs) + self.init_from_bin(self.model_type, self.bin_file, batch_size=input_ids.shape[0], + **kwargs) self.generate_round = 0 elif not interactive: self.model.reinit() @@ -107,12 +108,13 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa if self.generate_round == 0 and not ignore_prompt: ret = input_ids.tolist() - # TODO support multi batch - assert input_ids.shape[0] == 1, "Unsupport multi-batch input ids." beam_search = False if ("num_beams" in kwargs and kwargs["num_beams"] > 1) and not \ kwargs.get("do_sample", False): beam_search = True + if not beam_search: + # TODO support multi batch + assert input_ids.shape[0] == 1, "Unsupport multi-batch input ids." 
if streamer: if beam_search: print("ERROR, can not use streamer when use beam search for generation!") @@ -130,7 +132,10 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa ret[0].extend(out) streamer.end() else: - ret[0].extend(self.model.generate_tokens(input_ids = input_ids.tolist()[0])) + response = self.model.generate_tokens(input_ids = input_ids.tolist()) + assert (len(ret) == len(response)) + for i in range(len(response)): + ret[i].extend(response[i]) self.generate_round += 1 return ret diff --git a/intel_extension_for_transformers/llm/runtime/graph/application/main_pybind.cpp b/intel_extension_for_transformers/llm/runtime/graph/application/main_pybind.cpp index f5e41098c80..b6e4d7378d7 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/application/main_pybind.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/application/main_pybind.cpp @@ -55,13 +55,13 @@ class Model { ~Model() { if (ctx) model_free(ctx); } - void init_model(const std::string& model_path, int n_predict, int batch_size, int ctx_size, int seed, int threads, + void init_model(const std::string& model_path, int n_predict, int n_batch, int ctx_size, int seed, int threads, float repetition_penalty, int num_beams, bool do_sample, int top_k, float top_p, float temperature, int min_new_tokens, float length_penalty, bool early_stopping, int n_keep, int n_discard, - bool shift_roped_k); + bool shift_roped_k, int batch_size, model_vocab::id pad_token); void reinit(); std::vector generate(const std::vector& input_ids); - std::vector generate_tokens(const std::vector& input_ids); + std::vector> generate_tokens(const std::vector>& input_ids); bool is_token_end() { return token_eos; } static int quant_model(const std::string& model_path, const std::string& out_path, const std::string& weight_dtype, const std::string& alg, int group_size, const std::string& scale_dtype, @@ -86,28 +86,29 @@ class Model { model_token post_process(float* logits); model_token post_greedy_search(float* logits); - std::vector post_beam_search(model_context* lctx, const int& n_predict, const model_token* tokens_inp, - const int& n_tokens, const int& n_threads); + std::vector> post_beam_search(model_context* lctx, const int& n_predict, + const std::vector& inputs, const int& n_threads); model_token post_sample_top_k_top_p_repeat(float* logits); }; -void Model::init_model(const std::string& model_path, int max_new_tokens, int batch_size, int ctx_size, int seed, +void Model::init_model(const std::string& model_path, int max_new_tokens, int n_batch, int ctx_size, int seed, int threads, float repetition_penalty, int num_beams, bool do_sample, int top_k, float top_p, float temperature, int min_new_tokens, float length_penalty, bool early_stopping, int n_keep, - int n_discard, bool shift_roped_k) { + int n_discard, bool shift_roped_k, int batch_size, model_vocab::id pad_token) { #ifdef MODEL_NAME params.model_name = MODEL_NAME; #endif params.model_arch = model_name_to_arch::init().find(params.model_name); params.model = model_path; params.n_predict = max_new_tokens; - params.n_batch = batch_size; + params.n_batch = n_batch; params.n_ctx = ctx_size; params.seed = seed; params.n_threads = threads; params.repeat_penalty = repetition_penalty; params.beam_size = num_beams; params.do_sample = do_sample; + params.batch_size = batch_size; params.beam_search = (num_beams > 1 && !do_sample) ? 
true : false; if (params.beam_search) { params.memory_type = KV_MEM_TYPE_F16; // TODO NO MHA IN BEAM SEARCH @@ -133,6 +134,7 @@ void Model::init_model(const std::string& model_path, int max_new_tokens, int ba ctx->generation_conf.min_new_tokens = min_new_tokens; ctx->generation_conf.length_penalty = length_penalty; ctx->generation_conf.do_early_stopping = early_stopping; + if (pad_token != -1) ctx->vocab.pad_token_id = pad_token; } void Model::reinit() { @@ -177,7 +179,18 @@ std::vector Model::generate(const std::vector& input_i NE_ASSERT(("n_discard cannot be used with shift_roped_k!", n_discard == -1 || n_discard == 1)); } } - model_eval(ctx, &curr_input_ids[0], curr_input_ids.size(), n_past, n_total, params.n_threads); + std::vector inputs = {model_input{ + /*.tokens =*/curr_input_ids.data(), + /*.n_tokens =*/(uint32_t)curr_input_ids.size(), + /*.n_prompt_tokens =*/0, + /*.n_past =*/(uint32_t)n_past, + /*.n_total =*/(uint32_t)n_total, + /*.request_idx =*/0, + /*.beam_idx =*/0, + /*.padding_side =*/0, + /*n_padding =*/0, + }}; + model_eval(ctx, inputs.data(), inputs.size(), params.n_threads); n_past += curr_input_ids.size(); n_total += curr_input_ids.size(); @@ -196,18 +209,52 @@ std::vector Model::generate(const std::vector& input_i return {next_token_id}; } -std::vector Model::generate_tokens(const std::vector& input_ids) { +std::vector> Model::generate_tokens(const std::vector>& input_ids) { int n_remain = params.n_predict; std::vector output_ids; + std::vector> rets; + + if (ctx->beam_search) { + MODEL_ASSERT(input_ids.size() == ctx->batch_size); + if (ctx->batch_size > 1 && ctx->vocab.pad_token_id == -1) { + fprintf(stderr, "\nERROR: please set pad_token for beam search multi-batch generation!\n"); + return rets; + } + std::vector inputs; + for (int bs = 0; bs < input_ids.size(); ++bs) { + uint32_t count = 0; + model_vocab::id pad_token_id = ctx->vocab.pad_token_id; + auto iter = std::find_if(input_ids[bs].begin(), input_ids[bs].end(), + [&pad_token_id](model_token t) { return (t != pad_token_id); }); + if (iter == input_ids[bs].end()) fprintf(stderr, "\nERROR: there are all pad tokens in batch %d!\n", bs); + count = std::distance(input_ids[bs].begin(), iter); + inputs.push_back(model_input{ + /*.tokens =*/input_ids[bs].data(), + /*.n_tokens =*/(uint32_t)input_ids[bs].size(), + /*.n_prompt_tokens =*/0, + /*.n_past =*/0, + /*.n_total =*/0, + /*.request_idx =*/bs, + /*.beam_idx =*/0, + /*.padding_side =*/0, + /*n_padding =*/count, + }); + } + return post_beam_search(ctx, n_remain, inputs, params.n_threads); + } + if (input_ids.size() > 1) { + fprintf(stderr, "\nERROR: Only beam search supports multi-batch generation!\n"); + return rets; + } if (curr_input_ids.empty()) { - if (input_ids.size() > n_ctx - 4) { + if (input_ids[0].size() > n_ctx - 4) { fprintf(stderr, "\n%s: Warning: prompt is too long (%d tokens, max %d), will be truncated\n", __func__, - input_ids.size(), n_ctx - 4); + input_ids[0].size(), n_ctx - 4); curr_input_ids.resize(n_ctx - 4); - std::copy(input_ids.end() - n_ctx - 4, input_ids.end(), curr_input_ids.begin()); + std::copy(input_ids[0].end() - n_ctx - 4, input_ids[0].end(), curr_input_ids.begin()); } else { - curr_input_ids = input_ids; + curr_input_ids = input_ids[0]; } } @@ -231,11 +278,18 @@ std::vector Model::generate_tokens(const std::vector& NE_ASSERT(("n_discard cannot be used with shift_roped_k!", n_discard == -1 || n_discard == 1)); } } - if (ctx->beam_search) { - output_ids = post_beam_search(ctx, n_remain, curr_input_ids.data(), curr_input_ids.size(), 
params.n_threads); - break; - } - model_eval(ctx, &curr_input_ids[0], curr_input_ids.size(), n_past, n_total, params.n_threads); + std::vector inputs = {model_input{ + /*.tokens =*/curr_input_ids.data(), + /*.n_tokens =*/(uint32_t)curr_input_ids.size(), + /*.n_prompt_tokens =*/0, + /*.n_past =*/(uint32_t)n_past, + /*.n_total =*/(uint32_t)n_total, + /*.request_idx =*/0, + /*.beam_idx =*/0, + /*.padding_side =*/0, + /*n_padding =*/0, + }}; + model_eval(ctx, inputs.data(), inputs.size(), params.n_threads); n_past += curr_input_ids.size(); n_total += curr_input_ids.size(); @@ -253,8 +307,8 @@ std::vector Model::generate_tokens(const std::vector& break; } } - - return output_ids; + rets.push_back(output_ids); + return rets; } model_token Model::post_greedy_search(float* logits) { @@ -262,16 +316,16 @@ model_token Model::post_greedy_search(float* logits) { return id; } -std::vector Model::post_beam_search(model_context* lctx, const int& n_predict, - const model_token* tokens_inp, const int& n_tokens, - const int& n_threads) { +std::vector> Model::post_beam_search(model_context* lctx, const int& n_predict, + const std::vector& inputs, + const int& n_threads) { // TODO: to implement static std::set supported_archs = {MODEL_GPTJ, MODEL_GPTNEOX}; if (supported_archs.count(params.model_arch) != 0) { - return beam_search(lctx, n_predict, tokens_inp, n_tokens, n_threads); + return beam_search(lctx, n_predict, inputs, n_threads); } else { fprintf(stderr, "\nERROR: this model does not support beam search generation!\n"); - return std::vector(); + return std::vector>(); } } @@ -416,11 +470,12 @@ PYBIND11_MODULE(mistral_cpp, m) py::class_(m, "Model", py::module_local()) .def(py::init()) .def("init_model", &Model::init_model, "initial model with model path and parameters", py::arg("model_path"), - py::arg("max_new_tokens") = -1, py::arg("batch_size") = 512, py::arg("ctx_size") = 512, py::arg("seed") = -1, + py::arg("max_new_tokens") = -1, py::arg("n_batch") = 512, py::arg("ctx_size") = 512, py::arg("seed") = -1, py::arg("threads") = 8, py::arg("repetition_penalty") = 1.1f, py::arg("num_beams") = 1, py::arg("do_sample") = false, py::arg("top_k") = 40, py::arg("top_p") = 0.95, py::arg("temperature") = 0.8, py::arg("min_new_tokens") = 0, py::arg("length_penalty") = 1.0, py::arg("early_stopping") = false, - py::arg("n_keep") = 0, py::arg("n_discard") = -1, py::arg("shift_roped_k") = false) + py::arg("n_keep") = 0, py::arg("n_discard") = -1, py::arg("shift_roped_k") = false, + py::arg("batch_size") = 1, py::arg("pad_token") = -1) .def("generate", &Model::generate, "Generate token with input ids", py::arg("input_ids")) .def("generate_tokens", &Model::generate_tokens, "Generate tokens with input ids", py::arg("input_ids")) .def_static("quant_model", &Model::quant_model, "Quantize model", py::arg("model_path"), py::arg("out_path"), diff --git a/intel_extension_for_transformers/llm/runtime/graph/application/main_run.cpp b/intel_extension_for_transformers/llm/runtime/graph/application/main_run.cpp index ef43a7dbb34..8bb1cabee8d 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/application/main_run.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/application/main_run.cpp @@ -149,14 +149,36 @@ int main(int argc, char** argv) { if (params.mem_test) { { const std::vector tmp(params.n_batch, ctx->vocab.bos_token_id); - model_eval(ctx, tmp.data(), tmp.size(), 0, 0, params.n_threads); + std::vector inputs = {model_input{ + /*.tokens =*/tmp.data(), + /*.n_tokens =*/(uint32_t)tmp.size(), + 
/*.n_prompt_tokens =*/0, + /*.n_past =*/0, + /*.n_total =*/0, + /*.request_idx =*/0, + /*.beam_idx =*/0, + /*.padding_side =*/0, + /*n_padding =*/0, + }}; + model_eval(ctx, inputs.data(), inputs.size(), params.n_threads); } { const std::vector tmp = { 0, }; - model_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_predict - 1, params.n_threads); + std::vector inputs = {model_input{ + /*.tokens =*/tmp.data(), + /*.n_tokens =*/(uint32_t)tmp.size(), + /*.n_prompt_tokens =*/0, + /*.n_past =*/(uint32_t)(params.n_predict - 1), + /*.n_total =*/(uint32_t)(params.n_predict - 1), + /*.request_idx =*/0, + /*.beam_idx =*/0, + /*.padding_side =*/0, + /*n_padding =*/0, + }}; + model_eval(ctx, inputs.data(), inputs.size(), params.n_threads); } model_print_timings(ctx); @@ -436,7 +458,18 @@ int main(int argc, char** argv) { if (n_eval > params.n_batch) { n_eval = params.n_batch; } - if (model_eval(ctx, &embd[i], n_eval, n_past, n_total, params.n_threads)) { + std::vector inputs = {model_input{ + /*.tokens =*/&embd[i], + /*.n_tokens =*/(uint32_t)n_eval, + /*.n_prompt_tokens =*/0, + /*.n_past =*/(uint32_t)n_past, + /*.n_total =*/(uint32_t)n_total, + /*.request_idx =*/0, + /*.beam_idx =*/0, + /*.padding_side =*/0, + /*n_padding =*/0, + }}; + if (model_eval(ctx, inputs.data(), inputs.size(), params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp b/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp index 5e62505be75..93f0e7489c2 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp @@ -40,7 +40,16 @@ bool gptj_model_eval_ids(model_context* ctx, model_token* tokens, size_t n_eval, return 1; } - if (model_eval(ctx, tokens, n_eval, n_past, n_past, n_threads)) { + std::vector inputs = {model_input{ + /*.tokens =*/tokens, + /*.n_tokens =*/static_cast(n_eval), + /*.n_prompt_tokens =*/0, + /*.n_past =*/static_cast(n_past), + /*.n_total =*/static_cast(n_past), + /*.request_idx =*/0, + /*.beam_idx =*/0, + }}; + if (model_eval(ctx, inputs.data(), inputs.size(), n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); return 1; } @@ -98,7 +107,16 @@ int32_t* eval_gptj_ids(void* ctx, int32_t* embd_inp_ptr, int ind_size, int n_pre bool do_beam_search = lctx->beam_search; if (do_beam_search) { - res = beam_search(lctx, n_predict, embd_inp_ptr, ind_size, n_threads); + std::vector inputs = {model_input{ + /*.tokens =*/embd_inp_ptr, + /*.n_tokens =*/static_cast(ind_size), + /*.n_prompt_tokens =*/0, + /*.n_past =*/0, + /*.n_total =*/0, + /*.request_idx =*/0, + /*.beam_idx =*/0, + }}; + res = beam_search(lctx, n_predict, inputs, n_threads)[0]; } else { std::vector embd_inp(embd_inp_ptr, embd_inp_ptr + ind_size); std::vector embd; @@ -157,7 +175,18 @@ char* eval_gptj_char(void* ctx, const char* prom, int n_predict, int top_k, floa bool do_beam_search = lctx->beam_search; if (do_beam_search) { - embd = beam_search(lctx, n_predict, embd_inp.data(), embd_inp.size(), N_threads); + std::vector inputs = {model_input{ + /*.tokens =*/embd_inp.data(), + /*.n_tokens =*/static_cast(embd_inp.size()), + /*.n_prompt_tokens =*/0, + /*.n_past =*/0, + /*.n_total =*/0, + /*.request_idx =*/0, + /*.beam_idx =*/0, + /*.padding_side =*/0, + /*n_padding =*/0, + }}; + embd = beam_search(lctx, n_predict, inputs, N_threads)[0]; for (auto id : embd_inp) { res += 
model_token_to_str(lctx, id); } @@ -229,7 +258,7 @@ int main(int argc, char* argv[]) { for (auto gptj_in_all : ctxs) { auto res = eval_gptj_char( gptj_in_all, - // "she opened the door and see", + //"she opened the door and see", // "Once upon a time", // "Tell me 10 things about jazz music", // "A spaceship lands on the moon", diff --git a/intel_extension_for_transformers/llm/runtime/graph/core/ne.h b/intel_extension_for_transformers/llm/runtime/graph/core/ne.h index 4f1d9c21294..6af00376d02 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/core/ne.h +++ b/intel_extension_for_transformers/llm/runtime/graph/core/ne.h @@ -40,7 +40,7 @@ #define NE_FILE_VERSION 1 #define NE_MAX_DIMS 4 -#define NE_MAX_NODES 4096 +#define NE_MAX_NODES 8192 #define NE_MAX_PARAMS 256 #define NE_MAX_CONTEXTS 64 #define NE_MAX_OPT 4 diff --git a/intel_extension_for_transformers/llm/runtime/graph/core/ne_layers.c b/intel_extension_for_transformers/llm/runtime/graph/core/ne_layers.c index 7e4511314df..f4e296b99d5 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/core/ne_layers.c +++ b/intel_extension_for_transformers/llm/runtime/graph/core/ne_layers.c @@ -2832,7 +2832,9 @@ struct ne_tensor* ne_diag(struct ne_context* ctx, struct ne_tensor* a) { // ne_diag_mask_inf -struct ne_tensor* ne_diag_mask_inf_impl(struct ne_context* ctx, struct ne_tensor* a, int n_past, bool inplace) { +struct ne_tensor* ne_diag_mask_inf_impl(struct ne_context* ctx, struct ne_tensor* a, int n_past, bool inplace, + int* n_padding, bool padding_left) { + NE_ASSERT(padding_left); bool is_node = false; if (a->grad) { @@ -2843,10 +2845,18 @@ struct ne_tensor* ne_diag_mask_inf_impl(struct ne_context* ctx, struct ne_tensor ne_scratch_save(ctx); - struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, 2, NE_SIZE_CALC); + const int bs = a->ne[3]; + struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, 2 + bs, NE_SIZE_CALC); ((int32_t*)b->data)[0] = n_past; ((int32_t*)b->data)[1] = inplace ? 
1 : 0; + for (int i = 0; i < bs; ++i) { + if (n_padding == NULL) { + ((int32_t*)b->data)[2 + i] = 0; + } else { + ((int32_t*)b->data)[2 + i] = *(n_padding + i); + } + } ne_scratch_load(ctx); @@ -2859,11 +2869,21 @@ struct ne_tensor* ne_diag_mask_inf_impl(struct ne_context* ctx, struct ne_tensor } struct ne_tensor* ne_diag_mask_inf(struct ne_context* ctx, struct ne_tensor* a, int n_past) { - return ne_diag_mask_inf_impl(ctx, a, n_past, false); + return ne_diag_mask_inf_impl(ctx, a, n_past, false, NULL, true); } struct ne_tensor* ne_diag_mask_inf_inplace(struct ne_context* ctx, struct ne_tensor* a, int n_past) { - return ne_diag_mask_inf_impl(ctx, a, n_past, true); + return ne_diag_mask_inf_impl(ctx, a, n_past, true, NULL, true); +} + +struct ne_tensor* ne_diag_mask_inf_with_padding(struct ne_context* ctx, struct ne_tensor* a, int n_past, + int* n_padding) { + return ne_diag_mask_inf_impl(ctx, a, n_past, false, n_padding, true); +} + +struct ne_tensor* ne_diag_mask_inf_with_padding_inplace(struct ne_context* ctx, struct ne_tensor* a, int n_past, + int* n_padding) { + return ne_diag_mask_inf_impl(ctx, a, n_past, true, n_padding, true); } // ne_diag_mask_zero @@ -7302,7 +7322,8 @@ static void ne_compute_forward_diag(const struct ne_compute_params* params, cons static void ne_compute_forward_diag_mask_f32(const struct ne_compute_params* params, const struct ne_tensor* src0, const struct ne_tensor* src1, struct ne_tensor* dst, const float value) { assert(src1->type == NE_TYPE_I32); - assert(ne_nelements(src1) == 2); + const int bs = src0->ne[3]; + assert(ne_nelements(src1) == (2 + bs)); const int ith = params->ith; const int nth = params->nth; @@ -7334,6 +7355,18 @@ static void ne_compute_forward_diag_mask_f32(const struct ne_compute_params* par assert(dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); + // mask padding token (padding left) + for (int b = 0; b < bs; b++) { + const int n_padding = ((int32_t*)src1->data)[2 + b]; + if (n_padding == 0) continue; + for (int k = 0; k < (nz / bs); k++) { + for (int j = ith; j < nr; j += nth) { + // it will not affect next token if don't mask the pad_token row + ne_vec_set_f32(n_padding, (float*)((char*)dst->data + b * dst->nb[3] + k * dst->nb[2] + j * dst->nb[1]), value); + } + } + } + for (int k = 0; k < nz; k++) { for (int j = ith; j < nr; j += nth) { for (int i = n_past; i < nc; i++) { @@ -7410,7 +7443,7 @@ static void ne_compute_forward_soft_max_f32(const struct ne_compute_params* para float max = -INFINITY; ne_vec_max_f32(nc, &max, sp); - ne_float sum = 0.0; + ne_float sum = (max == -INFINITY) ? 
(1.0 * nc) : 0.0; uint16_t scvt; for (int i = 0; i < nc; i++) { diff --git a/intel_extension_for_transformers/llm/runtime/graph/core/ne_layers.h b/intel_extension_for_transformers/llm/runtime/graph/core/ne_layers.h index 5bc869abc5e..d82874b5539 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/core/ne_layers.h +++ b/intel_extension_for_transformers/llm/runtime/graph/core/ne_layers.h @@ -44,7 +44,7 @@ #define NE_QNT_VERSION_FACTOR 1000 // do not change this #define NE_MAX_DIMS 4 -#define NE_MAX_NODES 4096 +#define NE_MAX_NODES 8192 #define NE_MAX_PARAMS 256 #define NE_MAX_CONTEXTS 64 #define NE_MAX_OPT 4 @@ -379,6 +379,14 @@ NE_API struct ne_tensor* ne_diag_mask_inf(struct ne_context* ctx, struct ne_tens // in-place, returns view(a) NE_API struct ne_tensor* ne_diag_mask_inf_inplace(struct ne_context* ctx, struct ne_tensor* a, int n_past); +// set elements above the diagonal and padding tokens to -INF +NE_API struct ne_tensor* ne_diag_mask_inf_with_padding(struct ne_context* ctx, struct ne_tensor* a, int n_past, + int* n_padding); + +// in-place, returns view(a) +NE_API struct ne_tensor* ne_diag_mask_inf_with_padding_inplace(struct ne_context* ctx, struct ne_tensor* a, int n_past, + int* n_padding); + // set elements above the diagonal to 0 NE_API struct ne_tensor* ne_diag_mask_zero(struct ne_context* ctx, struct ne_tensor* a, int n_past); diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/baichuan/baichuan.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/baichuan/baichuan.cpp index 1ab7182d483..8f07e5c7db4 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/baichuan/baichuan.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/baichuan/baichuan.cpp @@ -41,18 +41,20 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // -static bool baichuan_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool baichuan_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { const int64_t t_start_us = ne_time_us(); - - const int N = n_tokens; + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; const int batch_size = lctx.batch_size; + MODEL_ASSERT(batch_size == n_input); const auto& model = lctx.model; const auto& hparams = model.hparams; @@ -114,7 +116,9 @@ static bool baichuan_model_eval_internal(model_context& lctx, const model_token* } struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N); - memcpy(embd->data, tokens, N * ne_element_size(embd)); + for (int i = 0; i < batch_size; ++i) { + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); + } struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); int hidden_size = inpL->ne[0]; @@ -329,9 +333,8 @@ static bool baichuan_model_eval_internal(model_context& lctx, const model_token* return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!baichuan_model_eval_internal(*ctx, 
tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!baichuan_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/bloom/bloom.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/bloom/bloom.cpp index bca8e2e2faa..7829647d089 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/bloom/bloom.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/bloom/bloom.cpp @@ -40,19 +40,21 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // -static bool bloom_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool bloom_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { const int64_t t_start_us = ne_time_us(); - - const int N = n_tokens; + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; const int batch_size = lctx.batch_size; + MODEL_ASSERT(batch_size == n_input); const auto& model = lctx.model; const auto& hparams = model.hparams; @@ -88,7 +90,9 @@ static bool bloom_model_eval_internal(model_context& lctx, const model_token* to struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N); ne_set_name(embd, "embd"); - memcpy(embd->data, tokens, N * ne_element_size(embd)); + for (int i = 0; i < batch_size; ++i) { + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); + } // wte struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); @@ -307,9 +311,8 @@ static bool bloom_model_eval_internal(model_context& lctx, const model_token* to return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!bloom_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!bloom_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm.cpp index 69d2e6c7b9b..131311eac9d 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm.cpp @@ -40,21 +40,24 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // static int flag = 0; static int first_tokens_size = 0; -static bool 
chatglm_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool chatglm_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { const int64_t t_start_us = ne_time_us(); - const int N = n_tokens; + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; const int batch_size = lctx.batch_size; + MODEL_ASSERT(batch_size == n_input); const auto& model = lctx.model; const auto& hparams = model.hparams; @@ -68,7 +71,7 @@ static bool chatglm_model_eval_internal(model_context& lctx, const model_token* const int n_keep = lctx.n_keep; if (flag == 0) { - first_tokens_size = n_tokens; + first_tokens_size = N; flag++; } @@ -95,7 +98,9 @@ static bool chatglm_model_eval_internal(model_context& lctx, const model_token* struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N); ne_set_name(embd, "embd"); - memcpy(embd->data, tokens, N * ne_element_size(embd)); + for (int i = 0; i < batch_size; ++i) { + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); + } struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); @@ -304,9 +309,8 @@ static bool chatglm_model_eval_internal(model_context& lctx, const model_token* return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!chatglm_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!chatglm_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm2.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm2.cpp index 840b23c6c6b..e1240d5e84d 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm2.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm2.cpp @@ -41,20 +41,24 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // -static bool chatglm_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool chatglm_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { const int64_t t_start_us = ne_time_us(); - const int N = n_tokens; + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; const auto& model = lctx.model; const auto& hparams = model.hparams; + const int batch_size = lctx.batch_size; + MODEL_ASSERT(batch_size == n_input); const auto& kv_self = model.kv_self; @@ -118,7 +122,9 @@ static bool chatglm_model_eval_internal(model_context& lctx, const model_token* struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N); ne_set_name(embd, "embd"); - 
memcpy(embd->data, tokens, N * ne_element_size(embd)); + for (int i = 0; i < batch_size; ++i) { + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); + } struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); NE_ASSERT(N == inpL->ne[1]); @@ -359,9 +365,8 @@ static bool chatglm_model_eval_internal(model_context& lctx, const model_token* return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!chatglm_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!chatglm_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/falcon/falcon.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/falcon/falcon.cpp index b5a18717c00..d7d88fe215d 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/falcon/falcon.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/falcon/falcon.cpp @@ -40,19 +40,21 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // -static bool falcon_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool falcon_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { const int64_t t_start_us = ne_time_us(); - - const int N = n_tokens; + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; const int batch_size = lctx.batch_size; + MODEL_ASSERT(batch_size == n_input); const auto& model = lctx.model; const auto& hparams = model.hparams; @@ -115,7 +117,9 @@ static bool falcon_model_eval_internal(model_context& lctx, const model_token* t struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N); ne_set_name(embd, "embd"); - memcpy(embd->data, tokens, N * ne_element_size(embd)); + for (int i = 0; i < batch_size; ++i) { + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); + } // wte struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); @@ -369,9 +373,8 @@ static bool falcon_model_eval_internal(model_context& lctx, const model_token* t return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!falcon_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!falcon_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/gptj/gptj.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/gptj/gptj.cpp index 847dc8c4b59..0e1750b6c45 100644 --- 
a/intel_extension_for_transformers/llm/runtime/graph/models/gptj/gptj.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/gptj/gptj.cpp @@ -45,18 +45,29 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // -static bool gptj_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool gptj_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { const int64_t t_start_us = ne_time_us(); const int batch_size = lctx.batch_size; // num of beams of all batches - const int N = n_tokens; - + MODEL_ASSERT(batch_size == n_input); + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; + const int beam_size = lctx.beam_search ? lctx.beam_size : 1; + std::vector block_ids; + std::vector n_padding; + bool no_padding = true; + for (int i = 0; i < batch_size; ++i) { + block_ids.push_back((inputs + i)->request_idx * beam_size + (inputs + i)->beam_idx); + n_padding.push_back((inputs + i)->n_padding); + if (no_padding && (inputs + i)->n_padding != 0) no_padding = false; + } const auto& model = lctx.model; const auto& hparams = model.hparams; @@ -132,7 +143,7 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N * batch_size); ne_set_name(embd, "embd"); for (int i = 0; i < batch_size; ++i) { - memcpy(static_cast(embd->data) + i * N, tokens + i * N, N * ne_element_size(embd)); + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); } #ifdef NE_TP_MODEL @@ -189,7 +200,10 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok std::vector Vcur_bs(batch_size); std::vector k_bs(batch_size); std::vector v_bs(batch_size); + // cache = [tokens, beams, requests, layers], + // tokens = [head_dim, head_num, n_ctx] (may different orders) for (int i = 0; i < batch_size; ++i) { + const int block_idx = block_ids[i]; if (run_mha_fp16) { // batch V Vcur_bs[i] = @@ -199,7 +213,7 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok v_bs[i] = ne_view_1d(ctx0, kv_self.v, head_size * n_head * N * 1, (ne_element_size(kv_self.v) * head_size * n_head) * (il * n_ctx * kv_n_ctx_block + n_past) + - i * n_ctx * head_size * n_head * ne_element_size(kv_self.v)); + block_idx * n_ctx * head_size * n_head * ne_element_size(kv_self.v)); // batch K Kcur_bs[i] = ne_permute( ctx0, @@ -208,21 +222,25 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok i * ne_element_size(Kcur) * head_size * n_head * N), head_size, n_head, N, 1), 1, 2, 0, 3); - k_bs[i] = ne_view_4d( - ctx0, kv_self.k, N, head_size, n_head, 1, n_ctx * ne_element_size(kv_self.k), - n_ctx * ne_element_size(kv_self.k) * head_size, n_ctx * ne_element_size(kv_self.k) * head_size * n_head, - ((il * n_ctx) * ne_element_size(kv_self.k) * head_size * n_head * kv_n_ctx_block + - i * n_ctx * head_size * n_head * ne_element_size(kv_self.k) + n_past * ne_element_size(kv_self.k))); + k_bs[i] = ne_view_4d(ctx0, 
kv_self.k, N, head_size, n_head, 1, n_ctx * ne_element_size(kv_self.k), + n_ctx * ne_element_size(kv_self.k) * head_size, + n_ctx * ne_element_size(kv_self.k) * head_size * n_head, + ((il * n_ctx) * ne_element_size(kv_self.k) * head_size * n_head * kv_n_ctx_block + + block_idx * n_ctx * head_size * n_head * ne_element_size(kv_self.k) + + n_past * ne_element_size(kv_self.k))); } else { // batch K - Kcur_bs[i] = - ne_view_4d(ctx0, Kcur, head_size, n_head, N, 1, ne_element_size(Kcur) * head_size, - ne_element_size(Kcur) * head_size * n_head, ne_element_size(Kcur) * head_size * n_head * N, - i * ne_element_size(Kcur) * head_size * n_head * N); + Kcur_bs[i] = ne_permute(ctx0, + ne_view_4d(ctx0, Kcur, head_size, n_head, N, 1, ne_element_size(Kcur) * head_size, + ne_element_size(Kcur) * n_embd, ne_element_size(Kcur) * n_embd * N, + i * ne_element_size(Kcur) * n_embd * N), + 0, 2, 1, 3); k_bs[i] = - ne_view_1d(ctx0, kv_self.k, head_size * n_head * N * 1, - (ne_element_size(kv_self.k) * head_size * n_head) * (il * n_ctx * kv_n_ctx_block + n_past) + - i * n_ctx * head_size * n_head * ne_element_size(kv_self.k)); + ne_view_4d(ctx0, kv_self.k, head_size, N, n_head, 1, ne_element_size(kv_self.k) * head_size, + ne_element_size(kv_self.k) * head_size * n_ctx, ne_element_size(kv_self.k) * n_embd * n_ctx, + ((il * n_ctx) * ne_element_size(kv_self.k) * n_embd * kv_n_ctx_block + + block_idx * n_ctx * n_embd * ne_element_size(kv_self.k) + + head_size * n_past * ne_element_size(kv_self.k))); // batch V Vcur_bs[i] = ne_permute( @@ -232,11 +250,12 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok i * ne_element_size(Vcur) * head_size * n_head * N), head_size, n_head, N, 1), 1, 2, 0, 3); - v_bs[i] = ne_view_4d( - ctx0, kv_self.v, N, head_size, n_head, 1, n_ctx * ne_element_size(kv_self.v), - n_ctx * ne_element_size(kv_self.v) * head_size, n_ctx * ne_element_size(kv_self.v) * head_size * n_head, - ((il * n_ctx) * ne_element_size(kv_self.v) * head_size * n_head * kv_n_ctx_block + - i * n_ctx * head_size * n_head * ne_element_size(kv_self.v) + n_past * ne_element_size(kv_self.v))); + v_bs[i] = ne_view_4d(ctx0, kv_self.v, N, head_size, n_head, 1, n_ctx * ne_element_size(kv_self.v), + n_ctx * ne_element_size(kv_self.v) * head_size, + n_ctx * ne_element_size(kv_self.v) * head_size * n_head, + ((il * n_ctx) * ne_element_size(kv_self.v) * head_size * n_head * kv_n_ctx_block + + block_idx * n_ctx * head_size * n_head * ne_element_size(kv_self.v) + + n_past * ne_element_size(kv_self.v))); } ne_build_forward_expand(&gf, ne_cpy(ctx0, Kcur_bs[i], k_bs[i])); ne_build_forward_expand(&gf, ne_cpy(ctx0, Vcur_bs[i], v_bs[i])); @@ -305,24 +324,19 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok K = ne_permute(ctx0, K, 0, 2, 1, 3); } } else { - K = ne_view_4d(ctx0, kv_self.k, head_size, n_head, n_cached, batch_size, ne_element_size(kv_self.k) * head_size, - ne_element_size(kv_self.k) * head_size * n_head, - ne_element_size(kv_self.k) * head_size * n_head * n_ctx, - il * n_ctx * ne_element_size(kv_self.k) * head_size * n_head * kv_n_ctx_block); + K = model_kv_cache_seq_concat(&gf, &lctx, ctx0, head_size, n_cached, n_head, batch_size, block_ids, il); if (is_ring_full) { + K = ne_permute(ctx0, K, 0, 2, 1, 3); struct ne_tensor* cossin_cache = nullptr; // Currently we only cache cossin for N == 1 in model-wide; It may be worthwhile to cache cossin for other N in // a single eval execution if (N == 1) cossin_cache = kv_self.cossin; K = ne_rope_shift_inplace(ctx0, K, -N, 
n_rot, 0, 0, n_keep, cossin_cache); + K = ne_permute(ctx0, K, 0, 2, 1, 3); } - K = ne_permute(ctx0, K, 0, 2, 1, 3); // split cached V into n_head heads - V = ne_view_4d(ctx0, kv_self.v, n_cached, head_size, n_head, batch_size, n_ctx * ne_element_size(kv_self.v), - n_ctx * ne_element_size(kv_self.v) * head_size, - n_ctx * ne_element_size(kv_self.v) * head_size * n_head, - il * n_ctx * ne_element_size(kv_self.v) * head_size * n_head * kv_n_ctx_block); + V = model_kv_cache_seq_concat(&gf, &lctx, ctx0, n_cached, head_size, n_head, batch_size, block_ids, il, false); } ne_set_name(K, "K"); ne_set_name(V, "V"); @@ -365,8 +379,8 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok ne_set_name(KQ_scaled, "KQ_scaled"); // KQ_scaled = mask_past(KQ_scaled) - if (n_total == 0 || !shift_roped_k) { - KQ_scaled = ne_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + if (n_total == 0 || !shift_roped_k || !no_padding) { + KQ_scaled = ne_diag_mask_inf_with_padding_inplace(ctx0, KQ_scaled, n_past, n_padding.data()); ne_set_name(KQ_scaled, "KQ_masked"); } @@ -483,14 +497,11 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok size_t bs_stride = n_vocab * N; if (lctx.logits_all) { logits_out.resize(n_vocab * N * batch_size); - - for (int i = 0; i < batch_size; ++i) { - memcpy(logits_out.data() + i * bs_stride, (float*)ne_get_data(inpL) + (i * bs_stride), - sizeof(float) * n_vocab * N); - } + memcpy(logits_out.data(), (float*)ne_get_data(inpL), sizeof(float) * n_vocab * N * batch_size); } else { // return result for just the last token logits_out.resize(n_vocab * batch_size); +#pragma omp parallel for for (int i = 0; i < batch_size; ++i) { memcpy(logits_out.data() + (i * n_vocab), (float*)ne_get_data(inpL) + (i * bs_stride) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); @@ -526,9 +537,8 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!gptj_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!gptj_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/gptneox/gptneox.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/gptneox/gptneox.cpp index f7e54a9bc83..9727cf6aabb 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/gptneox/gptneox.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/gptneox/gptneox.cpp @@ -66,21 +66,33 @@ struct ne_tensor* gpt_neox_ff(const model_layer& layer, const int batch_size, co // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // -static bool gptneox_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool gptneox_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { 
const int64_t t_start_us = ne_time_us(); - const int N = n_tokens; + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; const int batch_size = lctx.batch_size; const int kv_n_ctx_block = lctx.kv_n_ctx_block; + const int beam_size = lctx.beam_search ? lctx.beam_size : 1; + std::vector block_ids; + std::vector n_padding; + bool no_padding = true; + for (int i = 0; i < batch_size; ++i) { + block_ids.push_back((inputs + i)->request_idx * beam_size + (inputs + i)->beam_idx); + n_padding.push_back((inputs + i)->n_padding); + if (no_padding && (inputs + i)->n_padding != 0) no_padding = false; + } + const auto& model = lctx.model; const auto& hparams = model.hparams; @@ -141,7 +153,7 @@ static bool gptneox_model_eval_internal(model_context& lctx, const model_token* struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N * batch_size); ne_set_name(embd, "embd"); for (int i = 0; i < batch_size; ++i) { - memcpy(static_cast(embd->data) + i * N, tokens + i * N, N * ne_element_size(embd)); + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); } struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); @@ -185,17 +197,19 @@ static bool gptneox_model_eval_internal(model_context& lctx, const model_token* std::vector k_bs(batch_size); std::vector v_bs(batch_size); for (int i = 0; i < batch_size; ++i) { + const int block_idx = block_ids[i]; // batch K Kcur_bs[i] = ne_permute(ctx0, ne_view_4d(ctx0, Kcur, head_dim, n_head, N, 1, ne_element_size(Kcur) * head_dim, ne_element_size(Kcur) * n_embd, ne_element_size(Kcur) * n_embd * N, i * ne_element_size(Kcur) * n_embd * N), 0, 2, 1, 3); - k_bs[i] = ne_view_4d( - ctx0, kv_self.k, head_dim, N, n_head, 1, ne_element_size(kv_self.k) * head_dim, - ne_element_size(kv_self.k) * head_dim * n_ctx, ne_element_size(kv_self.k) * n_embd * n_ctx, - ((il * n_ctx) * ne_element_size(kv_self.k) * n_embd * kv_n_ctx_block + - i * n_ctx * n_embd * ne_element_size(kv_self.k) + head_dim * n_past * ne_element_size(kv_self.k))); + k_bs[i] = + ne_view_4d(ctx0, kv_self.k, head_dim, N, n_head, 1, ne_element_size(kv_self.k) * head_dim, + ne_element_size(kv_self.k) * head_dim * n_ctx, ne_element_size(kv_self.k) * n_embd * n_ctx, + ((il * n_ctx) * ne_element_size(kv_self.k) * n_embd * kv_n_ctx_block + + block_idx * n_ctx * n_embd * ne_element_size(kv_self.k) + + head_dim * n_past * ne_element_size(kv_self.k))); // batch V Vcur_bs[i] = ne_permute(ctx0, @@ -204,11 +218,11 @@ static bool gptneox_model_eval_internal(model_context& lctx, const model_token* i * ne_element_size(Vcur) * n_embd * N), head_dim, n_head, N, 1), 1, 2, 0, 3); - v_bs[i] = - ne_view_4d(ctx0, kv_self.v, N, head_dim, n_head, 1, n_ctx * ne_element_size(kv_self.v), - n_ctx * ne_element_size(kv_self.v) * head_dim, n_ctx * ne_element_size(kv_self.v) * n_embd, - ((il * n_ctx) * ne_element_size(kv_self.v) * n_embd * kv_n_ctx_block + - i * n_ctx * n_embd * ne_element_size(kv_self.v) + n_past * ne_element_size(kv_self.v))); + v_bs[i] = ne_view_4d( + ctx0, kv_self.v, N, head_dim, n_head, 1, n_ctx * ne_element_size(kv_self.v), + n_ctx * ne_element_size(kv_self.v) * head_dim, n_ctx * ne_element_size(kv_self.v) * n_embd, + ((il * n_ctx) * ne_element_size(kv_self.v) * n_embd * kv_n_ctx_block + + block_idx * n_ctx * n_embd * ne_element_size(kv_self.v) + n_past * ne_element_size(kv_self.v))); ne_build_forward_expand(&gf, ne_cpy(ctx0, Kcur_bs[i], k_bs[i])); ne_build_forward_expand(&gf, ne_cpy(ctx0, 
Vcur_bs[i], v_bs[i])); } @@ -218,9 +232,7 @@ static bool gptneox_model_eval_internal(model_context& lctx, const model_token* // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) struct ne_tensor* K = - ne_view_4d(ctx0, kv_self.k, head_dim, n_past + N, n_head, batch_size, ne_element_size(kv_self.k) * head_dim, - ne_element_size(kv_self.k) * head_dim * n_ctx, ne_element_size(kv_self.k) * n_embd * n_ctx, - il * n_ctx * ne_element_size(kv_self.k) * n_embd * kv_n_ctx_block); + model_kv_cache_seq_concat(&gf, &lctx, ctx0, head_dim, n_past + N, n_head, batch_size, block_ids, il); // K * Q struct ne_tensor* KQ = ne_mul_mat(ctx0, K, Q); @@ -229,16 +241,14 @@ static bool gptneox_model_eval_internal(model_context& lctx, const model_token* struct ne_tensor* KQ_scaled = ne_scale_inplace(ctx0, KQ, ne_new_f32(ctx0, 1.0f / sqrt(float(n_embd) / n_head))); // KQ_masked = mask_past(KQ_scaled) - struct ne_tensor* KQ_masked = ne_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + struct ne_tensor* KQ_masked = ne_diag_mask_inf_with_padding_inplace(ctx0, KQ_scaled, n_past, n_padding.data()); // KQ = soft_max(KQ_masked) struct ne_tensor* KQ_soft_max = ne_soft_max_inplace(ctx0, KQ_masked); // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() struct ne_tensor* V = - ne_view_4d(ctx0, kv_self.v, n_past + N, head_dim, n_head, batch_size, n_ctx * ne_element_size(kv_self.v), - n_ctx * ne_element_size(kv_self.v) * head_dim, n_ctx * ne_element_size(kv_self.v) * n_embd, - il * n_ctx * ne_element_size(kv_self.v) * n_embd * kv_n_ctx_block); + model_kv_cache_seq_concat(&gf, &lctx, ctx0, n_past + N, head_dim, n_head, batch_size, block_ids, il, false); // KQV = transpose(V) * KQ_soft_max struct ne_tensor* KQV = ne_mul_mat(ctx0, V, KQ_soft_max); @@ -366,13 +376,11 @@ static bool gptneox_model_eval_internal(model_context& lctx, const model_token* size_t bs_stride = n_vocab * N; if (lctx.logits_all) { logits_out.resize(n_vocab * N * batch_size); - for (int i = 0; i < batch_size; ++i) { - memcpy(logits_out.data() + i * bs_stride, (float*)ne_get_data(inpL) + (i * bs_stride), - sizeof(float) * n_vocab * N); - } + memcpy(logits_out.data(), (float*)ne_get_data(inpL), sizeof(float) * n_vocab * N * batch_size); } else { // return result for just the last token logits_out.resize(n_vocab * batch_size); +#pragma omp parallel for for (int i = 0; i < batch_size; ++i) { memcpy(logits_out.data() + (i * n_vocab), (float*)ne_get_data(inpL) + (i * bs_stride) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); @@ -408,9 +416,8 @@ static bool gptneox_model_eval_internal(model_context& lctx, const model_token* return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!gptneox_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!gptneox_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/gptneox/gptneox_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/gptneox/gptneox_utils.cpp index af78fd26f2c..c2d8527b634 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/gptneox/gptneox_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/gptneox/gptneox_utils.cpp @@ -39,6 +39,14 @@ #include 
"models/model_utils/util.h" #include "models/models.h" +void model_load_internal(const std::string& fname, model_archs arch, model_context& lctx, int n_gpu_layers, + bool use_mmap, bool use_mlock, bool vocab_only, model_progress_callback progress_callback, + void* progress_callback_user_data) { + std::unique_ptr ms(new GPTNEOX()); + ms->init(fname.c_str(), lctx, n_gpu_layers, use_mmap, use_mlock, vocab_only); + ms->load(lctx, progress_callback, progress_callback_user_data); +} + void GPTNEOX::init(const char* path_model, model_context& lctx, int n_gpu_layer_, bool use_mmap_, bool use_mlock_, bool vocab_only_) { n_gpu_layer = n_gpu_layer_; @@ -178,118 +186,3 @@ class gptneox_quant_layer : public quant_layer_base { } }; REGISTER_QUANT_LAYER_CLASS(gptneox); - -class gptneox_beam_search_kv_cache_reorder : public beam_search_kv_cache_reorder { - public: - explicit gptneox_beam_search_kv_cache_reorder(model_context* lctx) : beam_search_kv_cache_reorder(lctx) {} - ~gptneox_beam_search_kv_cache_reorder() {} - - virtual void update(const uint32_t& n_past, const uint32_t& n_prompt_tokens, - const std::vector>& kv_reorder_indices = {}, - const std::vector& next_beams = {}) override { - // TODO(Yi): use get_batch_kv_elements_from_gpt_params; - NE_ASSERT(ctx->model.kv_self.k->type != NE_TYPE_JBLAS); - // first step - if (n_past == n_prompt_tokens) { - // cpy batch 1 to all batches -#pragma omp parallel for collapse(3) - for (int i = 0; i < ctx->model.layers.size(); ++i) { // K - for (int j = 1; j < kv_n_ctx_block; ++j) { - // [head_dim, N, n_head] - for (int nh = 0; nh < n_head; ++nh) { - memcpy(static_cast(ctx->model.kv_self.k->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block + - j * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd) + - ne_element_size(ctx->model.kv_self.k) * nh * head_dim * n_ctx, - static_cast(ctx->model.kv_self.k->data) + - i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block + - ne_element_size(ctx->model.kv_self.k) * nh * head_dim * n_ctx, - ne_element_size(ctx->model.kv_self.k) * head_dim * n_prompt_tokens); - } - } - } -#pragma omp parallel for collapse(3) - for (int i = 0; i < ctx->model.layers.size(); ++i) { // V - for (int j = 1; j < kv_n_ctx_block; ++j) { - // [N, head_dim, n_head] or [N, n_embd] - for (int k = 0; k < n_embd; ++k) { - memcpy(static_cast(ctx->model.kv_self.v->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - j * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd + - n_ctx * k * ne_element_size(ctx->model.kv_self.v)), - static_cast(ctx->model.kv_self.v->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - n_ctx * k * ne_element_size(ctx->model.kv_self.v)), - ne_element_size(ctx->model.kv_self.v) * n_prompt_tokens); - } - } - } - } else if (n_past > n_prompt_tokens) { - // next setp - for (auto t : kv_reorder_indices) { - int cur_id = std::get<0>(t); - int cpy_id = std::get<1>(t); - if (cur_id != cpy_id) { - uint32_t len = next_beams[cur_id].token_ids.size() - 1; - // last token in beam is for next step inference - MODEL_ASSERT(len == n_past - n_prompt_tokens); - size_t input_token_offset_k = n_prompt_tokens * ne_element_size(ctx->model.kv_self.k) * head_dim; - size_t input_token_offset_v = n_prompt_tokens * ne_element_size(ctx->model.kv_self.v); - if (len + n_prompt_tokens > n_ctx) { - // all token hidden states cache should be updated - input_token_offset_k = 0; - input_token_offset_v = 0; - len = n_ctx; - } 
-#pragma omp parallel for collapse(2) - for (int i = 0; i < ctx->model.layers.size(); ++i) { // K - // [head_dim, N, n_head] - for (int nh = 0; nh < n_head; ++nh) { - memcpy(static_cast(ctx->model.kv_self.k->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block + - cur_id * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd) + - ne_element_size(ctx->model.kv_self.k) * nh * head_dim * n_ctx + input_token_offset_k, - static_cast(ctx->model.kv_self.k->data) + - i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block + - cpy_id * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd + - ne_element_size(ctx->model.kv_self.k) * nh * head_dim * n_ctx + input_token_offset_k, - ne_element_size(ctx->model.kv_self.k) * head_dim * len); - } - } -#pragma omp parallel for collapse(2) - for (int i = 0; i < ctx->model.layers.size(); ++i) { // V - // [N, head_dim, n_head] or [N, n_embd] - for (int k = 0; k < n_embd; ++k) { - memcpy(static_cast(ctx->model.kv_self.v->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - cur_id * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd + - n_ctx * ne_element_size(ctx->model.kv_self.v) * k + input_token_offset_v), - static_cast(ctx->model.kv_self.v->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - cpy_id * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd + - n_ctx * ne_element_size(ctx->model.kv_self.v) * k + input_token_offset_v), - ne_element_size(ctx->model.kv_self.v) * len); - } - } - } - } - } else { - return; - } - } -}; - -void model_load_internal(const std::string& fname, model_archs arch, model_context& lctx, int n_gpu_layers, - bool use_mmap, bool use_mlock, bool vocab_only, model_progress_callback progress_callback, - void* progress_callback_user_data) { - std::unique_ptr ms(new GPTNEOX()); - ms->init(fname.c_str(), lctx, n_gpu_layers, use_mmap, use_mlock, vocab_only); - ms->load(lctx, progress_callback, progress_callback_user_data); - lctx.support_jblas_kv = true; - if (lctx.beam_search) { - lctx.bs_kv_reorder = std::make_shared(&lctx); -#ifdef NE_BEAM_SEARCH_VERBOSE_ON - printf("get GPTNEOX beam search kv cache update function. 
\n"); -#endif - } -} diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/llama/llama.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/llama/llama.cpp index 40ac7e9cd58..0a54d34703a 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/llama/llama.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/llama/llama.cpp @@ -45,22 +45,26 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // -static bool llama_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool llama_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; // enforce that the first token is BOS - if (n_total == 0 && tokens[0] != lctx.vocab.bos_token_id) { + if (n_total == 0 && inputs->tokens[0] != lctx.vocab.bos_token_id) { fprintf(stderr, "%s: first token must be BOS\n", __func__); return false; } - const int64_t t_start_us = ne_time_us(); + const int batch_size = lctx.batch_size; + MODEL_ASSERT(batch_size == n_input); - const int N = n_tokens; + const int64_t t_start_us = ne_time_us(); const auto& model = lctx.model; const auto& hparams = model.hparams; @@ -139,7 +143,9 @@ static bool llama_model_eval_internal(model_context& lctx, const model_token* to struct ne_tensor* embd = ne_new_tensor_1d(ctx0, NE_TYPE_I32, N, NE_SIZE_CALC); ne_set_name(embd, "embd"); - memcpy(embd->data, tokens, N * ne_element_size(embd)); + for (int i = 0; i < batch_size; ++i) { + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); + } #ifdef NE_TP_MODEL if (enable_tp) { @@ -439,9 +445,8 @@ static bool llama_model_eval_internal(model_context& lctx, const model_token* to return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!llama_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!llama_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h index 2c0672dddd4..51a645d2e88 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h @@ -151,6 +151,20 @@ struct model_layer { struct ne_tensor* v_cache; }; +typedef int32_t model_pos; +typedef int32_t model_seq_id; + +struct kv_token_cell { + model_pos pos = -1; // token idx (for rope) + model_pos delta = 0; // token shift delta (pos += delta) +}; + +struct kv_seq_cell { + std::vector token_cells; + model_seq_id seq_id = -1; + bool empty = true; +}; + struct model_kv_cache { struct ne_tensor* k = NULL; struct 
ne_tensor* v = NULL; @@ -162,6 +176,9 @@ struct model_kv_cache { int n; // number of tokens currently in the cache + bool has_shift = false; // ring-buffer (for too long text generation like streaming-llm) + std::vector seq_cells; + ~model_kv_cache() { if (ctx) { ne_free(ctx); @@ -258,6 +275,13 @@ struct model_context { model_struct model; model_vocab vocab; + // maximum num of bearable requests in current env + int max_request_bs = 32; // TODO + // num of current execution prompts + int request_running_bs = 1; + // length of current execution tokens list + // first token (prefill) generation is equal to `request_running_bs` + // next tokens (decoding) generation may be larger than `request_running_bs`(for example, beam search) int batch_size = 1; bool beam_search = false; bool shift_roped_k = false; // whether to store non-RoPEd K cache @@ -341,6 +365,30 @@ typedef struct model_token_data_array { typedef void (*model_progress_callback)(float progress, void* ctx); +struct model_input { + // embd or next token + const model_token* tokens = nullptr; + // tokens length + uint32_t n_tokens = 0; + // prompt length + uint32_t n_prompt_tokens = 0; + // kv cache n_past + uint32_t n_past = 0; + // text tokens length (prompt + all next tokens) + // the number of tokens evaluated so far (including evicted tokens if there is any) + uint32_t n_total = 0; + // request id + int request_idx = -1; + // beam id in beam search + int beam_idx = 0; + // padding related, attention mask + // (0: left, 1: right) + // only support padding left in decoder only model + int padding_side = 0; + // padding length + uint32_t n_padding = 0; +}; + struct model_context_params { model_archs arch; // arch of models (GPT-J, LLAMA) int n_ctx; // text context diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp index ac920f0bc2a..268f9dcb06c 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp @@ -77,6 +77,10 @@ static bool kv_cache_init(const struct model_hparams& hparams, struct model_kv_c #endif cache.buf.resize(n_layer * (layer_ne_k + layer_ne_v) * wsize + 2u * MB); + cache.seq_cells.resize(batch_size * beam_size); + for (int i = 0; i < cache.seq_cells.size(); ++i) { + cache.seq_cells[i].token_cells.resize(n_ctx); + } struct ne_init_params params; params.mem_size = cache.buf.size; @@ -2039,11 +2043,127 @@ std::vector>& model_internal_get_tenso return ctx->model.tensors_by_name; } +static void ne_model_kv_cache_seq_cpy(struct model_context* ctx, const model_seq_id& seq_id_src, + const model_seq_id& seq_id_dst, const model_pos& p0, const model_pos& p1) { + const uint32_t kv_n_ctx_block = ctx->kv_n_ctx_block; + const uint32_t n_head = ctx->model.hparams.n_head_kv > 0 ? 
ctx->model.hparams.n_head_kv : ctx->model.hparams.n_head; + const uint32_t head_dim = ctx->model.hparams.n_embd / ctx->model.hparams.n_head; + const uint32_t n_embd = n_head * head_dim; + const uint32_t n_ctx = ctx->n_ctx; + const size_t k_elem_size = ne_element_size(ctx->model.kv_self.k); + const size_t v_elem_size = ne_element_size(ctx->model.kv_self.v); +#pragma omp parallel for collapse(2) + for (int i = 0; i < ctx->model.layers.size(); ++i) { // K + // [head_dim, N, n_head] + for (int nh = 0; nh < n_head; ++nh) { + memcpy(static_cast(ctx->model.kv_self.k->data) + i * n_ctx * k_elem_size * n_embd * kv_n_ctx_block + + seq_id_dst * n_ctx * k_elem_size * n_embd + k_elem_size * nh * head_dim * n_ctx + + p0 * k_elem_size * head_dim, + static_cast(ctx->model.kv_self.k->data) + i * n_ctx * k_elem_size * n_embd * kv_n_ctx_block + + seq_id_src * n_ctx * k_elem_size * n_embd + k_elem_size * nh * head_dim * n_ctx + + p0 * k_elem_size * head_dim, + k_elem_size * head_dim * (p1 - p0 + 1)); + } + } +#pragma omp parallel for collapse(2) + for (int i = 0; i < ctx->model.layers.size(); ++i) { // V + // [N, head_dim, n_head] or [N, n_embd] + for (int nm = 0; nm < n_embd; ++nm) { + memcpy(static_cast(ctx->model.kv_self.v->data) + i * n_ctx * v_elem_size * n_embd * kv_n_ctx_block + + seq_id_dst * n_ctx * v_elem_size * n_embd + n_ctx * nm * v_elem_size + p0 * v_elem_size, + static_cast(ctx->model.kv_self.v->data) + i * n_ctx * v_elem_size * n_embd * kv_n_ctx_block + + seq_id_src * n_ctx * v_elem_size * n_embd + n_ctx * nm * v_elem_size + p0 * v_elem_size, + v_elem_size * (p1 - p0 + 1)); + } + } +} + +void model_kv_cache_seq_cpy(struct model_context* ctx, const model_seq_id& seq_id_src, const model_seq_id& seq_id_dst, + const model_pos& p0, const model_pos& p1) { + if (ctx->model.kv_self.k->type != NE_TYPE_JBLAS) { + ne_model_kv_cache_seq_cpy(ctx, seq_id_src, seq_id_dst, p0, p1); + } else { + return; + // jblas_model_kv_cache_seq_cpy(ctx, seq_id_src, seq_id_dst, p0, p1); + } +} + +static ne_tensor* ne_model_kv_cache_seq_concat(struct ne_cgraph* cgraph, struct model_context* moctx, + struct ne_context* nectx, const int64_t& ne0, const int64_t& ne1, + const int64_t& ne2, const int64_t& ne3, + const std::vector& block_ids, const int& layer_idx, + const bool& concat_k) { + MODEL_ASSERT(ne3 == block_ids.size()); // moctx->batch_size + struct ne_tensor* cache = concat_k ? moctx->model.kv_self.k : moctx->model.kv_self.v; + // K = [head_dim, n_past+N, n_head, batch_size] + // V = [N_past+N, head_dim, n_head, batch_size] + const uint32_t n_embd_kv = concat_k ? ne0 * ne2 : ne1 * ne2; + struct ne_tensor* dst = nullptr; + if (concat_k) { + MODEL_ASSERT(ne1 <= moctx->n_ctx); + } else { + MODEL_ASSERT(ne0 <= moctx->n_ctx); + } + const size_t elem_size = ne_element_size(cache); + const size_t nb1 = concat_k ? elem_size * ne0 : elem_size * moctx->n_ctx; + const size_t nb2 = concat_k ? 
nb1 * moctx->n_ctx : nb1 * ne1; + const size_t nb3 = nb2 * ne2; + int cont_bs = 1; + int start_idx = block_ids[0]; + int id = 1; + size_t dst_off = 0; + while (id < block_ids.size()) { + if (block_ids[id] - block_ids[id - 1] <= 1) { + cont_bs++; + id++; + continue; + } else { + if (dst == nullptr) { + dst = ne_new_tensor_4d(nectx, cache->type, ne0, ne1, ne2, ne3, NE_SIZE_CALC); + } + struct ne_tensor* dst_i = ne_view_4d(nectx, dst, ne0, ne1, ne2, cont_bs, elem_size * ne0, elem_size * ne0 * ne1, + elem_size * ne0 * ne1 * ne2, dst_off); + dst_off += elem_size * ne0 * ne1 * ne2 * cont_bs; + size_t off = layer_idx * moctx->n_ctx * elem_size * n_embd_kv * moctx->kv_n_ctx_block + + start_idx * moctx->n_ctx * elem_size * n_embd_kv; + ne_build_forward_expand( + cgraph, ne_cpy(nectx, ne_view_4d(nectx, cache, ne0, ne1, ne2, cont_bs, nb1, nb2, nb3, off), dst_i)); + start_idx = block_ids[id]; + cont_bs = 1; + id++; + } + } + + size_t off = layer_idx * moctx->n_ctx * elem_size * n_embd_kv * moctx->kv_n_ctx_block + + start_idx * moctx->n_ctx * elem_size * n_embd_kv; + if (start_idx == block_ids[0]) { + // continuous among all batch tokens + return ne_view_4d(nectx, cache, ne0, ne1, ne2, ne3, nb1, nb2, nb3, off); + } else { + // last cont batch + struct ne_tensor* dst_i = ne_view_4d(nectx, dst, ne0, ne1, ne2, cont_bs, elem_size * ne0, elem_size * ne0 * ne1, + elem_size * ne0 * ne1 * ne2, dst_off); + ne_build_forward_expand(cgraph, + ne_cpy(nectx, ne_view_4d(nectx, cache, ne0, ne1, ne2, cont_bs, nb1, nb2, nb3, off), dst_i)); + return dst; + } +} + +ne_tensor* model_kv_cache_seq_concat(struct ne_cgraph* cgraph, struct model_context* moctx, struct ne_context* nectx, + const int64_t& ne0, const int64_t& ne1, const int64_t& ne2, const int64_t& ne3, + const std::vector& block_ids, const int& layer_idx, const bool& concat_k) { + if (moctx->model.kv_self.k->type != NE_TYPE_JBLAS) { + return ne_model_kv_cache_seq_concat(cgraph, moctx, nectx, ne0, ne1, ne2, ne3, block_ids, layer_idx, concat_k); + } else { + return nullptr; // jblas + } +} + // beam search // A struct for calculating logits-related info. 
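A minimal sketch of the flat-buffer indexing that ne_model_kv_cache_seq_cpy relies on, assuming the K-cache layout stated above (per layer, kv_n_ctx_block sequence blocks of shape [head_dim, n_ctx, n_head]). The helper name k_cache_offset and its parameter list are illustrative only; they are not part of the runtime API.

#include <cstddef>
#include <cstdint>

size_t k_cache_offset(uint32_t layer, uint32_t seq_id, uint32_t head, uint32_t pos,
                      uint32_t n_ctx, uint32_t n_embd, uint32_t head_dim,
                      uint32_t kv_n_ctx_block, size_t elem_size) {
  const size_t layer_stride = (size_t)n_ctx * n_embd * kv_n_ctx_block * elem_size;  // all sequence blocks of one layer
  const size_t seq_stride   = (size_t)n_ctx * n_embd * elem_size;                   // one sequence block
  const size_t head_stride  = (size_t)head_dim * n_ctx * elem_size;                 // one head inside a block
  const size_t pos_stride   = (size_t)head_dim * elem_size;                         // one token of one head
  return layer * layer_stride + seq_id * seq_stride + head * head_stride + pos * pos_stride;
}

Copying tokens p0 through p1 of one head from seq_id_src to seq_id_dst is then a single memcpy of (p1 - p0 + 1) * head_dim * elem_size bytes per (layer, head) pair, which is what the OpenMP loops above do; the V cache follows the same scheme except that each sequence block stores n_embd rows of length n_ctx and is copied row by row.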
struct logits_info { const model_context* const ctx = nullptr; - // (batch, seq_len * vocab_size) batch = input_prompt_bs* beam_size + // [vocab_size * seq_len * batch] batch = input_prompt_bs* beam_size const float* const logits = nullptr; const int batch_size; const int32_t n_vocab; @@ -2116,117 +2236,96 @@ struct logits_info { } }; -void logits_processor::min_new_tokens_logits_process(const uint32_t& cur_len, const model_vocab::id& eos_token_id) { +void logits_processor::min_new_tokens_logits_process(const std::vector& cur_lens, + const model_vocab::id& eos_token_id) { MODEL_ASSERT(ctx->generation_conf.min_new_tokens >= 0); - if (ctx->generation_conf.min_new_tokens == 0 || ctx->generation_conf.min_new_tokens <= cur_len) { + if (ctx->generation_conf.min_new_tokens == 0) { return; - } else { - int batch_size = ctx->batch_size; - size_t offset = ctx->logits.size() / ctx->batch_size - ctx->model.hparams.n_vocab; - size_t bs_stride = ctx->logits.size() / ctx->batch_size; - for (int i = 0; i < batch_size; ++i) { - // forbidden to choose eos_token if cur_len < min_new_tokens - *(model_get_logits(ctx) + i * bs_stride + offset + eos_token_id) = NEG_INF; + } + int batch_size = ctx->batch_size; + MODEL_ASSERT(batch_size == cur_lens.size()); + size_t offset = ctx->logits.size() / ctx->batch_size - ctx->model.hparams.n_vocab; + size_t bs_stride = ctx->logits.size() / ctx->batch_size; + for (int i = 0; i < batch_size; ++i) { + if (ctx->generation_conf.min_new_tokens <= cur_lens[i]) { + continue; } + // forbidden to choose eos_token if cur_len < min_new_tokens + *(model_get_logits(ctx) + i * bs_stride + offset + eos_token_id) = NEG_INF; } } -void logits_processor::process(const uint32_t& cur_len, const model_vocab::id& eos_token_id) { +void logits_processor::process(const std::vector& cur_lens, const model_vocab::id& eos_token_id) { MODEL_ASSERT(model_get_logits(ctx) != nullptr); if (min_new_tokens > 0) { - min_new_tokens_logits_process(cur_len, eos_token_id); + min_new_tokens_logits_process(cur_lens, eos_token_id); } } -// TODO dispatch JBLAS kv cache manager -void beam_search_kv_cache_reorder::update(const uint32_t& n_past, const uint32_t& n_prompt_tokens, +void beam_search_kv_cache_reorder::update(const std::vector& n_past, + const std::vector& n_prompt_tokens, + const std::vector request_running_indices, const std::vector>& kv_reorder_indices, const std::vector& next_beams) { - // TODO(Yi): use get_batch_kv_elements_from_gpt_params; - NE_ASSERT(ctx->model.kv_self.k->type != NE_TYPE_JBLAS); - // first step - if (n_past == n_prompt_tokens) { - // cpy batch 1 to all batches -#pragma omp parallel for collapse(2) - for (int i = 0; i < ctx->model.layers.size(); ++i) { // K - for (int j = 1; j < kv_n_ctx_block; ++j) { - // [n_embd, N] - memcpy(static_cast(ctx->model.kv_self.k->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block + - j * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd), - static_cast(ctx->model.kv_self.k->data) + - i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block, - ne_element_size(ctx->model.kv_self.k) * n_embd * n_prompt_tokens); - } - } -#pragma omp parallel for collapse(3) - for (int i = 0; i < ctx->model.layers.size(); ++i) { // V - for (int j = 1; j < kv_n_ctx_block; ++j) { - // [N, n_embd] - for (int k = 0; k < n_embd; ++k) { - memcpy(static_cast(ctx->model.kv_self.v->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - j * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd 
+ - n_ctx * k * ne_element_size(ctx->model.kv_self.v)), - static_cast(ctx->model.kv_self.v->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - n_ctx * k * ne_element_size(ctx->model.kv_self.v)), - ne_element_size(ctx->model.kv_self.v) * n_prompt_tokens); - } + // TODO beam search unsupport shift kv cache when prompt + new_tokens > nctx + if (ctx->model.kv_self.has_shift) { + fprintf(stderr, "%s: error: unimplement shifted kv cache update\n", __func__); + return; + } +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("start to update kv cache for next step...\n"); +#endif + MODEL_ASSERT(request_running_indices.size() == ctx->request_running_bs); + if (!kv_reorder_indices.empty()) MODEL_ASSERT(kv_reorder_indices.size() == ctx->request_running_bs * ctx->beam_size); + int count = 0; + for (int rb = 0; rb < ctx->request_running_bs; ++rb) { + const int request_idx = request_running_indices[rb]; + const uint32_t off = ctx->beam_size * request_idx; + const uint32_t cur_n_past = n_past[request_idx]; + const uint32_t cur_n_prompt_tokens = n_prompt_tokens[request_idx]; +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("------request_idx: %d, n_past: %d, n_prompt_tokens: %d, offset: %d------ \n", request_idx, cur_n_past, + cur_n_prompt_tokens, off); +#endif + // first step + if (cur_n_past == cur_n_prompt_tokens) { +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("copy beam 1 first token to the left beams\n"); +#endif + for (int b = 1; b < ctx->beam_size; ++b) { + model_kv_cache_seq_cpy(ctx, 0 + off, b + off, 0, cur_n_prompt_tokens); } - } - } else if (n_past > n_prompt_tokens) { - // next setp - for (auto t : kv_reorder_indices) { - int cur_id = std::get<0>(t); - int cpy_id = std::get<1>(t); - if (cur_id != cpy_id) { - uint32_t len = next_beams[cur_id].token_ids.size() - 1; - // last token in beam is for next step inference - MODEL_ASSERT(len == n_past - n_prompt_tokens); - size_t input_token_offset_k = n_prompt_tokens * ne_element_size(ctx->model.kv_self.k) * n_embd; - size_t input_token_offset_v = n_prompt_tokens * ne_element_size(ctx->model.kv_self.v); - if (len + n_prompt_tokens > n_ctx) { + } else if (cur_n_past > cur_n_prompt_tokens) { + // next setp + for (int t = 0; t < ctx->beam_size; ++t) { + int cur_id = std::get<0>(kv_reorder_indices[count]); + int cpy_id = std::get<1>(kv_reorder_indices[count]); + count++; + if (cur_id == cpy_id) continue; + model_pos p0 = cur_n_prompt_tokens; + model_pos p1 = cur_n_past; + // TODO too long text + if (cur_n_past > n_ctx) { // all token hidden states cache should be updated - input_token_offset_k = 0; - input_token_offset_v = 0; - len = n_ctx; - } -#pragma omp parallel for - for (int i = 0; i < ctx->model.layers.size(); ++i) { // K - // [n_embd, N] - memcpy(static_cast(ctx->model.kv_self.k->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block + - cur_id * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd) + - input_token_offset_k, - static_cast(ctx->model.kv_self.k->data) + - i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block + - cpy_id * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd + input_token_offset_k, - ne_element_size(ctx->model.kv_self.k) * n_embd * len); - } -#pragma omp parallel for collapse(2) - for (int i = 0; i < ctx->model.layers.size(); ++i) { // V - // [N, n_embd] - for (int k = 0; k < n_embd; ++k) { - memcpy(static_cast(ctx->model.kv_self.v->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - cur_id * n_ctx * 
ne_element_size(ctx->model.kv_self.v) * n_embd + - n_ctx * ne_element_size(ctx->model.kv_self.v) * k + input_token_offset_v), - static_cast(ctx->model.kv_self.v->data) + - (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - cpy_id * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd + - n_ctx * ne_element_size(ctx->model.kv_self.v) * k + input_token_offset_v), - ne_element_size(ctx->model.kv_self.v) * len); - } + p0 = 0; + p1 = n_ctx; } +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("copy beam %d to beam %d in pos [%d, %d] \n", cpy_id, cur_id, p0, p1); +#endif + model_kv_cache_seq_cpy(ctx, cpy_id + off, cur_id + off, p0, p1); } + } else { + fprintf(stderr, "%s: error: unable to update kv cache\n", __func__); + return; } - } else { - return; } +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); +#endif } -// Return top k token_data by score. (prompt_bs * sample_scale * num_beam) +// Return top k token_data by score. (request_running_bs * sample_scale * num_beam) // each beam gives top_k results --> + prev_scores --> from (num_beam * top_k) sort num_beam // for example, huggingface transformers repo implements like this: // log_softmax(num_beam*n_vocab) -- > + prev_scores --> sort num_beam @@ -2234,19 +2333,24 @@ void beam_search_kv_cache_reorder::update(const uint32_t& n_past, const uint32_t // we sample top_k logits for each beam, than compute scores in these logits positions // then we sample top_k results among all beams. // this approach will accelerate sampling speed by log_softmax times reduction -std::vector beam_search_flow::beam_top_k_next_tokens(model_context* ctx, const uint32_t& cur_len, +std::vector beam_search_flow::beam_top_k_next_tokens(model_context* ctx, const std::vector& beams_score, const std::vector& num_beams, const std::vector beam_indices, const int& sample_scale, const int& dim) { - MODEL_ASSERT(dim == -1); // raise unimplemented error - const int request_bs = 1; // TODO ctx->request_running_num + MODEL_ASSERT(dim == -1); // raise unimplemented error + // TODO different requests may have different num_beams (ctx->beam_size >= num_beams[i])? 
+ const int request_running_bs = ctx->request_running_bs; logits_info li(ctx); - lp.process(cur_len, ctx->vocab.eos_token_id); - const int raw_k = sample_scale * beam_size; + std::vector cur_lens; + for (int i = 0; i < ctx->batch_size; ++i) { + cur_lens.push_back(cur_beams[i].token_ids.size()); + } + lp.process(cur_lens, ctx->vocab.eos_token_id); + const int raw_k = sample_scale * (*std::max_element(num_beams.begin(), num_beams.end())); // raw logits top_k std::vector> raw_top_k = li.vocab_top_k(raw_k); - MODEL_ASSERT(raw_top_k.size() == ctx->batch_size); // request_bs * num_beam + MODEL_ASSERT(raw_top_k.size() == ctx->batch_size); // request_running_bs * num_beam MODEL_ASSERT(raw_top_k[0].size() == raw_k); MODEL_ASSERT(beams_score.size() == ctx->batch_size); // compute score: log_softmax + prev_score @@ -2255,18 +2359,17 @@ std::vector beam_search_flow::beam_top_k_next_tokens(model_cont std::for_each(raw_top_k[i].begin(), raw_top_k[i].end(), [&](beam_next_token& r) { r.score = li.log_probability_from_logit(i, r.score) + beams_score[i]; }); } - MODEL_ASSERT(num_beams.size() == request_bs); + MODEL_ASSERT(num_beams.size() == request_running_bs); std::vector res; res.reserve(sample_scale * std::accumulate(num_beams.begin(), num_beams.end(), 0)); - std::vector min_heap; const uint32_t n_vocab = ctx->model.hparams.n_vocab; size_t row_off = 0; auto comp = [](const beam_next_token& a, const beam_next_token& b) { return a.score > b.score; }; - for (int i = 0; i < request_bs; ++i) { + for (int i = 0; i < request_running_bs; ++i) { const int num_beam = num_beams[i]; const int sample_k = sample_scale * num_beam; MODEL_ASSERT(raw_k >= sample_k); - min_heap.clear(); + std::vector min_heap; min_heap.reserve(sample_k); for (int j = 0; j < num_beam; ++j) { int n = 0; @@ -2289,7 +2392,7 @@ std::vector beam_search_flow::beam_top_k_next_tokens(model_cont } } } - row_off += i * num_beam; + row_off += num_beam; std::sort(min_heap.begin(), min_heap.end(), [](const beam_next_token& a, const beam_next_token& b) { return a.score > b.score; }); for (const auto b : min_heap) { @@ -2299,84 +2402,108 @@ std::vector beam_search_flow::beam_top_k_next_tokens(model_cont return res; } -// TODO debug info unify (function ptr?) 
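For readers following beam_top_k_next_tokens, here is a self-contained sketch of the two-stage top-k it implements: per-beam top-k on the raw logits, log-softmax plus the accumulated beam score computed only at those positions, then a global top-k per request. The names top_k_two_stage and Candidate are illustrative; the sketch operates on a plain [num_beam x n_vocab] logits array rather than the model context.

#include <algorithm>
#include <cmath>
#include <vector>

struct Candidate {
  int token_id;
  int beam_idx;
  float score;
};

// logits: [num_beam x n_vocab]; beam_scores: accumulated log-prob per beam.
std::vector<Candidate> top_k_two_stage(const float* logits, int num_beam, int n_vocab,
                                       const std::vector<float>& beam_scores, int sample_scale = 2) {
  const int k = sample_scale * num_beam;  // assumes k <= n_vocab
  std::vector<Candidate> merged;
  merged.reserve(static_cast<size_t>(k) * num_beam);
  for (int b = 0; b < num_beam; ++b) {
    const float* row = logits + static_cast<size_t>(b) * n_vocab;
    // stage 1: indices of the k largest raw logits in this beam's row
    std::vector<int> ids(n_vocab);
    for (int v = 0; v < n_vocab; ++v) ids[v] = v;
    std::partial_sort(ids.begin(), ids.begin() + k, ids.end(),
                      [&](int a, int c) { return row[a] > row[c]; });
    // log_softmax needs only the row max and log-sum-exp, computed once per beam
    const float mx = *std::max_element(row, row + n_vocab);
    float sum = 0.0f;
    for (int v = 0; v < n_vocab; ++v) sum += std::exp(row[v] - mx);
    const float lse = mx + std::log(sum);
    for (int j = 0; j < k; ++j) {
      merged.push_back({ids[j], b, row[ids[j]] - lse + beam_scores[b]});
    }
  }
  // stage 2: keep the best k candidates across all beams of this request
  std::partial_sort(merged.begin(), merged.begin() + k, merged.end(),
                    [](const Candidate& a, const Candidate& c) { return a.score > c.score; });
  merged.resize(k);
  return merged;
}

This mirrors the "each beam gives top_k results, add previous scores, then sort num_beam" flow described in the comment above while only evaluating log-softmax terms at the sampled positions.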
void beam_search_flow::fill_next_beams_by_top_scores() { auto const comp = [](const beam& a, const beam& b) { return a.score > b.score; }; - std::vector embd_inp; - int record = 0; + std::vector next_inputs; int batch_size = 0; - uint32_t cur_len = 0; + request_running_indices.clear(); std::vector beam_indices; std::vector beams_score; - for (int i = 0; i < beam_size; ++i) { - MODEL_ASSERT(!cur_beams[i].eos()); - if (cur_len != 0) { - MODEL_ASSERT(cur_len == cur_beams[i].token_ids.size()); + // filter cur_beams + for (int i = 0; i < cur_beams.size(); ++i) { + if (cur_beams[i].done) continue; + if (request_running_indices.empty()) { + request_running_indices.push_back(cur_beams[i].request_idx); } else { - cur_len = cur_beams[i].token_ids.size(); + if (request_running_indices.back() != cur_beams[i].request_idx) { + request_running_indices.push_back(cur_beams[i].request_idx); + } } // (batch, 1) - // ordered by infer_bs_id - embd_inp.push_back(cur_beams[i].token_ids.back()); + // ordered by request_idx + next_inputs.push_back(model_input{ + /*.tokens =*/&cur_beams[i].token_ids.back(), + /*.n_tokens =*/1, + /*.n_prompt_tokens =*/n_prompt_tokens[request_running_indices.back()], + /*.n_past =*/n_past[request_running_indices.back()], + /*.n_total =*/n_total[request_running_indices.back()], + /*.request_idx =*/request_running_indices.back(), + /*.beam_idx =*/cur_beams[i].beam_idx, + /*.padding_side =*/padding_side[request_running_indices.back()], + /*n_padding =*/n_padding[request_running_indices.back()], + }); batch_size++; - beam_indices.push_back(i); + beam_indices.push_back(cur_beams[i].beam_idx); beams_score.push_back(cur_beams[i].score); } + MODEL_ASSERT(request_running_indices.size() * beam_size == batch_size); + ctx->batch_size = batch_size; + ctx->request_running_bs = request_running_indices.size(); // DEBUG #ifdef NE_BEAM_SEARCH_VERBOSE_ON printf("========================================================================================= \n"); printf("next_tokens for inference: \n"); - for (auto kk : embd_inp) { + printf("request_running_bs: %d, batch_size for inference: %d\n", ctx->request_running_bs, ctx->batch_size); + for (int k = 0; k < next_inputs.size(); ++k) { + model_token kk = *(next_inputs[k].tokens); printf("%d: %s \n", kk, (ctx->vocab.id_to_token.at(kk).tok).c_str()); } printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); #endif - ctx->batch_size = batch_size; - int n_tokens = 1; - model_eval(ctx, embd_inp.data(), n_tokens, n_past, n_total, num_threads); + model_eval(ctx, next_inputs.data(), next_inputs.size(), num_threads); const int sample_scale = 2; + std::vector num_beams(ctx->request_running_bs, beam_size); std::vector next_tokens = - beam_top_k_next_tokens(ctx, cur_len, beams_score, {batch_size}, beam_indices, sample_scale); - + beam_top_k_next_tokens(ctx, beams_score, num_beams, beam_indices, sample_scale); + MODEL_ASSERT(next_tokens.size() == batch_size * sample_scale); // request_running_bs * beam_size * sample_scale // DEBUG #ifdef NE_BEAM_SEARCH_VERBOSE_ON printf("top_k next_tokens: \n"); - for (auto kk : next_tokens) { - printf("%d: %s, score: %10.6f, beam_idx: %d \n", kk.id, (ctx->vocab.id_to_token.at(kk.id).tok).c_str(), kk.score, - kk.beam_idx); + int bb = 0; + for (int kk = 0; kk < next_tokens.size(); ++kk) { + if (kk % (beam_size * sample_scale) == 0) printf("------batch_%d------\n", bb++); + printf("%d: %s, score: %10.6f, beam_idx: %d \n", next_tokens[kk].id, + 
(ctx->vocab.id_to_token.at(next_tokens[kk].id).tok).c_str(), next_tokens[kk].score, + next_tokens[kk].beam_idx); } printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); #endif - MODEL_ASSERT(next_tokens.size() == batch_size * sample_scale); - MODEL_ASSERT(next_beams.empty()); - for (int i = 0; i < next_tokens.size(); ++i) { - if (next_tokens[i].id == ctx->vocab.eos_token_id) { - // if beam_token does not belong to top num_beams tokens, it should not be added - bool is_beam_token_worse_than_top_num_beams = i >= beam_size ? true : false; - if (is_beam_token_worse_than_top_num_beams) { - continue; + + const int rb_off = beam_size * sample_scale; + for (int rb = 0; rb < request_running_indices.size(); ++rb) { + int record_push = 0; + for (int nt = 0; nt < rb_off; ++nt) { + int i = rb * rb_off + nt; + int cb_off = next_tokens[i].beam_idx + request_running_indices[rb] * beam_size; + if (next_tokens[i].id == ctx->vocab.eos_token_id) { + // if beam_token does not belong to top num_beams tokens, it should not be added + bool is_beam_token_worse_than_top_num_beams = nt >= beam_size ? true : false; + if (is_beam_token_worse_than_top_num_beams) continue; + // update score with eos next token + cur_beams[cb_off].score = next_tokens[i].score; + beam_hypos[request_running_indices[rb]].add(cur_beams[cb_off], n_prompt_tokens[rb]); + } else { + beam next_beam = cur_beams[cb_off]; + next_beam.token_ids.push_back(next_tokens[i].id); + next_beam.score = next_tokens[i].score; + next_beams[request_running_indices[rb] * beam_size + record_push] = std::move(next_beam); + record_push++; + } + if (record_push == beam_size) { + // sort by beam_idx rather than top-k for kv cache updating + std::sort(next_beams.begin() + request_running_indices[rb] * beam_size, + next_beams.begin() + (request_running_indices[rb] + 1) * beam_size, + [](beam& a, beam& b) { return a.beam_idx < b.beam_idx; }); + break; } - // update score with eos next token - cur_beams[next_tokens[i].beam_idx].score = next_tokens[i].score; - beam_hypos[0].add(cur_beams[next_tokens[i].beam_idx], n_prompt_tokens); - } else { - beam next_beam = cur_beams[next_tokens[i].beam_idx]; - next_beam.token_ids.push_back(next_tokens[i].id); - next_beam.score = next_tokens[i].score; - next_beams.push_back(std::move(next_beam)); - } - if (next_beams.size() == beam_size) { - break; } } - - std::sort(next_beams.begin(), next_beams.end(), [](beam& a, beam& b) { return a.infer_bs_id < b.infer_bs_id; }); } // get kv cache reorder indices, -// idx_0: dst_beam batch idx, idx_1: src_beam batch idx +// idx_0: dst_beam idx, idx_1: src_beam idx // for copy predicted past token kv cache // for example: // - c @@ -2389,145 +2516,164 @@ void beam_search_flow::fill_next_beams_by_top_scores() { // kv_cache_reorder_indices = {{0,0}, {1,0}} // if kv_cache_reorder_indices = {0:0, 1:1}, then do not need reorder (cpy) std::vector> beam_search_flow::update_kv_cache_reorder_indices() { - MODEL_ASSERT(next_beams.size() == beam_size); - MODEL_ASSERT(cur_beams.size() == beam_size); // DEBUG #ifdef NE_BEAM_SEARCH_VERBOSE_ON printf("kv cache update indices info: \n"); - printf("cur_beams: "); - for (int i = 0; i < beam_size; ++i) { - printf("%d, ", cur_beams[i].infer_bs_id); + printf("cur_beams:\n"); + for (int bb = 0; bb < request_running_indices.size(); ++bb) { + printf("------batch_%d------\n", bb); + for (int i = 0; i < beam_size; ++i) { + printf("%d, ", cur_beams[i + request_running_indices[bb] * beam_size].beam_idx); + } + printf("\n"); } - 
printf("\n"); - printf("next_beams: "); - for (int i = 0; i < beam_size; ++i) { - printf("%d, ", next_beams[i].infer_bs_id); + printf("next_beams:\n"); + for (int bb = 0; bb < request_running_indices.size(); ++bb) { + printf("------batch_%d------\n", bb); + for (int i = 0; i < beam_size; ++i) { + printf("%d, ", next_beams[i + request_running_indices[bb] * beam_size].beam_idx); + } + printf("\n"); } - printf("\n"); #endif std::vector> kv_reorder_indices; - kv_reorder_indices.reserve(beam_size); - // shuffle beams which are early stopped (eos) - // keep them behind beams which have non-eos - // next_beams infer_bs_id: [0, 1(eos), 2(eos), 3] - > [0, 3, 1(eos), 2(eos)] - std::vector cpy_eos_bs_ids; - std::vector cpy_final_bs_ids; - std::vector nb_eos_ids; + kv_reorder_indices.resize(ctx->request_running_bs * beam_size); std::vector nb_shuffle_ids; - cpy_final_bs_ids.reserve(beam_size); - for (int i = 0; i < beam_size; ++i) { - MODEL_ASSERT(cur_beams[i].infer_bs_id == i); - if (next_beams[i].eos()) { - cpy_eos_bs_ids.push_back(next_beams[i].infer_bs_id); - nb_eos_ids.push_back(i); + for (int rb = 0; rb < request_running_indices.size(); ++rb) { + std::vector cpy_final_bs_ids; + const int rb_off = request_running_indices[rb] * beam_size; + for (int i = 0; i < beam_size; ++i) { + MODEL_ASSERT(cur_beams[i + rb_off].beam_idx == i); + cpy_final_bs_ids.push_back(next_beams[i + rb_off].beam_idx); + // update beam_idx for next token generation + next_beams[i + rb_off].beam_idx = i; + } + // we arrange beams by inference batch indice rather score for memcpy time reduction + // so there will be 2 circumstances (ignore no memcpy : 0,1,2,3 --> 0,1,2,3) + // 1. cpoy former beams into latter beams, like: 0,1,2,3 --> 0,0,0,1 + // 2. copy latter beams into former beams, like: 0,1,2,3 -- > 1,2,2,3 + // kv cache memcpy happens in itself which would cause memory dislocation if follows wrong order + // so we give the contrary order to beams vector indice, which is: + // if 1, copy order is from tail to head + // if 2, copy order is from head to tail + bool cpy_from_head = true; + int dst_idx_sum = 0; + int src_idx_sum = 0; + for (int i = 0; i < cpy_final_bs_ids.size(); ++i) { + dst_idx_sum += i; + src_idx_sum += cpy_final_bs_ids[i]; + if (src_idx_sum < dst_idx_sum) { + cpy_from_head = false; + break; + } + } + if (cpy_from_head) { + int insert_idx = 0; + for (int i = 0; i < cpy_final_bs_ids.size(); ++i) { + kv_reorder_indices[insert_idx + rb_off] = std::move(std::make_tuple(i, cpy_final_bs_ids[i])); + insert_idx++; + } } else { - cpy_final_bs_ids.push_back(next_beams[i].infer_bs_id); - nb_shuffle_ids.push_back(i); - } - } - cpy_final_bs_ids.insert(cpy_final_bs_ids.end(), cpy_eos_bs_ids.begin(), cpy_eos_bs_ids.end()); - nb_shuffle_ids.insert(nb_shuffle_ids.end(), nb_eos_ids.begin(), nb_eos_ids.end()); - - // update indices and batch ids - for (int i = 0; i < beam_size; ++i) { - // update infer_bs_id before next beam generation - next_beams[nb_shuffle_ids[i]].infer_bs_id = i; - } - // beams should be ordered by batch id - std::sort(next_beams.begin(), next_beams.end(), [](beam& a, beam& b) { return a.infer_bs_id < b.infer_bs_id; }); - - // we arrange beams by inference batch indice rather score for memcpy time reduction - // so there will be 2 circumstances (ignore no memcpy : 0,1,2,3 --> 0,1,2,3) - // 1. cpoy former beams into latter beams, like: 0,1,2,3 --> 0,0,0,1 - // 2. 
copy latter beams into former beams, like: 0,1,2,3 -- > 1,2,2,3 - // kv cache memcpy happens in itself which would cause memory dislocation if follows wrong order - // so we give the contrary order to beams vector indice, which is: - // if 1, copy order is from tail to head - // if 2, copy order is from head to tail - bool cpy_from_head = true; - int dst_idx_sum = 0; - int src_idx_sum = 0; - for (int i = 0; i < cpy_final_bs_ids.size(); ++i) { - dst_idx_sum += i; - src_idx_sum += cpy_final_bs_ids[i]; - if (src_idx_sum < dst_idx_sum) { - cpy_from_head = false; - break; + int insert_idx = 0; + for (int i = cpy_final_bs_ids.size() - 1; i >= 0; --i) { + kv_reorder_indices[insert_idx + rb_off] = std::move(std::make_tuple(i, cpy_final_bs_ids[i])); + insert_idx++; + } } - } - if (cpy_from_head) { - for (int i = 0; i < cpy_final_bs_ids.size(); ++i) { - kv_reorder_indices.push_back({i, cpy_final_bs_ids[i]}); + // DEBUG +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("batch_%d cpy_final_bs_ids: ", rb); + for (int i = 0; i < beam_size; ++i) { + printf("%d, ", cpy_final_bs_ids[i]); + } + printf("\n"); + printf("copy order: "); + if (cpy_from_head) { + printf("copy_from_head ---> \n"); + } else { + printf("copy_from_tail <--- \n"); } - } else { - for (int i = cpy_final_bs_ids.size() - 1; i >= 0; --i) { - kv_reorder_indices.push_back({i, cpy_final_bs_ids[i]}); + if (rb == request_running_indices.size() - 1) { + printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); } +#endif } + return kv_reorder_indices; +} - // DEBUG -#ifdef NE_BEAM_SEARCH_VERBOSE_ON - printf("cpy_final_bs_ids: "); - for (int i = 0; i < beam_size; ++i) { - printf("%d, ", cpy_final_bs_ids[i]); - } - printf("\n"); - printf("nb_shuffle_ids: "); - for (int i = 0; i < beam_size; ++i) { - printf("%d, ", nb_shuffle_ids[i]); - } - printf("\n"); - printf("next_beams after: "); - for (int i = 0; i < beam_size; ++i) { - printf("%d, ", next_beams[i].infer_bs_id); +void beam_search_flow::update_status() { + // check if done + next_done_request_ids.clear(); + for (int h = 0; h < beam_hypos.size(); ++h) { + if (requests_done[h]) continue; + const bool enough_new_tokens = (cur_beams[h * beam_size].token_ids.size() == ctx->generation_conf.max_new_tokens); + if (beam_hypos[h].is_done() || enough_new_tokens) { + requests_done[h] = true; + next_done_request_ids.push_back(h); + // mark done beams of current request_idx + // batch reduction + std::for_each(cur_beams.begin() + h * beam_size, cur_beams.begin() + (h + 1) * beam_size, + [&](auto& b) { b.done = true; }); + } } - printf("\n"); - printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); -#endif - return kv_reorder_indices; } // Return beam with highest probability. 
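The copy-direction rule used by update_kv_cache_reorder_indices can be seen in isolation with a small in-place reorder over plain slots; the heuristic below mirrors the dst/src index-sum check above. reorder_in_place and its arguments are illustrative only, standing in for the per-beam KV blocks of a single request.

#include <cassert>
#include <vector>

// slots: one request's per-beam KV blocks (stand-in); src[i]: which slot destination slot i is copied from.
template <typename T>
void reorder_in_place(std::vector<T>& slots, const std::vector<int>& src) {
  assert(slots.size() == src.size());
  bool from_head = true;
  int dst_sum = 0, src_sum = 0;
  for (size_t i = 0; i < src.size(); ++i) {
    dst_sum += static_cast<int>(i);
    src_sum += src[i];
    if (src_sum < dst_sum) {  // some source sits before its destination -> copy from the tail
      from_head = false;
      break;
    }
  }
  if (from_head) {
    for (size_t i = 0; i < src.size(); ++i) slots[i] = slots[src[i]];
  } else {
    for (size_t i = src.size(); i-- > 0;) slots[i] = slots[src[i]];
  }
}
// src = {0, 0, 0, 1} triggers tail-first copies, src = {1, 2, 2, 3} head-first copies,
// matching the order of the (dst, src) tuples that update_kv_cache_reorder_indices emits per request.

Copying in the wrong direction would overwrite a source block before it has been read, which is exactly the memory-dislocation hazard the comment above warns about.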
-const beam& beam_search_flow::finalize() { +const beam& beam_search_flow::finalize(const int& request_idx) { #ifdef NE_BEAM_SEARCH_VERBOSE_ON printf("========================================================================================= \n"); - printf("finalize: \n"); + printf("request_idx_%d finalize:\n", request_idx); printf("before: \n"); - for (auto b : beam_hypos[0].beams) { + for (auto b : beam_hypos[request_idx].beams) { b.print(); } printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); #endif - if (!requests_done[0]) { - for (const auto b : cur_beams) { - beam_hypos[0].add(b, n_prompt_tokens); + if (!beam_hypos[request_idx].is_done()) { + for (int i = 0; i < beam_size; ++i) { + beam b = cur_beams[request_idx * beam_size + i]; + beam_hypos[request_idx].add(b, n_prompt_tokens[request_idx]); } #ifdef NE_BEAM_SEARCH_VERBOSE_ON printf("after (adding more beams from outside): \n"); - for (auto b : beam_hypos[0].beams) { + for (auto b : beam_hypos[request_idx].beams) { b.print(); } printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); - printf("========================================================================================= \n"); #endif } - return beam_hypos[0].top1(); + const beam& top_b = beam_hypos[request_idx].top1(); +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("final beam of request_idx %d:\n", request_idx); + top_b.print(); + printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); + printf("========================================================================================= \n"); +#endif + return top_b; } -// TODO batch_size = 1 only -// TODO batch prompt processing -std::vector beam_search_flow::loop(const model_token* tokens_inp, const int& n_tokens, - const int& n_threads) { - if (n_tokens > model_n_ctx(ctx)) { - fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, n_tokens, model_n_ctx(ctx) - 4); - return std::vector(); +const std::vector>& beam_search_flow::loop(const std::vector& inputs, + const int& n_threads) { + // n_past, n_tokens, n_prompt_tokens should be same among batches in static batching inference + n_tokens.assign(request_bs, inputs[0].n_tokens); + if (n_tokens[0] > model_n_ctx(ctx)) { + fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, n_tokens[0], model_n_ctx(ctx) - 4); + return response; } num_threads = n_threads; - n_prompt_tokens = n_tokens; - std::vector beam_search_response; - std::vector embd(tokens_inp, tokens_inp + n_tokens); + n_past.assign(request_bs, 0); + n_prompt_tokens.assign(request_bs, n_tokens[0]); + n_total.assign(request_bs, 0); + for (const auto& input : inputs) { + padding_side.push_back(input.padding_side); + n_padding.push_back(input.n_padding); + } - ctx->batch_size = 1; + ctx->batch_size = request_bs; + ctx->request_running_bs = request_bs; + for (int i = 0; i < request_bs; ++i) { + request_running_indices.push_back(i); + } const uint32_t max_new_tokens = ctx->generation_conf.max_new_tokens; // Loop ends in: 1. all requests done; or 2. reach max_new_tokens length @@ -2536,47 +2682,55 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c if (kv_reorder == nullptr) { kv_reorder = std::make_shared(ctx); #ifdef NE_BEAM_SEARCH_VERBOSE_ON - printf("WARNING: using default kv cache update function. \n"); + printf( + "WARNING: Using default kv cache update function. 
Ignore this warning if your K shape = [head_dim, N, n_head], " + "V shape = [N, head_dim, n_head]\n"); #endif } - beam_hypos.push_back(beam_hypotheses(ctx)); // TODO ctx->request_running_bs; - requests_done.push_back(false); for (int n = 0; n < max_new_tokens; ++n) { // first step - if (n_past == 0) { - model_eval(ctx, embd.data(), n_tokens, n_past, n_total, num_threads); - n_past += n_tokens; - n_total += n_tokens; - kv_reorder->update(n_past, n_tokens); - std::vector next_tokens = beam_top_k_next_tokens(ctx, 0, {0.0f}, {1}, {0}, beam_size); - MODEL_ASSERT(next_tokens.size() == beam_size); - cur_beams.clear(); + if (n_past[0] == 0) { + model_eval(ctx, inputs.data(), inputs.size(), num_threads); + std::for_each(n_past.begin(), n_past.end(), [&](auto& n) { n += n_tokens[0]; }); + std::for_each(n_total.begin(), n_total.end(), [&](auto& n) { n += n_tokens[0]; }); + kv_reorder->update(n_past, n_prompt_tokens, request_running_indices); + std::vector beam_scores(ctx->batch_size, 0.0f); + std::vector num_beams(ctx->request_running_bs, 1); + std::vector beam_indices(ctx->batch_size, 0); + std::vector next_tokens = + beam_top_k_next_tokens(ctx, beam_scores, num_beams, beam_indices, beam_size); + MODEL_ASSERT(next_tokens.size() == ctx->request_running_bs * beam_size); // DEBUG #ifdef NE_BEAM_SEARCH_VERBOSE_ON printf("========================================================================================== \n"); printf("top_k next_tokens: \n"); - for (auto kk : next_tokens) { - printf("%d: %s, score: %12.6f, beam_idx: %d \n", kk.id, (ctx->vocab.id_to_token.at(kk.id).tok).c_str(), - kk.score, kk.beam_idx); + int bb = 0; + for (int kk = 0; kk < next_tokens.size(); ++kk) { + if (kk % beam_size == 0) printf("------batch_%d------\n", bb++); + printf("%d: %s, score: %10.6f, beam_idx: %d \n", next_tokens[kk].id, + (ctx->vocab.id_to_token.at(next_tokens[kk].id).tok).c_str(), next_tokens[kk].score, + next_tokens[kk].beam_idx); } printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); #endif - for (int i = 0; i < beam_size; ++i) { - beam b; - b.ctx = ctx; - b.token_ids.push_back(next_tokens[i].id); - b.score = next_tokens[i].score; - b.infer_bs_id = i; - cur_beams.push_back(b); + for (int rb = 0; rb < request_running_indices.size(); ++rb) { + for (int i = 0; i < beam_size; ++i) { + beam b; + b.ctx = ctx; + b.token_ids.push_back(next_tokens[i + rb * beam_size].id); + b.score = next_tokens[i + rb * beam_size].score; + b.beam_idx = i; + b.request_idx = request_running_indices[rb]; + cur_beams[request_running_indices[rb] * beam_size + i] = std::move(b); + } } } else { fill_next_beams_by_top_scores(); std::vector> kv_reorder_indices = update_kv_cache_reorder_indices(); - n_past += 1; - n_total += 1; - kv_reorder->update(n_past, n_tokens, kv_reorder_indices, next_beams); + std::for_each(n_past.begin(), n_past.end(), [&](auto& n) { n++; }); + std::for_each(n_total.begin(), n_total.end(), [&](auto& n) { n++; }); + kv_reorder->update(n_past, n_prompt_tokens, request_running_indices, kv_reorder_indices, next_beams); cur_beams.swap(next_beams); - next_beams.clear(); } // DEBUG: print current beams for this iteration @@ -2590,41 +2744,22 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); #endif - // check if done - for (int h = 0; h < beam_hypos.size(); ++h) { - if (requests_done[h]) { - continue; - } - if (beam_hypos[h].is_done()) { - requests_done[h] 
= true; - } - } - auto const done_or_not = [](const bool& flag) { return flag; }; - if (std::all_of(requests_done.begin(), requests_done.end(), done_or_not)) { - break; + update_status(); + // collect request final generation result if done + for (const auto& didx : next_done_request_ids) { + const beam& top_b = finalize(didx); + response[didx] = top_b.token_ids; } + // return if all requests done in static batching + if (std::find(requests_done.begin(), requests_done.end(), false) == requests_done.end()) break; } - const beam& top_b = finalize(); - -#ifdef NE_BEAM_SEARCH_VERBOSE_ON // DEBUG: print final beam result - printf("========================================================================================= \n"); - printf("final beam:\n"); - top_b.print(); - printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); - printf("========================================================================================= \n"); -#endif - - beam_search_response.clear(); - for (const auto& id : top_b.token_ids) { - beam_search_response.push_back(id); - } - return beam_search_response; + return response; } -std::vector beam_search(model_context* lctx, const int& n_predict, const model_token* tokens_inp, - const int& n_tokens, const int& n_threads) { +std::vector> beam_search(model_context* lctx, const int& n_predict, + const std::vector& inputs, const int& n_threads) { lctx->generation_conf.max_new_tokens = n_predict; - beam_search_flow bsf(lctx); - return bsf.loop(tokens_inp, n_tokens, n_threads); + beam_search_flow bsf(lctx, inputs.size()); + return bsf.loop(inputs, n_threads); } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h index bb053b48477..dd87e1941bc 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h @@ -119,12 +119,12 @@ MODEL_API bool model_save_session_file(struct model_context* ctx, const char* pa size_t n_token_total); // Run the model inference to obtain the logits and probabilities for the next +// model_input has some necessary members for inference (more details please see model_types.h): // token. tokens + n_tokens is the provided batch of new tokens to process // n_past is the offset to which the kv is cached to // n_total is the number of tokens evaluated in previous eval calls // Returns 0 on success -MODEL_API int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads); +MODEL_API int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads); // Convert the provided text into tokens. // The tokens pointer must be large enough to hold the resulting tokens. 
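As a usage sketch of the new model_eval entry point: one model_input per sequence, with all rows already left-padded to a common length for static batching. model_input and model_token are declared in model_types.h; build_prefill_inputs is a hypothetical helper, not part of the runtime, and the pointed-to token rows must outlive the call.

#include <cstdint>
#include <vector>
// model_input / model_token come from models/model_utils/model_types.h (include path depends on the build).

std::vector<model_input> build_prefill_inputs(const std::vector<std::vector<model_token>>& padded_rows,
                                              const std::vector<uint32_t>& n_padding) {
  std::vector<model_input> inputs;
  inputs.reserve(padded_rows.size());
  for (size_t r = 0; r < padded_rows.size(); ++r) {
    inputs.push_back(model_input{
        /*.tokens          =*/padded_rows[r].data(),
        /*.n_tokens        =*/static_cast<uint32_t>(padded_rows[r].size()),
        /*.n_prompt_tokens =*/static_cast<uint32_t>(padded_rows[r].size()),
        /*.n_past          =*/0,
        /*.n_total         =*/0,
        /*.request_idx     =*/static_cast<int>(r),
        /*.beam_idx        =*/0,
        /*.padding_side    =*/0,  // left padding, the only side supported by the decoder-only path
        /*.n_padding       =*/n_padding[r],
    });
  }
  return inputs;
}
// A batched prefill is then: model_eval(ctx, inputs.data(), inputs.size(), n_threads);

The decoding steps reuse the same struct with n_tokens = 1 and a growing n_past, as fill_next_beams_by_top_scores does above.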
@@ -259,31 +259,64 @@ MODEL_API const char* model_print_system_info(void); } #endif +/* kv cache utils */ +// kv cache both stores permuted tensor +// k shape is [head_dim, N, n_head] +// v shape is [N, head_dim, n_head] or [N, n_embd] +/* kv cache utils */ + +// copy consecutive tokens from one seq to another +MODEL_API void model_kv_cache_seq_cpy(struct model_context* ctx, const model_seq_id& seq_id_src, + const model_seq_id& seq_id_dst, const model_pos& p0, const model_pos& p1); + +// concat several seqs into a continuous batch from kv cache +MODEL_API ne_tensor* model_kv_cache_seq_concat(struct ne_cgraph* cgraph, struct model_context* moctx, + struct ne_context* nectx, const int64_t& ne0, const int64_t& ne1, + const int64_t& ne2, const int64_t& ne3, + const std::vector& block_ids, const int& layer_idx, + const bool& concat_k = true); + /* beam search utils */ #define NEG_INF -std::numeric_limits::max() typedef struct beam_next_token { - model_token id; // token id - float score; // score of the token - int beam_idx; // token in which beam (-1 means unknown) + model_token id = -1; // token id + float score = 0.0f; // score of the token + int beam_idx = -1; // token in which beam (-1 means unknown) } beam_next_token; struct beam { const model_context* ctx = nullptr; std::vector token_ids; // Cumulative beam score (log-softmax here) - float score; - // record inference batch indice - int infer_bs_id; + float score = 0.0f; + // record related indices + // 0 - request_bs-1 + int request_idx = -1; + // 0 - num_beams-1 + int beam_idx = -1; + // if stop generation (append new token_id) + bool done = false; + // end-of-text const bool eos() const { return !token_ids.empty() && token_ids.back() == ctx->vocab.eos_token_id; } + void print() const { - printf("length: %d, score: %12.6f, eos: %d, tokens:\n", token_ids.size(), score, eos()); + printf("length: %d, score: %12.6f, eos: %d, request_idx: %d, beam_idx: %d, done: %d, tokens:\n", token_ids.size(), + score, eos(), request_idx, beam_idx, done); for (const auto& id : token_ids) { printf("%d: %s, ", id, model_token_to_str(ctx, id)); } printf("\n"); } + + void clear() { + token_ids.clear(); + score = 0.0f; + request_idx = -1; + beam_idx = -1; + done = false; + } }; struct beam_hypotheses { @@ -345,6 +378,8 @@ struct beam_hypotheses { auto const by_score = [](beam const& a, beam const& b) { return a.score < b.score; }; return *std::max_element(beams.begin(), beams.end(), by_score); } + + void clear() { beams.clear(); } }; struct logits_info; @@ -354,8 +389,8 @@ class logits_processor { explicit logits_processor(model_context* lctx) : ctx(lctx), min_new_tokens(lctx->generation_conf.min_new_tokens) {} ~logits_processor() {} - void process(const uint32_t& cur_len, const model_vocab::id& eos_token_id); - void min_new_tokens_logits_process(const uint32_t& cur_len, const model_vocab::id& eos_token_id); + void process(const std::vector& cur_lens, const model_vocab::id& eos_token_id); + void min_new_tokens_logits_process(const std::vector& cur_lens, const model_vocab::id& eos_token_id); private: model_context* ctx = nullptr; @@ -370,66 +405,83 @@ class logits_processor { class beam_search_kv_cache_reorder { public: explicit beam_search_kv_cache_reorder(model_context* lctx) - : ctx(lctx), - n_ctx(lctx->n_ctx), - n_embd(lctx->model.hparams.n_embd), - head_dim(lctx->model.hparams.n_embd / lctx->model.hparams.n_head), - n_head(lctx->model.hparams.n_head), - kv_n_ctx_block(lctx->kv_n_ctx_block) {} - ~beam_search_kv_cache_reorder() {} - - virtual void 
update(const uint32_t& n_past, const uint32_t& n_prompt_tokens, + : ctx(lctx), n_ctx(lctx->n_ctx), kv_n_ctx_block(lctx->kv_n_ctx_block) {} + virtual ~beam_search_kv_cache_reorder() {} + + virtual void update(const std::vector& n_past, const std::vector& n_prompt_tokens, + const std::vector request_running_indices, const std::vector>& kv_reorder_indices = {}, const std::vector& next_beams = {}); protected: model_context* ctx = nullptr; const uint32_t n_ctx; - const uint32_t n_embd; - // const uint32_t n_head_kv; - const uint32_t head_dim; - const uint32_t n_head; const uint32_t kv_n_ctx_block; }; class beam_search_flow { public: - explicit beam_search_flow(model_context* lctx) : ctx(lctx), beam_size(lctx->beam_size), lp(logits_processor(lctx)) { - cur_beams.reserve(beam_size); - next_beams.reserve(beam_size); - cur_beams.push_back({ctx, {}, 0.0f}); + explicit beam_search_flow(model_context* lctx, const int batch_size = 1) + : ctx(lctx), beam_size(lctx->beam_size), request_bs(batch_size), lp(logits_processor(lctx)) { + cur_beams.resize(batch_size * beam_size); + next_beams.resize(batch_size * beam_size); + for (int i = 0; i < batch_size; ++i) { + beam_hypos.push_back(std::move(beam_hypotheses(lctx))); + } + response.resize(batch_size); + requests_done.assign(batch_size, false); + request_running_indices.reserve(batch_size); + next_done_request_ids.reserve(batch_size); + n_tokens.reserve(batch_size); + n_past.reserve(batch_size); + n_prompt_tokens.reserve(batch_size); + n_total.reserve(batch_size); + padding_side.reserve(batch_size); + n_padding.reserve(batch_size); } ~beam_search_flow() {} // public interface - std::vector loop(const model_token* tokens_inp, const int& n_tokens, const int& n_threads); + // static batching (padding inputs or batch = 1) + const std::vector>& loop(const std::vector& inputs, const int& n_threads); + // continuous batching (scheduling from the outside) + void step(model_token* dst); // TODO one step private: - std::vector beam_top_k_next_tokens(model_context* ctx, const uint32_t& cur_len, - const std::vector& beams_score, + std::vector beam_top_k_next_tokens(model_context* ctx, const std::vector& beams_score, const std::vector& num_beams, const std::vector beam_indices, const int& sample_scale = 2, const int& dim = -1); void fill_next_beams_by_top_scores(); std::vector> update_kv_cache_reorder_indices(); - const beam& finalize(); + void update_status(); + const beam& finalize(const int& request_idx); model_context* ctx = nullptr; const int beam_size; + const int request_bs; // could be the max bs in continuous batching mechanism std::vector cur_beams; std::vector next_beams; std::vector beam_hypos; std::vector requests_done; - uint32_t n_past = 0; - uint32_t n_total = 0; - uint32_t n_prompt_tokens = 0; + std::vector request_running_indices; + std::vector next_done_request_ids; + std::vector n_tokens; + std::vector n_past; + std::vector n_prompt_tokens; + std::vector n_total; + std::vector padding_side; + std::vector n_padding; int num_threads = 4; // default by 4 logits_processor lp; std::shared_ptr kv_reorder; + std::vector> response; }; -MODEL_API std::vector beam_search(model_context* lctx, const int& n_predict, const model_token* tokens_inp, - const int& n_tokens, const int& n_threads); +// static batching generation +MODEL_API std::vector> beam_search(model_context* lctx, const int& n_predict, + const std::vector& inputs, + const int& n_threads); // Internal API to be implemented by model.cpp and used by tests/benchmarks only #ifdef MODEL_API_INTERNAL diff 
--git a/intel_extension_for_transformers/llm/runtime/graph/models/mpt/mpt.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/mpt/mpt.cpp index 48987264152..11f651d7031 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/mpt/mpt.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/mpt/mpt.cpp @@ -40,18 +40,20 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // -static bool mpt_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool mpt_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { const int64_t t_start_us = ne_time_us(); - const int N = n_tokens; - + // TODO static batching for now + const int N = inputs->n_tokens; + const int n_past = inputs->n_past; + const int n_total = inputs->n_total; const int batch_size = lctx.batch_size; + MODEL_ASSERT(batch_size == n_input); const auto& model = lctx.model; const auto& hparams = model.hparams; @@ -112,7 +114,9 @@ static bool mpt_model_eval_internal(model_context& lctx, const model_token* toke struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N); ne_set_name(embd, "embd"); - memcpy(embd->data, tokens, N * ne_element_size(embd)); + for (int i = 0; i < batch_size; ++i) { + memcpy(static_cast(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd)); + } struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); @@ -352,9 +356,8 @@ static bool mpt_model_eval_internal(model_context& lctx, const model_token* toke return true; } -int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total, - int n_threads) { - if (!mpt_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) { +int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) { + if (!mpt_model_eval_internal(*ctx, inputs, n_input, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/opt/opt.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/opt/opt.cpp index f9122aa5bc8..c2c17d1d1ed 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/opt/opt.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/opt/opt.cpp @@ -37,18 +37,22 @@ // evaluate the transformer // // - lctx: model context -// - tokens: new batch of tokens to process -// - n_past: the offset to which the kv is cached to -// - n_total: the number of tokens evaluated so far (including evicted tokens if there is any) +// - inputs: model_input array +// - n_input num of model_input // - n_threads: number of threads to use // #define OPT_POS_EMBD_OFFS 2 -static bool opt_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens, - const int n_past, const int n_total, const int n_threads) { +static bool opt_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input, + const int n_threads) { const int64_t t_start_us = ne_time_us(); - const int N = n_tokens; + // TODO static batching for now + 
diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/opt/opt.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/opt/opt.cpp
index f9122aa5bc8..c2c17d1d1ed 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/models/opt/opt.cpp
+++ b/intel_extension_for_transformers/llm/runtime/graph/models/opt/opt.cpp
@@ -37,18 +37,22 @@
 // evaluate the transformer
 //
 //   - lctx:      model context
-//   - tokens:    new batch of tokens to process
-//   - n_past:    the offset to which the kv is cached to
-//   - n_total:   the number of tokens evaluated so far (including evicted tokens if there is any)
+//   - inputs:    model_input array
+//   - n_input:   num of model_input
 //   - n_threads: number of threads to use
 //
 #define OPT_POS_EMBD_OFFS 2
-static bool opt_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens,
-                                    const int n_past, const int n_total, const int n_threads) {
+static bool opt_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input,
+                                    const int n_threads) {
   const int64_t t_start_us = ne_time_us();
-  const int N = n_tokens;
+  // TODO static batching for now
+  const int N = inputs->n_tokens;
+  const int n_past = inputs->n_past;
+  const int n_total = inputs->n_total;
+  const int batch_size = lctx.batch_size;
+  MODEL_ASSERT(batch_size == n_input);
 
   const auto& model = lctx.model;
   const auto& hparams = model.hparams;
@@ -84,7 +88,9 @@ static bool opt_model_eval_internal(model_context& lctx, const model_token* toke
   struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
   ne_set_name(embd, "embd");
-  memcpy(embd->data, tokens, N * ne_element_size(embd));
+  for (int i = 0; i < batch_size; ++i) {
+    memcpy(static_cast<model_token*>(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd));
+  }
 
   /* class OPTLearnedPositionalEmbedding(nn.Embedding)
      attention_mask = attention_mask.long()
@@ -366,9 +372,8 @@ static bool opt_model_eval_internal(model_context& lctx, const model_token* toke
   return true;
 }
 
-int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total,
-               int n_threads) {
-  if (!opt_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) {
+int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) {
+  if (!opt_model_eval_internal(*ctx, inputs, n_input, n_threads)) {
     fprintf(stderr, "%s: failed to eval\n", __func__);
     return 1;
   }
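The same model_eval signature change is applied to opt.cpp above and starcoder.cpp below: callers now hand over an array of model_input whose length must equal the batch_size the context was created with (enforced by MODEL_ASSERT). A hedged caller-side sketch; the function name eval_two_prompts and the zero-initialization of model_input are assumptions rather than code from this patch:

    // Sketch only: evaluate the prompt tokens of two requests in one call.
    void eval_two_prompts(model_context* ctx, const model_token* ids_a, const model_token* ids_b,
                          const int n_tokens, const int n_threads) {
      model_input inputs[2] = {};     // assumes model_input is default/zero-initializable
      inputs[0].tokens = ids_a;
      inputs[0].n_tokens = n_tokens;  // both requests share N under static batching
      inputs[1].tokens = ids_b;
      inputs[1].n_tokens = n_tokens;
      // n_past / n_total stay 0 for the first (prompt) evaluation
      model_eval(ctx, inputs, /*n_input=*/2, n_threads);
    }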
diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/starcoder/starcoder.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/starcoder/starcoder.cpp
index 36ca2545598..d98f6fb3373 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/models/starcoder/starcoder.cpp
+++ b/intel_extension_for_transformers/llm/runtime/graph/models/starcoder/starcoder.cpp
@@ -40,18 +40,21 @@
 // evaluate the transformer
 //
 //   - lctx:      model context
-//   - tokens:    new batch of tokens to process
-//   - n_past:    the offset to which the kv is cached to
-//   - n_total:   the number of tokens evaluated so far (including evicted tokens if there is any)
+//   - inputs:    model_input array
+//   - n_input:   num of model_input
 //   - n_threads: number of threads to use
 //
-static bool starcoder_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens,
-                                          const int n_past, const int n_total, const int n_threads) {
+static bool starcoder_model_eval_internal(model_context& lctx, const model_input* inputs, const int n_input,
+                                          const int n_threads) {
   const int64_t t_start_us = ne_time_us();
-  const int N = n_tokens;
+  // TODO static batching for now
+  const int N = inputs->n_tokens;
+  const int n_past = inputs->n_past;
+  const int n_total = inputs->n_total;
   const int batch_size = lctx.batch_size;
+  MODEL_ASSERT(batch_size == n_input);
 
   const auto& model = lctx.model;
   const auto& hparams = model.hparams;
@@ -113,7 +116,9 @@ static bool starcoder_model_eval_internal(model_context& lctx, const model_token
   struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
   ne_set_name(embd, "embd");
-  memcpy(embd->data, tokens, N * ne_element_size(embd));
+  for (int i = 0; i < batch_size; ++i) {
+    memcpy(static_cast<model_token*>(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd));
+  }
 
   struct ne_tensor* position = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
   for (int i = 0; i < N; ++i) {
@@ -438,9 +443,8 @@ static bool starcoder_model_eval_internal(model_context& lctx, const model_token
   return true;
 }
 
-int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_total,
-               int n_threads) {
-  if (!starcoder_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_total, n_threads)) {
+int model_eval(struct model_context* ctx, const model_input* inputs, const int n_input, int n_threads) {
+  if (!starcoder_model_eval_internal(*ctx, inputs, n_input, n_threads)) {
     fprintf(stderr, "%s: failed to eval\n", __func__);
     return 1;
   }
diff --git a/tests/test_llm_runtime.py b/tests/test_llm_runtime.py
index 4b147bf7f50..315c848224c 100644
--- a/tests/test_llm_runtime.py
+++ b/tests/test_llm_runtime.py
@@ -4,7 +4,9 @@
 import unittest
 
 from transformers import AutoTokenizer, TextStreamer
-from intel_extension_for_transformers.transformers import AutoModel, WeightOnlyQuantConfig
+from intel_extension_for_transformers.transformers import AutoModel, WeightOnlyQuantConfig, AutoModelForCausalLM
+from intel_extension_for_transformers.llm.runtime.graph.scripts.convert import convert_model
+from intel_extension_for_transformers.llm.runtime.graph import Model
 
 
 class TestLLMRUNTIME(unittest.TestCase):
@@ -16,6 +18,7 @@ def setUpClass(cls):
 
     @classmethod
     def tearDownClass(cls) -> None:
        shutil.rmtree("./ne_chatglm_q.bin", ignore_errors=True)
+        shutil.rmtree("./gptj_fp32.bin", ignore_errors=True)
 
     def test_llm_runtime(self):
@@ -28,4 +31,39 @@ def test_llm_runtime(self):
 
         streamer = TextStreamer(tokenizer)
         model = AutoModel.from_pretrained(model_name, quantization_config=woq_config, use_llm_runtime=True, trust_remote_code=True)
-        gen_tokens = model.generate(input_ids, streamer=streamer, max_new_tokens=300)
\ No newline at end of file
+        gen_tokens = model.generate(input_ids, streamer=streamer, max_new_tokens=300)
+
+    def test_beam_search(self):
+        model_name = "/tf_dataset2/models/pytorch/gpt-j-6B"  # or local path to model
+        prompts = [
+           "she opened the door and see",
+           "tell me 10 things about jazz music",
+           "What is the meaning of life?",
+           "To be, or not to be, that is the question: Whether 'tis nobler in the mind to suffer"\
+           " The slings and arrows of outrageous fortune, "\
+           "Or to take arms against a sea of troubles."\
+           "And by opposing end them. To die—to sleep,"
+        ]
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,
+                                                  padding_side="left")
+        tokenizer.pad_token = tokenizer.eos_token
+        pad_token = tokenizer(tokenizer.pad_token)['input_ids'][0]
+        inputs = tokenizer(prompts, padding=True, return_tensors='pt')
+
+        # pytorch fp32
+        pt_model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+        pt_model.eval()
+        pt_generate_ids = pt_model.generate(**inputs, max_new_tokens=128, min_new_tokens=30,
+                                            early_stopping=True, num_beams=4).tolist()
+        # llm runtime fp32
+        convert_model(model_name, "gptj_fp32.bin", "f32")
+        itrex_model = Model()
+        itrex_model.init_from_bin("gptj", "gptj_fp32.bin", batch_size=4, num_beams=4,
+                                  max_new_tokens=128, min_new_tokens=30, early_stopping=True,
+                                  pad_token=pad_token)
+        itrex_generate_ids = itrex_model.generate(inputs.input_ids, batch_size=4, num_beams=4,
+                                                  max_new_tokens=128, min_new_tokens=30, early_stopping=True,
+                                                  pad_token=pad_token)
+        for i in range(len(itrex_generate_ids)):
+            self.assertListEqual(pt_generate_ids[i], itrex_generate_ids[i])