From bffa1b0e5d791f24bfc7efc49df1d2f64147df3d Mon Sep 17 00:00:00 2001
From: luoyu-intel
Date: Thu, 14 Sep 2023 16:08:57 +0800
Subject: [PATCH] [Graph] windows build (#312)

* fix win build error

* add win header

* modify MD

* clang-format 14
---
 .../llm/runtime/graph/README.md                         | 8 ++++++++
 .../llm/runtime/graph/application/common.h              | 1 +
 .../llm/runtime/graph/core/layers/mha_dense.cpp         | 2 +-
 .../llm/runtime/graph/models/chatglm/chatglm2_utils.cpp | 8 ++++----
 4 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/intel_extension_for_transformers/llm/runtime/graph/README.md b/intel_extension_for_transformers/llm/runtime/graph/README.md
index 988bff79241..639f8113cf5 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/README.md
+++ b/intel_extension_for_transformers/llm/runtime/graph/README.md
@@ -37,12 +37,20 @@ We support the following models:
 ## How to use

 ### 1. Build LLM Runtime
+Linux:
 ```shell
 mkdir build
 cd build
 cmake .. -G Ninja
 ninja
 ```
+Windows: install Visual Studio 2022 (a validated version), search for 'Developer PowerShell for VS 2022', open it, and run the following commands.
+```powershell
+mkdir build
+cd build
+cmake ..
+cmake --build . -j
+```

 ### 2. Convert LLM
 LLM Runtime assumes the same model format as [llama.cpp](https://github.com/ggerganov/llama.cpp) and [ggml](https://github.com/ggerganov/ggml). You can also convert the model by following the below steps:
diff --git a/intel_extension_for_transformers/llm/runtime/graph/application/common.h b/intel_extension_for_transformers/llm/runtime/graph/application/common.h
index ec66c7a42de..7cf34580c33 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/application/common.h
+++ b/intel_extension_for_transformers/llm/runtime/graph/application/common.h
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 

 #include "core/data_types.h"
 #include "core/ne_layers.h"
diff --git a/intel_extension_for_transformers/llm/runtime/graph/core/layers/mha_dense.cpp b/intel_extension_for_transformers/llm/runtime/graph/core/layers/mha_dense.cpp
index fd30fac0c30..99d6bcb4cf8 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/core/layers/mha_dense.cpp
+++ b/intel_extension_for_transformers/llm/runtime/graph/core/layers/mha_dense.cpp
@@ -1759,7 +1759,7 @@ void jblas_fusion_attn_fp32_fp16_fp16_fp32_forward(const attn_fp32_fp16_fp16_fp3
   // return jblas_fusion_attn_forward_ref(*reinterpret_cast*>(params));
 }

-bool blas_fusion_attn_fp16_support(const attn_shape_t* params) {
+bool jblas_fusion_attn_fp16_support(const attn_shape_t* params) {
 #if CompileFP16()
   GetCPUDevice();
   // TODO check K V's layout
diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm2_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm2_utils.cpp
index a3b9993f9a0..be5b053aa49 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm2_utils.cpp
+++ b/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm2_utils.cpp
@@ -144,10 +144,10 @@ void CHATGLM2::load(model_context& lctx, model_progress_callback progress_callba
   layer.attn[2] = ml->get_tensor(layers_i + ".self_attention.dense.weight", {n_embd, n_embd}, backend);

   // ffn GEMM
-  layer.ffn[0] =
-      ml->get_tensor(layers_i + ".mlp.dense_h_to_4h.weight", {n_embd, model.hparams.ffn_hidden_size * 2}, backend);
-  layer.ffn[1] =
-      ml->get_tensor(layers_i + ".mlp.dense_4h_to_h.weight", {model.hparams.ffn_hidden_size, n_embd}, backend);
+  layer.ffn[0] = ml->get_tensor(layers_i + ".mlp.dense_h_to_4h.weight",
".mlp.dense_h_to_4h.weight", + {n_embd, uint32_t(model.hparams.ffn_hidden_size * 2)}, backend); + layer.ffn[1] = ml->get_tensor(layers_i + ".mlp.dense_4h_to_h.weight", + {uint32_t(model.hparams.ffn_hidden_size), n_embd}, backend); layer.k_cache = d_ne_new_tensor_3d(model.ctx, NE_TYPE_F16, 4096 / 32, 32768, 2); layer.v_cache = d_ne_new_tensor_3d(model.ctx, NE_TYPE_F16, 32768, 4096 / 32, 2);