From bffa1b0e5d791f24bfc7efc49df1d2f64147df3d Mon Sep 17 00:00:00 2001
From: luoyu-intel
Date: Thu, 14 Sep 2023 16:08:57 +0800
Subject: [PATCH] [Graph] windows build (#312)

* fix win build error

* add win header

* modify MD

* clang-format 14
---
 .../llm/runtime/graph/README.md                         | 8 ++++++++
 .../llm/runtime/graph/application/common.h              | 1 +
 .../llm/runtime/graph/core/layers/mha_dense.cpp         | 2 +-
 .../llm/runtime/graph/models/chatglm/chatglm2_utils.cpp | 8 ++++----
 4 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/intel_extension_for_transformers/llm/runtime/graph/README.md b/intel_extension_for_transformers/llm/runtime/graph/README.md
index 988bff79241..639f8113cf5 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/README.md
+++ b/intel_extension_for_transformers/llm/runtime/graph/README.md
@@ -37,12 +37,20 @@ We support the following models:
 ## How to use

 ### 1. Build LLM Runtime
+Linux:
 ```shell
 mkdir build
 cd build
 cmake .. -G Ninja
 ninja
 ```
+Windows: install Visual Studio 2022 (a validated version), search for 'Developer PowerShell for VS 2022', open it, and run the following commands.
+```powershell
+mkdir build
+cd build
+cmake ..
+cmake --build . -j
+```

 ### 2. Convert LLM
 LLM Runtime assumes the same model format as [llama.cpp](https://github.com/ggerganov/llama.cpp) and [ggml](https://github.com/ggerganov/ggml). You can also convert the model by following the below steps:
diff --git a/intel_extension_for_transformers/llm/runtime/graph/application/common.h b/intel_extension_for_transformers/llm/runtime/graph/application/common.h
index ec66c7a42de..7cf34580c33 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/application/common.h
+++ b/intel_extension_for_transformers/llm/runtime/graph/application/common.h
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 

 #include "core/data_types.h"
 #include "core/ne_layers.h"
diff --git a/intel_extension_for_transformers/llm/runtime/graph/core/layers/mha_dense.cpp b/intel_extension_for_transformers/llm/runtime/graph/core/layers/mha_dense.cpp
index fd30fac0c30..99d6bcb4cf8 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/core/layers/mha_dense.cpp
+++ b/intel_extension_for_transformers/llm/runtime/graph/core/layers/mha_dense.cpp
@@ -1759,7 +1759,7 @@ void jblas_fusion_attn_fp32_fp16_fp16_fp32_forward(const attn_fp32_fp16_fp16_fp3
   // return jblas_fusion_attn_forward_ref(*reinterpret_cast*>(params));
 }

-bool blas_fusion_attn_fp16_support(const attn_shape_t* params) {
+bool jblas_fusion_attn_fp16_support(const attn_shape_t* params) {
 #if CompileFP16()
   GetCPUDevice();
   // TODO check K V's layout
diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm2_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm2_utils.cpp
index a3b9993f9a0..be5b053aa49 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm2_utils.cpp
+++ b/intel_extension_for_transformers/llm/runtime/graph/models/chatglm/chatglm2_utils.cpp
@@ -144,10 +144,10 @@ void CHATGLM2::load(model_context& lctx, model_progress_callback progress_callba
   layer.attn[2] = ml->get_tensor(layers_i + ".self_attention.dense.weight", {n_embd, n_embd}, backend);

   // ffn GEMM
-  layer.ffn[0] =
-      ml->get_tensor(layers_i + ".mlp.dense_h_to_4h.weight", {n_embd, model.hparams.ffn_hidden_size * 2}, backend);
-  layer.ffn[1] =
-      ml->get_tensor(layers_i + ".mlp.dense_4h_to_h.weight", {model.hparams.ffn_hidden_size, n_embd}, backend);
+  layer.ffn[0] = ml->get_tensor(layers_i + ".mlp.dense_h_to_4h.weight",
".mlp.dense_h_to_4h.weight", + {n_embd, uint32_t(model.hparams.ffn_hidden_size * 2)}, backend); + layer.ffn[1] = ml->get_tensor(layers_i + ".mlp.dense_4h_to_h.weight", + {uint32_t(model.hparams.ffn_hidden_size), n_embd}, backend); layer.k_cache = d_ne_new_tensor_3d(model.ctx, NE_TYPE_F16, 4096 / 32, 32768, 2); layer.v_cache = d_ne_new_tensor_3d(model.ctx, NE_TYPE_F16, 32768, 4096 / 32, 2);