From c5b3c1dd1c6d5ead00e8b5a077f5a149f87f6e01 Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Wed, 17 Mar 2021 07:14:24 +0000
Subject: [PATCH 01/19] test=develop

---
 CMakeLists.txt                                |   1 +
 cmake/configure.cmake                         |   4 +
 cmake/lite.cmake                              |  33 ++-
 docs/demo_guides/intel_fpga.md                | 107 ++++++++++
 lite/api/CMakeLists.txt                       |  22 +-
 lite/backends/CMakeLists.txt                  |   1 +
 lite/backends/intelfpga/CMakeLists.txt        |  23 +++
 .../backends/intelfpga/lldrv/intelfpgadrv.cpp | 192 ++++++++++++++++++
 lite/backends/intelfpga/lldrv/intelfpgadrv.h  | 186 +++++++++++++++++
 lite/backends/intelfpga/lldrv/utils.cpp       |  72 +++++++
 lite/backends/intelfpga/lldrv/utils.h         |  33 +++
 lite/backends/intelfpga/target_wrapper.cpp    |  38 ++++
 lite/backends/intelfpga/target_wrapper.h      |  60 ++++++
 lite/core/CMakeLists.txt                      |   1 +
 lite/core/context.h                           |  26 +++
 lite/kernels/CMakeLists.txt                   |   1 +
 lite/kernels/intelfpga/CMakeLists.txt         |   9 +
 lite/kernels/intelfpga/conv_compute.cc        |  99 +++++++++
 lite/kernels/intelfpga/conv_compute.h         |  55 +++++
 lite/kernels/intelfpga/conv_depthwise.cc      | 128 ++++++++++++
 lite/kernels/intelfpga/conv_depthwise.h       |  67 ++++++
 lite/kernels/intelfpga/conv_gemmlike.cc       | 185 +++++++++++++++++
 lite/kernels/intelfpga/conv_gemmlike.h        | 112 ++++++++++
 23 files changed, 1447 insertions(+), 8 deletions(-)
 create mode 100644 docs/demo_guides/intel_fpga.md
 create mode 100644 lite/backends/intelfpga/CMakeLists.txt
 create mode 100644 lite/backends/intelfpga/lldrv/intelfpgadrv.cpp
 create mode 100644 lite/backends/intelfpga/lldrv/intelfpgadrv.h
 create mode 100644 lite/backends/intelfpga/lldrv/utils.cpp
 create mode 100644 lite/backends/intelfpga/lldrv/utils.h
 create mode 100644 lite/backends/intelfpga/target_wrapper.cpp
 create mode 100644 lite/backends/intelfpga/target_wrapper.h
 create mode 100755 lite/kernels/intelfpga/CMakeLists.txt
 create mode 100644 lite/kernels/intelfpga/conv_compute.cc
 create mode 100644 lite/kernels/intelfpga/conv_compute.h
 create mode 100644 lite/kernels/intelfpga/conv_depthwise.cc
 create mode 100644 lite/kernels/intelfpga/conv_depthwise.h
 create mode 100644 lite/kernels/intelfpga/conv_gemmlike.cc
 create mode 100644 lite/kernels/intelfpga/conv_gemmlike.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c4e12f0f25..9dd5a87d7ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,6 +99,7 @@ lite_option(LITE_WITH_TRAIN     "Enable training operators and kernels in lite"
 lite_option(LITE_WITH_OPENMP    "Enable OpenMP in lite framework" ON)
 lite_option(LITE_WITH_OPENCL    "Enable OpenCL support in lite" OFF)
 lite_option(LITE_WITH_FPGA      "Enable FPGA support in lite" OFF)
+lite_option(LITE_WITH_INTELFPGA      "Enable IntelFPGA support in lite" OFF)
 lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK  "Enable light-weight framework" OFF)
 lite_option(LITE_WITH_PROFILE   "Enable profile mode in lite framework"  OFF)
 lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index a8065c5b0dc..d1467704ac9 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -173,6 +173,10 @@ if (LITE_WITH_FPGA)
 add_definitions("-DLITE_WITH_FPGA")
 endif()
 
+if (LITE_WITH_INTELFPGA)
+add_definitions("-DLITE_WITH_INTELFPGA")
+endif()
+
 if (LITE_WITH_BM)
 add_definitions("-DLITE_WITH_BM")
 endif()
diff --git a/cmake/lite.cmake b/cmake/lite.cmake
index 2be01753cce..b5f115ca973 100644
--- a/cmake/lite.cmake
+++ b/cmake/lite.cmake
@@ -22,7 +22,7 @@ endfunction()
 function (lite_deps TARGET)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS IMAGINATION_NNA_DEPS APU_DEPS CV_DEPS ARGS)
+  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS IMAGINATION_NNA_DEPS APU_DEPS CV_DEPS ARGS)
   cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
   set(deps ${lite_deps_DEPS})
@@ -81,6 +81,12 @@ function (lite_deps TARGET)
       set(deps ${deps} ${var})
     endforeach(var)
   endif()
+  
+  if (LITE_WITH_INTELFPGA)
+    foreach(var ${lite_deps_INTELFPGA_DEPS})
+      set(deps ${deps} ${var})
+    endforeach(var)
+  endif()
 
   if (LITE_WITH_NPU)
     foreach(var ${lite_deps_NPU_DEPS})
@@ -155,7 +161,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 function(lite_cc_library TARGET)
     set(options SHARED shared STATIC static MODULE module)
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
       HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
     cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
@@ -171,6 +177,7 @@ function(lite_cc_library TARGET)
             ARM_DEPS ${args_ARM_DEPS}
             CV_DEPS ${args_CV_DEPS}
             FPGA_DEPS ${args_FPGA_DEPS}
+            INTELFPGA_DEPS ${args_INTELFPGA_DEPS}
             NPU_DEPS ${args_NPU_DEPS}
             APU_DEPS ${args_APU_DEPS}
             XPU_DEPS ${args_XPU_DEPS}
@@ -207,7 +214,7 @@ function(lite_cc_binary TARGET)
         set(options " -g ")
     endif()
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
       LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
     cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
@@ -219,6 +226,7 @@ function(lite_cc_binary TARGET)
             CL_DEPS ${args_CL_DEPS}
             ARM_DEPS ${args_ARM_DEPS}
             FPGA_DEPS ${args_FPGA_DEPS}
+            INTELFPGA_DEPS ${args_INTELFPGA_DEPS}
             NPU_DEPS ${args_NPU_DEPS}
             APU_DEPS ${args_APU_DEPS}
             XPU_DEPS ${args_XPU_DEPS}
@@ -262,7 +270,7 @@ function(lite_cc_test TARGET)
     endif()
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
         LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
         ARGS
         COMPILE_LEVEL # (basic|extra)
@@ -282,6 +290,7 @@ function(lite_cc_test TARGET)
               CL_DEPS ${args_CL_DEPS}
               ARM_DEPS ${args_ARM_DEPS}
               FPGA_DEPS ${args_FPGA_DEPS}
+              INTELFPGA_DEPS ${args_INTELFPGA_DEPS}
               NPU_DEPS ${args_NPU_DEPS}
               APU_DEPS ${args_APU_DEPS}
               XPU_DEPS ${args_XPU_DEPS}
@@ -318,6 +327,7 @@ set(arm_kernels CACHE INTERNAL "arm kernels")
 set(x86_kernels CACHE INTERNAL "x86 kernels")
 set(cuda_kernels CACHE INTERNAL "cuda kernels")
 set(fpga_kernels CACHE INTERNAL "fpga kernels")
+set(intelfpga_kernels CACHE INTERNAL "intelfpga kernels")
 set(npu_kernels CACHE INTERNAL "npu kernels")
 set(apu_kernels CACHE INTERNAL "apu kernels")
 set(xpu_kernels CACHE INTERNAL "xpu kernels")
@@ -346,7 +356,7 @@ endif()
 function(add_kernel TARGET device level)
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
         LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
         ARGS)
     cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -431,6 +441,15 @@ function(add_kernel TARGET device level)
         endif()
         set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "")
     endif()
+    if ("${device}" STREQUAL "INTELFPGA")
+        if (NOT LITE_WITH_INTELFPGA)
+            foreach(src ${args_SRCS})
+                file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+            endforeach()
+            return()
+        endif()
+        set(intelfpga_kernels "${intelfpga_kernels};${TARGET}" CACHE INTERNAL "")
+    endif()
     if ("${device}" STREQUAL "BM")
         if (NOT LITE_WITH_BM)
             foreach(src ${args_SRCS})
@@ -514,6 +533,7 @@ function(add_kernel TARGET device level)
               CL_DEPS ${args_CL_DEPS}
               ARM_DEPS ${args_ARM_DEPS}
               FPGA_DEPS ${args_FPGA_DEPS}
+              INTELFPGA_DEPS ${args_INTELFPGA_DEPS}
               NPU_DEPS ${args_NPU_DEPS}
               APU_DEPS ${args_APU_DEPS}
               XPU_DEPS ${args_XPU_DEPS}
@@ -540,7 +560,7 @@ endif()
 function(add_operator TARGET level)
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
         LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
         ARGS)
     cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -572,6 +592,7 @@ function(add_operator TARGET level)
               CL_DEPS ${args_CL_DEPS}
               ARM_DEPS ${args_ARM_DEPS}
               FPGA_DEPS ${args_FPGA_DEPS}
+              INTELFPGA_DEPS ${args_INTELFPGA_DEPS}
               NPU_DEPS ${args_NPU_DEPS}
               APU_DEPS ${args_APU_DEPS}
               XPU_DEPS ${args_XPU_DEPS}
diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md
new file mode 100644
index 00000000000..b875ff80e5b
--- /dev/null
+++ b/docs/demo_guides/intel_fpga.md
@@ -0,0 +1,107 @@
+# PaddleLite使用IntelFPGA预测部署
+
+Paddle Lite支持基于arm的IntelFPGA c5的模型预测，提供armv7hf的交叉编译
+
+PaddleLite通过调用底层驱动实现对FPGA硬件的调度，以及对应的API接口。
+
+## Lite实现IntelFPGA简介
+
+Lite支持IntelFPGA作为后端硬件进行模型推理，其主要特性如下：
+
+- Lite中IntelFPGA的kernel均以FP32、NCHW的格式作为输入输出格式
+
+- 对于IntelFPGA暂不支持的kernel，均会切回ARM端运行，实现ARM+FPGA混合部署运行
+
+## 支持芯片
+- [Cyclone V](https://www.intel.cn/content/dam/altera-www/global/en_US/pdfs/literature/hb/cyclone-v/cv_51002.pdf)
+
+### 已支持（或部分支持）的Paddle算子
+
+- relu/relu6/leakyrelu
+- conv2d
+- depthwise_conv2d
+
+### 已支持的Paddle模型
+
+- [SSD_MobileNet_V1](https://paddlemodels.bj.bcebos.com/object_detection/ssd_mobilenet_v1_coco_pretrained.tar)
+
+## 编译
+
+需要提前准备带有IntelFPGAdrv.ko的IntelFPGA开发板C5MB/C5TB和Lite代码
+
+CMAKE编译选项：
+
+- 设置`LITE_WITH_INTELFPGA=ON`和`LITE_WITH_ARM=ON`
+
+其他编译选项与ARM编译相同，可以参考[“Paddle Lite在Docker下的ARM编译”](../source_compile/compile_linux)。
+
+示例如下：
+```shell
+    cmake .. \
+        -DWITH_GPU=OFF \
+        -DWITH_MKL=OFF \
+        -DWITH_LITE=ON \
+        -DLITE_WITH_CUDA=OFF \
+        -DLITE_WITH_X86=OFF \
+        -DLITE_WITH_ARM=ON \
+        -DLITE_WITH_OPENMP=ON   \
+        -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
+        -DWITH_TESTING=OFF \
+        -DLITE_WITH_INTELFPGA=ON \
+        -DARM_TARGET_OS=armlinux 
+    make publish_inference -j2
+```
+Lite提供IntelFPGA编译脚本`lite/tools/build_intel_fpga.sh`，在Lite根目录执行`./lite/tools/build_intel_fpga.sh full_publish`即可编译
+
+## 运行示例
+
+- **运行文件准备**
+
+下面以SSD模型为例，介绍如何使用C5MB/C5TB开发板实现模型运行
+
+```bash
+#打开串口调试工具，如Putty或SecureCRT，选择对应的调试串口，并设置串口属性，
+#波特率：115200，数据位：8，停止位：1，奇偶校验：无[主机上执行]
+#上电C5MB开发板，并在串口调试工具中登录
+awcloud login: root
+Password: #密码：Awcloud@123
+#进入/opt目录[开发板执行]
+cd /opt
+#在运行模型前需要加载FPGA驱动[开发板执行]
+insmod driver/IntelFPGAdrv.ko
+```
+
+- **使用IntelFPGA进行模型预测**
+
+```bash
+#以下命令均在开发板上运行，在开发板上已经部署了对应的输入图片，模型，驱动程序，执行程序等
+#运行SSD测试程序，输入图片为/opt/images/dog.jpg，输出图片为/opt/dog_result.jpg
+./run_ssd.sh
+```
+
+## 如何在Code中使用
+
+在Lite中使用IntelFPGA与ARM相似，具体的区别如下：
+
+- 由于IntelFPGA运行模式为FP32精度、NCHW布局，所以需要修改相应的`valid_place`
+
+代码示例：
+```cpp
+lite::Predictor predictor;
+std::vector<Place> valid_places(
+      {Place{TARGET(kIntelFPGA), PRECISION(kFloat), DATALAYOUT(kNCHW)}, Place{TARGET(kARM)}});
+
+predictor.Build(model_dir, "", "", valid_places);
+
+auto* input_tensor = predictor.GetInput(0);
+input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
+auto* data = input_tensor->mutable_data<float>();
+auto item_size = input_tensor->dims().production();
+//假设设置输入数据全为1
+for (int i = 0; i < item_size; i++) {
+  data[i] = 1;
+}
+
+predictor.Run();
+auto* out = predictor.GetOutput(0);
+```
diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt
index ab71bd44b41..73a921b4b20 100644
--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -177,7 +177,10 @@ if(LITE_WITH_FPGA)
     set(light_api_deps ${light_api_deps} ${fpga_deps})
     set(cxx_api_deps ${cxx_api_deps} ${fpga_deps})
 endif()
-
+if(LITE_WITH_INTELFPGA)
+    set(light_api_deps ${light_api_deps} ${intelfpga_deps})
+    set(cxx_api_deps ${cxx_api_deps} ${intelfpga_deps})
+endif()
 if(LITE_WITH_BM)
     set(light_api_deps ${light_api_deps} ${bm_deps})
     set(cxx_api_deps ${cxx_api_deps} ${bm_deps})
@@ -209,6 +212,7 @@ list(LENGTH apu_kernels num_apu_kernels)
 list(LENGTH xpu_kernels num_xpu_kernels)
 list(LENGTH rknpu_kernels num_rknpu_kernels)
 list(LENGTH fpga_kernels num_fpga_kernels)
+list(LENGTH intelfpga_kernels num_intelfpga_kernels)
 list(LENGTH bm_kernels num_bm_kernels)
 list(LENGTH mlu_kernels num_mlu_kernels)
 list(LENGTH huawei_ascend_npu_kernels num_huawei_ascend_npu_kernels)
@@ -225,6 +229,7 @@ message(STATUS "Collected ${num_apu_kernels} APU kernels")
 message(STATUS "Collected ${num_xpu_kernels} XPU kernels")
 message(STATUS "Collected ${num_rknpu_kernels} RKNPU kernels")
 message(STATUS "Collected ${num_fpga_kernels} FPGA kernels")
+message(STATUS "Collected ${num_intelfpga_kernels} INTELFPGA kernels")
 message(STATUS "Collected ${num_bm_kernels} BM kernels")
 message(STATUS "Collected ${num_mlu_kernels} MLU kernels")
 message(STATUS "Collected ${num_huawei_ascend_npu_kernels} HUAWEI_ASCEND_NPU kernels")
@@ -249,6 +254,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
                         IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
                         CL_DEPS ${opencl_kernels}
                         FPGA_DEPS ${fpga_kernels}
+                        INTELFPGA_DEPS ${intelfpga_kernels}
                         HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
 endif()
 
@@ -272,6 +278,7 @@ lite_cc_library(light_api SRCS light_api.cc
         RKNPU_DEPS ${rknpu_kernels}
         CL_DEPS ${opencl_kernels}
         FPGA_DEPS ${fpga_kernels}
+        INTELFPGA_DEPS ${intelfpga_kernels}
         BM_DEPS ${bm_kernels}
         IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
         MLU_DEPS ${mlu_kernels}
@@ -296,6 +303,7 @@ if(WITH_TESTING)
            RKNPU_DEPS ${rknpu_kernels}
            CL_DEPS ${opencl_kernels}
            FPGA_DEPS ${fpga_kernels}
+           INTELFPGA_DEPS ${intelfpga_kernels}
            BM_DEPS ${bm_kernels}
            MLU_DEPS ${mlu_kernels}
            IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
@@ -352,7 +360,7 @@ if(WITH_TESTING)
 endif()
 
 if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
-    set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${apu_kernels} ${fpga_kernels})
+    set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${apu_kernels} ${fpga_kernels} ${intelfpga_kernels})
 
     lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc
        DEPS ${lite_model_test_DEPS}
@@ -451,6 +459,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
         APU_DEPS ${apu_kernels}
         CL_DEPS ${opencl_kernels}
         FPGA_DEPS ${fpga_kernels}
+        INTELFPGA_DEPS ${intelfpga_kernels}
         IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
         BM_DEPS ${bm_kernels}
         HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
@@ -470,6 +479,7 @@ if(NOT WITH_COVERAGE)
         DEPS light_api program mir_passes paddle_api_light
         CL_DEPS ${opencl_kernels}
         FPGA_DEPS ${fpga_kernels}
+        INTELFPGA_DEPS ${intelfpga_kernels}
         RKNPU_DEPS ${rknpu_kernels}
         BM_DEPS ${bm_kernels}
         ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -480,6 +490,7 @@ if(NOT WITH_COVERAGE)
         X86_DEPS ${x86_kernels}
         XPU_DEPS ${xpu_kernels}
         FPGA_DEPS ${fpga_kernels}
+        INTELFPGA_DEPS ${intelfpga_kernels}
         RKNPU_DEPS ${rknpu_kernels}
         BM_DEPS ${bm_kernels}
         MLU_DEPS ${mlu_kernels}
@@ -524,6 +535,7 @@ if(NOT WITH_COVERAGE)
       CL_DEPS ${opencl_kernels}
       X86_DEPS ${x86_kernels}
       FPGA_DEPS ${fpga_kernels}
+      INTELFPGA_DEPS ${intelfpga_kernels}
       BM_DEPS ${bm_kernels}
       MLU_DEPS ${mlu_kernels}
       IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
@@ -549,6 +561,7 @@ if(NOT IOS)
         RKNPU_DEPS ${rknpu_kernels}
         IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
         FPGA_DEPS ${fpga_kernels}
+        INTELFPGA_DEPS ${intelfpga_kernels}
         X86_DEPS ${x86_kernels}
         CUDA_DEPS ${cuda_kernels}
         HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
@@ -566,6 +579,7 @@ if(NOT IOS)
         RKNPU_DEPS ${rknpu_kernels}
         IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
         FPGA_DEPS ${fpga_kernels}
+        INTELFPGA_DEPS ${intelfpga_kernels}
         X86_DEPS ${x86_kernels}
         CUDA_DEPS ${cuda_kernels}
         HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
@@ -583,6 +597,7 @@ if(NOT IOS)
         RKNPU_DEPS ${rknpu_kernels}
         IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
         FPGA_DEPS ${fpga_kernels}
+        INTELFPGA_DEPS ${intelfpga_kernels}
         X86_DEPS ${x86_kernels}
         CUDA_DEPS ${cuda_kernels}
         HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
@@ -599,6 +614,7 @@ if(NOT IOS)
         APU_DEPS ${apu_kernels}
         CL_DEPS ${opencl_kernels}
         FPGA_DEPS ${fpga_kernels}
+        INTELFPGA_DEPS ${intelfpga_kernels}
         X86_DEPS ${x86_kernels}
         CUDA_DEPS ${cuda_kernels}
         HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
@@ -617,6 +633,7 @@ if(NOT IOS)
         CL_DEPS ${opencl_kernels}
         BM_DEPS ${bm_kernels}
         FPGA_DEPS ${fpga_kernels}
+        INTELFPGA_DEPS ${intelfpga_kernels}
         X86_DEPS ${x86_kernels}
         CUDA_DEPS ${cuda_kernels})
 
@@ -631,6 +648,7 @@ if(NOT IOS)
         APU_DEPS ${apu_kernels}
         CL_DEPS ${opencl_kernels}
         FPGA_DEPS ${fpga_kernels}
+        INTELFPGA_DEPS ${intelfpga_kernels}
         X86_DEPS ${x86_kernels}
         CUDA_DEPS ${cuda_kernels}
         HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt
index 0ebf133f1c5..848bf47fc3f 100644
--- a/lite/backends/CMakeLists.txt
+++ b/lite/backends/CMakeLists.txt
@@ -12,3 +12,4 @@ add_subdirectory(apu)
 add_subdirectory(rknpu)
 add_subdirectory(huawei_ascend_npu)
 add_subdirectory(imagination_nna)
+add_subdirectory(intelfpga)
diff --git a/lite/backends/intelfpga/CMakeLists.txt b/lite/backends/intelfpga/CMakeLists.txt
new file mode 100644
index 00000000000..1ee8eccae05
--- /dev/null
+++ b/lite/backends/intelfpga/CMakeLists.txt
@@ -0,0 +1,23 @@
+if (NOT LITE_WITH_INTELFPGA)
+    return()
+endif()
+
+# Backend root. Every source path collected below is expressed relative to
+# this directory.
+set(LITE_INTELFPGA_PATH "${PADDLE_SOURCE_DIR}/lite/backends/intelfpga")
+set(LITE_INTELFPGA_LLDRV_PATH "${LITE_INTELFPGA_PATH}/lldrv")
+
+# Gather the low-level driver sources (lldrv/*.cpp) and the backend-level
+# sources (*.cpp) as paths relative to the backend root.
+file(GLOB INTELFPGA_ALL_CPP RELATIVE "${LITE_INTELFPGA_PATH}"
+     "${LITE_INTELFPGA_LLDRV_PATH}/*.cpp"
+     "${LITE_INTELFPGA_PATH}/*.cpp")
+
+# target_wrapper.cpp belongs to intelfpga_target_wrapper (defined below).
+# The *.cpp glob above also matches it, and leaving it in this list would
+# compile its definitions into both libraries.
+list(REMOVE_ITEM INTELFPGA_ALL_CPP target_wrapper.cpp)
+message(STATUS "intelfpga src: ${INTELFPGA_ALL_CPP}")
+
+cc_library(kernel_intelfpga SRCS ${INTELFPGA_ALL_CPP})
+cc_library(intelfpga_target_wrapper SRCS target_wrapper.cpp DEPS kernel_intelfpga)
diff --git a/lite/backends/intelfpga/lldrv/intelfpgadrv.cpp b/lite/backends/intelfpga/lldrv/intelfpgadrv.cpp
new file mode 100644
index 00000000000..55e4bf92f0d
--- /dev/null
+++ b/lite/backends/intelfpga/lldrv/intelfpgadrv.cpp
@@ -0,0 +1,192 @@
+/* Copyright (c) 2020 AWCloud. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <algorithm>
+#include <cstring>
+#include <map>
+#include <utility>
+
+#include "lite/backends/intelfpga/lldrv/intelfpgadrv.h"
+
+namespace paddle {
+namespace lite {
+namespace intelfpga {
+
+/// FD of intelfpga
+static int intelfpga_fd = -1;
+
+/// Memory blocks
+static struct intelfpga_memblk_s mb, ms, mi, mk, mo;
+
+// Opens /dev/intelfpgadrv0 on first use; later calls reuse the descriptor.
+// Returns 0 when the device is open and -1 when open() fails.
+int intelfpga_open() {
+  if (intelfpga_fd >= 0) {
+    return 0;
+  }
+
+  // The mb/ms/mi/mk/mo descriptors are deliberately left untouched here.
+  // They start out zeroed (static storage) and intelfpga_close() clears
+  // them again, whereas wiping them at this point would leak any buffer
+  // that intelfpga_m*() cached before the device was opened.
+  intelfpga_fd = open("/dev/intelfpgadrv0", O_RDWR);
+  return intelfpga_fd < 0 ? -1 : 0;
+}
+
+// Frees one cached buffer and clears its descriptor so that the matching
+// intelfpga_m*() allocator cannot return, or free again, a stale pointer.
+static void intelfpga_release(struct intelfpga_memblk_s* blk) {
+  free(blk->addr);  // free(NULL) is a no-op
+  blk->addr = nullptr;
+  blk->size = 0;
+}
+
+// Releases every cached buffer and closes the device. Does nothing when the
+// device is not open.
+void intelfpga_close() {
+  if (intelfpga_fd < 0) return;
+
+  intelfpga_release(&mb);
+  intelfpga_release(&ms);
+  intelfpga_release(&mi);
+  intelfpga_release(&mk);
+  intelfpga_release(&mo);
+
+  close(intelfpga_fd);
+  intelfpga_fd = -1;
+}
+
+/// General-purpose buffers: thin wrappers over the C heap (malloc/free).
+void* intelfpga_malloc(size_t size) { return malloc(size); }
+
+void intelfpga_free(void* ptr) { free(ptr); }
+
+// Returns the file-level `mb` buffer, reallocating it if smaller than `size`.
+void* intelfpga_mbias(size_t size) {
+  const bool reusable = mb.addr && mb.size >= size;
+  if (!reusable) {
+    // Old contents are dropped, not copied. free(NULL) is a no-op, so the
+    // never-allocated case needs no separate branch.
+    free(mb.addr);
+    mb.addr = malloc(size);  // stays NULL (and is returned) on failure
+    if (mb.addr) {
+      mb.size = size;
+    }
+  }
+  return mb.addr;
+}
+
+// Returns the file-level `ms` buffer, reallocating it when it is missing or
+// smaller than `size` (old contents are dropped). NULL if malloc() fails.
+void* intelfpga_mscale(size_t size) {
+  const bool reusable = ms.addr && ms.size >= size;
+  if (!reusable) {
+    // free(NULL) is a no-op, so the never-allocated case needs no branch.
+    free(ms.addr);
+    ms.addr = malloc(size);
+    if (ms.addr) {
+      ms.size = size;
+    }
+  }
+  return ms.addr;
+}
+
+// Returns the file-level `mi` buffer, reallocating it when it is missing or
+// smaller than `size` (old contents are dropped). NULL if malloc() fails.
+void* intelfpga_minput(size_t size) {
+  const bool reusable = mi.addr && mi.size >= size;
+  if (!reusable) {
+    // free(NULL) is a no-op, so the never-allocated case needs no branch.
+    free(mi.addr);
+    mi.addr = malloc(size);
+    if (mi.addr) {
+      mi.size = size;
+    }
+  }
+  return mi.addr;
+}
+
+// Returns the file-level `mk` buffer, reallocating it when it is missing or
+// smaller than `size` (old contents are dropped). NULL if malloc() fails.
+void* intelfpga_mkernel(size_t size) {
+  const bool reusable = mk.addr && mk.size >= size;
+  if (!reusable) {
+    // free(NULL) is a no-op, so the never-allocated case needs no branch.
+    free(mk.addr);
+    mk.addr = malloc(size);
+    if (mk.addr) {
+      mk.size = size;
+    }
+  }
+  return mk.addr;
+}
+
+// Returns the file-level `mo` buffer, reallocating it when it is missing or
+// smaller than `size` (old contents are dropped). NULL if malloc() fails.
+void* intelfpga_moutput(size_t size) {
+  const bool reusable = mo.addr && mo.size >= size;
+  if (!reusable) {
+    // free(NULL) is a no-op, so the never-allocated case needs no branch.
+    free(mo.addr);
+    mo.addr = malloc(size);
+    if (mo.addr) {
+      mo.size = size;
+    }
+  }
+  return mo.addr;
+}
+// Copies `size` bytes from `src` to `dst` with memcpy (no overlap allowed).
+void intelfpga_copy(void* dst, void* src, int size) { memcpy(dst, src, size); }
+
+// Issues the INTELFPGA_CMD_INFO ioctl with `args`, opening the device first
+// if needed. Returns -1 when the device cannot be opened, otherwise the
+// value returned by ioctl().
+int intelfpga_info(struct intelfpga_info_s* args) {
+  if (intelfpga_open() != 0) return -1;
+  return ioctl(intelfpga_fd, INTELFPGA_IOCTL_MAKE(INTELFPGA_CMD_INFO), args);
+}
+
+// Issues the INTELFPGA_CMD_CONV ioctl with `args`, opening the device first
+// if needed. Returns -1 when the device cannot be opened, otherwise the
+// value returned by ioctl().
+int intelfpga_conv(struct intelfpga_conv_s* args) {
+  if (intelfpga_open() != 0) return -1;
+  return ioctl(intelfpga_fd, INTELFPGA_IOCTL_MAKE(INTELFPGA_CMD_CONV), args);
+}
+
+// Issues the INTELFPGA_CMD_POOL ioctl with `args`, opening the device first
+// if needed. Returns -1 when the device cannot be opened, otherwise the
+// value returned by ioctl().
+int intelfpga_pooling(struct intelfpga_pool_s* args) {
+  if (intelfpga_open() != 0) return -1;
+  return ioctl(intelfpga_fd, INTELFPGA_IOCTL_MAKE(INTELFPGA_CMD_POOL), args);
+}
+
+// Issues the INTELFPGA_CMD_FCON ioctl with `args`, opening the device first
+// if needed. Returns -1 when the device cannot be opened, otherwise the
+// value returned by ioctl().
+int intelfpga_fullconnect(struct intelfpga_fcon_s* args) {
+  if (intelfpga_open() != 0) return -1;
+  return ioctl(intelfpga_fd, INTELFPGA_IOCTL_MAKE(INTELFPGA_CMD_FCON), args);
+}
+
+}  // namespace intelfpga
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/intelfpga/lldrv/intelfpgadrv.h b/lite/backends/intelfpga/lldrv/intelfpgadrv.h
new file mode 100644
index 00000000000..f35c343e030
--- /dev/null
+++ b/lite/backends/intelfpga/lldrv/intelfpgadrv.h
@@ -0,0 +1,186 @@
+/* Copyright (c) 2020 AWCloud. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _LLDRV_INTELFPGA_H_
+#define _LLDRV_INTELFPGA_H_
+
+#pragma once
+
+#include <stdint.h>
+#include <cstddef>
+#include <iostream>
+#include <limits>
+
+namespace paddle {
+namespace lite {
+namespace intelfpga {
+
+// Activation type
+enum intelfpga_act_e {
+  ACT_NONE = 0,
+  ACT_RELU = 1,
+};
+
+// Device information
+struct intelfpga_info_s {
+  uint32_t ver;  // Version, 00.00.0000
+};
+
+struct intelfpga_reset_s {
+  uint32_t val;  // reset command, N/A
+};
+
+// Memory copy
+struct intelfpga_mcopy_s {
+  void* src;    // source address
+  void* dst;    // destination address
+  size_t size;  // size in bytes
+};
+
+// Memory block
+struct intelfpga_memblk_s {
+  void* addr;   // base address
+  size_t size;  // size in bytes
+};
+
+// Kernel
+struct intelfpga_kernel_s {
+  uint32_t kw;  // width
+  uint32_t kh;  // height
+  uint32_t ws;  // width stride(s)
+  uint32_t hs;  // height stride(s)
+};
+
+// Input parameters, nchw
+struct intelfpga_input_s {
+  uint32_t in;  // nbr of batch {1}
+  uint32_t ic;  // nbr of channels {1}
+  uint32_t iw;  // width
+  uint32_t ih;  // height
+  uint32_t pl;  // padding x in bytes {0}
+  uint32_t pr;  // padding x in bytes {0}
+  uint32_t pt;  // padding y in bytes {0}
+  uint32_t pb;  // padding y in bytes {0}
+  uint32_t dx;  // dilation for x {1}
+  uint32_t dy;  // dilation for y {1}
+};
+
+// Output parameters, nchw
+struct intelfpga_output_s {
+  uint32_t on;  // nbr of batch {1}
+  uint32_t oc;  // nbr of channels {1}
+  uint32_t ow;  // width
+  uint32_t oh;  // height
+};
+
+// Basic convolution
+struct intelfpga_conv_s {
+  uint32_t at;                  // activation type {0}, None=0, RELU=1
+  uint32_t ng;                  // nbr of groups {1}
+  int8_t* ia;                   // input address, INT8[N,Ci,Hi,Wi]
+  int8_t* ka;                   // kernel address, INT8[Co,Ci,Hk,Wk]
+  int32_t* ba;                  // bias address, INT32[Co,1]
+  int32_t* oa;                  // output address, INT32[N,Co,Ho,Wo]
+  struct intelfpga_input_s i;   // input
+  struct intelfpga_kernel_s k;  // kernel
+  struct intelfpga_output_s o;  // output
+};
+
+// Pooling convolution
+struct intelfpga_pool_s {
+  uint32_t gp : 1;         // global pooling {0}
+  uint32_t pm : 1;         // pooling mode {0}, Max=0, AVG=1
+  uint32_t cm : 1;         // ceil mode {0}, ceil=0, floor=1
+  uint32_t ex : 1;         // exclusive {1}, if ignore padding in avg pooling
+  uint32_t reserved : 28;  // reserved {0}
+  int32_t* ia;             // input address, INT32[N,Ci,Hi,Wi]
+  int32_t* oa;             // output address, INT32[N,Ci,Ho,Wo]
+  struct intelfpga_input_s i;   // input
+  struct intelfpga_kernel_s k;  // kernel
+  struct intelfpga_output_s o;  // output
+};
+
+// Full connection
+struct intelfpga_fcon_s {
+  uint32_t at;  // activation type {0}, None=0, RELU=1
+  int8_t* ia;   // input address, INT8[M,K]
+  int8_t* ka;   // kernel address, INT8[K,N]
+  int32_t* ba;  // bias address, INT32[M,N]
+  int32_t* oa;  // output address, INT32[M,N] = ia[M,K] * ka[K,N] + ba[M,N]
+  int m, n, k;  // dims
+};
+
+// Register access
+struct intelfpga_creg_s {
+  uint32_t addr;
+  uint32_t data;
+};
+
+#define INTELFPGA_MAGIC_ID (('A' + 'L' + 'T' + 'R') / 4)
+
+/* Ioctls */
+#define INTELFPGA_IOCTL_MAKE(cmd) (_IO(INTELFPGA_MAGIC_ID, cmd))
+#define INTELFPGA_IOCTL_GET(cmd) (_IOC_NR(cmd))
+#define INTELFPGA_IOCTL_VALID(cmd) \
+  ((_IOC_TYPE(cmd) == INTELFPGA_MAGIC_ID) ? 1 : 0)
+
+#define INTELFPGA_CMD_INFO 0x00   // struct intelfpga_info_s
+#define INTELFPGA_CMD_RESET 0x01  // struct intelfpga_reset_s
+
+#define INTELFPGA_CMD_MCOPY 0x10  // struct intelfpga_mcopy_s
+#define INTELFPGA_CMD_INVAL 0x11  // struct intelfpga_cache_s
+#define INTELFPGA_CMD_FLUSH 0x12  // struct intelfpga_cache_s
+
+#define INTELFPGA_CMD_CONV 0x20  // struct intelfpga_conv_s
+#define INTELFPGA_CMD_POOL 0x21  // struct intelfpga_pool_s
+#define INTELFPGA_CMD_FCON 0x22  // struct intelfpga_fcon_s
+
+#define INTELFPGA_CMD_REGRD 0xC0  // struct intelfpga_register_s
+#define INTELFPGA_CMD_REGWR 0xC1  // struct intelfpga_register_s
+
+//---------------------------------------------------------------------------
+
+// device open/close
+int intelfpga_open();
+void intelfpga_close();
+
+void intelfpga_reset(struct intelfpga_reset_s* args);
+
+// memory management
+void* intelfpga_malloc(size_t size);
+void intelfpga_free(void* ptr);
+
+void* intelfpga_mbias(size_t size);
+void* intelfpga_mscale(size_t size);
+void* intelfpga_minput(size_t size);
+void* intelfpga_mkernel(size_t size);
+void* intelfpga_moutput(size_t size);
+
+void intelfpga_copy(void* dst, void* src, int size);
+int intelfpga_flush(void* addr, size_t size);
+int intelfpga_invalidate(void* addr, size_t size);
+
+// device information
+int intelfpga_info(struct intelfpga_info_s* args);
+
+// convolution process
+int intelfpga_conv(struct intelfpga_conv_s* args);
+int intelfpga_pooling(struct intelfpga_pool_s* args);
+int intelfpga_fullconnect(struct intelfpga_fcon_s* args);
+
+}  // namespace intelfpga
+}  // namespace lite
+}  // namespace paddle
+
+#endif  // _LLDRV_INTELFPGA_H_
diff --git a/lite/backends/intelfpga/lldrv/utils.cpp b/lite/backends/intelfpga/lldrv/utils.cpp
new file mode 100644
index 00000000000..0ad6fb9836d
--- /dev/null
+++ b/lite/backends/intelfpga/lldrv/utils.cpp
@@ -0,0 +1,72 @@
+/* Copyright (c) 2020 AWCloud. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory.h>
+#include <algorithm>
+#include <fstream>
+#include <string>
+
+#include "lite/backends/intelfpga/lldrv/utils.h"
+
+namespace paddle {
+namespace lite {
+namespace intelfpga {
+
+float find_max(const float* data, int size) {
+  float max = 0.0f;
+  // Largest absolute value in data[0..size); 0 when size <= 0. The index is
+  // an int so a negative size skips the loop instead of wrapping to size_t.
+  for (int i = 0; i < size; ++i) {
+    float value = data[i];
+    float magnitude = value > 0.0 ? value : -value;
+    max = std::max(max, magnitude);
+  }
+
+  return max;
+}
+
+void quantize_s8(const float* src, int8_t* dst, int size, float factor) {
+  // dst[i] = src[i] * factor, rounded half away from zero and saturated to
+  // [-128, 127]; a negative size is a no-op (int index, no size_t wrap).
+  for (int i = 0; i < size; i++) {
+    float fdata = src[i] * factor;
+    if (fdata < 0.0) {
+      fdata -= 0.5;
+    } else {
+      fdata += 0.5;
+    }
+    // Clamp first: converting an out-of-range float to int8_t is undefined.
+    fdata = std::min(127.0f, std::max(-128.0f, fdata));
+    dst[i] = static_cast<int8_t>(fdata);
+  }
+}
+
+void quantize_s32(const float* src, int32_t* dst, int size, float factor) {
+  // dst[i] = src[i] * factor, rounded half away from zero and saturated to
+  // the int32 range; a negative size is a no-op (int index, no size_t wrap).
+  for (int i = 0; i < size; i++) {
+    float fdata = src[i] * factor;
+    if (fdata < 0.0) {
+      fdata -= 0.5;
+    } else {
+      fdata += 0.5;
+    }
+    // 2147483520 is the largest float below 2^31; larger values would be UB.
+    fdata = std::min(2147483520.0f, std::max(-2147483648.0f, fdata));
+    dst[i] = static_cast<int32_t>(fdata);
+  }
+}
+}  // namespace intelfpga
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/intelfpga/lldrv/utils.h b/lite/backends/intelfpga/lldrv/utils.h
new file mode 100644
index 00000000000..d3883cc3e07
--- /dev/null
+++ b/lite/backends/intelfpga/lldrv/utils.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2020 AWCloud. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdint>
+#include <cstdlib>
+#include <cwchar>
+#include <vector>
+
+namespace paddle {
+namespace lite {
+namespace intelfpga {
+
+float find_max(const float* data, int size);
+
+void quantize_s8(const float* src, int8_t* dst, int size, float factor);
+void quantize_s32(const float* src, int32_t* dst, int size, float factor);
+
+}  // namespace intelfpga
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/intelfpga/target_wrapper.cpp b/lite/backends/intelfpga/target_wrapper.cpp
new file mode 100644
index 00000000000..c2de3ff6bfb
--- /dev/null
+++ b/lite/backends/intelfpga/target_wrapper.cpp
@@ -0,0 +1,38 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/backends/intelfpga/target_wrapper.h"
+#include "lite/backends/intelfpga/lldrv/intelfpgadrv.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+
+void* TargetWrapper<TARGET(kIntelFPGA)>::Malloc(size_t size) {
+  return intelfpga::intelfpga_malloc(size);
+}
+
+void TargetWrapper<TARGET(kIntelFPGA)>::Free(void* ptr) {
+  intelfpga::intelfpga_free(ptr);
+}
+
+void TargetWrapper<TARGET(kIntelFPGA)>::MemcpySync(void* dst,
+                                                   const void* src,
+                                                   size_t size,
+                                                   IoDirection dir) {
+  memcpy(dst, src, size);  // dir is unused: every direction is a plain memcpy
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/intelfpga/target_wrapper.h b/lite/backends/intelfpga/target_wrapper.h
new file mode 100644
index 00000000000..ee60348f10f
--- /dev/null
+++ b/lite/backends/intelfpga/target_wrapper.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include "lite/core/target_wrapper.h"
+
+namespace paddle {
+namespace lite {
+
+template <>
+class TargetWrapper<TARGET(kIntelFPGA)> {
+ public:
+  using stream_t = int;
+  using event_t = int;
+
+  static size_t num_devices() { return 0; }
+  static size_t maximum_stream() { return 0; }
+
+  static void CreateStream(stream_t* stream) {}
+  static void DestroyStream(const stream_t& stream) {}
+
+  static void CreateEvent(event_t* event) {}
+  static void DestroyEvent(const event_t& event) {}
+
+  static void RecordEvent(const event_t& event) {}
+  static void SyncEvent(const event_t& event) {}
+
+  static void StreamSync(const stream_t& stream) {}
+
+  static void* Malloc(size_t size);
+  static void Free(void* ptr);
+
+  static void MemcpySync(void* dst,
+                         const void* src,
+                         size_t size,
+                         IoDirection dir);
+  static void MemcpyAsync(void* dst,
+                          const void* src,
+                          size_t size,
+                          IoDirection dir,
+                          const stream_t& stream) {
+    MemcpySync(dst, src, size, dir);
+  }
+};
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt
index 9e03ca693b7..18ed6d7f9a8 100644
--- a/lite/core/CMakeLists.txt
+++ b/lite/core/CMakeLists.txt
@@ -8,6 +8,7 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
   XPU_DEPS target_wrapper_xpu
   CL_DEPS cl_target_wrapper
   FPGA_DEPS fpga_target_wrapper
+  INTELFPGA_DEPS intelfpga_target_wrapper
   BM_DEPS target_wrapper_bm
   MLU_DEPS target_wrapper_mlu)
 
diff --git a/lite/core/context.h b/lite/core/context.h
index dca559f06ae..e8789d16ea7 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -65,6 +65,7 @@ using MLUContext = Context<TargetType::kMLU>;
 using RKNPUContext = Context<TargetType::kRKNPU>;
 using HuaweiAscendNPUContext = Context<TargetType::kHuaweiAscendNPU>;
 using ImaginationNNAContext = Context<TargetType::kImaginationNNA>;
+using IntelFPGAContext = Context<TargetType::kIntelFPGA>;
 
 template <>
 class Context<TargetType::kHost> {
@@ -327,6 +328,21 @@ class Context<TargetType::kFPGA> {
 };
 #endif
 
+#ifdef LITE_WITH_INTELFPGA
+// TODO(xbeu): add needed implementation to context
+template <>
+class Context<TargetType::kIntelFPGA> {
+ public:
+  void InitOnce() {}
+  // No state to copy yet, but a non-void operator= must still return *this.
+  IntelFPGAContext& operator=(const IntelFPGAContext& ctx) { return *this; }
+
+  void CopySharedTo(IntelFPGAContext* ctx) {}
+
+  std::string name() const { return "IntelFPGAContext"; }
+};
+#endif
+
 #ifdef LITE_WITH_MLU
 template <>
 class Context<TargetType::kMLU> {
@@ -547,6 +563,13 @@ class ContextScheduler {
             &ctx->As<FPGAContext>());
         break;
 #endif
+#ifdef LITE_WITH_INTELFPGA
+      case TARGET(kIntelFPGA):
+        kernel_contexts_[TargetType::kIntelFPGA]
+            .As<IntelFPGAContext>()
+            .CopySharedTo(&ctx->As<IntelFPGAContext>());
+        break;
+#endif
 #ifdef LITE_WITH_BM
       case TARGET(kBM):
         kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo(
@@ -602,6 +625,9 @@ class ContextScheduler {
 #ifdef LITE_WITH_FPGA
     InitContext<TargetType::kFPGA, FPGAContext>();
 #endif
+#ifdef LITE_WITH_INTELFPGA
+    InitContext<TargetType::kIntelFPGA, IntelFPGAContext>();
+#endif
 #ifdef LITE_WITH_NPU
     InitContext<TargetType::kNPU, NPUContext>();
 #endif
diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt
index 79cce9a0243..52649cdc520 100644
--- a/lite/kernels/CMakeLists.txt
+++ b/lite/kernels/CMakeLists.txt
@@ -16,3 +16,4 @@ add_subdirectory(bm)
 add_subdirectory(rknpu)
 add_subdirectory(huawei_ascend_npu)
 add_subdirectory(imagination_nna)
+add_subdirectory(intelfpga)
diff --git a/lite/kernels/intelfpga/CMakeLists.txt b/lite/kernels/intelfpga/CMakeLists.txt
new file mode 100755
index 00000000000..4f2fbe6d5d2
--- /dev/null
+++ b/lite/kernels/intelfpga/CMakeLists.txt
@@ -0,0 +1,9 @@
+if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_INTELFPGA))
+    return()
+endif()
+
+set(intelfpga_deps intelfpga_target_wrapper kernel_intelfpga)
+
+add_kernel(conv_depthwise_intelfpga INTELFPGA basic SRCS conv_depthwise.cc DEPS ${intelfpga_deps})
+add_kernel(conv_gemmlike_intelfpga INTELFPGA basic SRCS conv_gemmlike.cc DEPS ${intelfpga_deps})
+add_kernel(conv_compute_intelfpga INTELFPGA basic SRCS conv_compute.cc DEPS ${intelfpga_deps} conv_depthwise_intelfpga conv_gemmlike_intelfpga)
diff --git a/lite/kernels/intelfpga/conv_compute.cc b/lite/kernels/intelfpga/conv_compute.cc
new file mode 100644
index 00000000000..e0c75367bd2
--- /dev/null
+++ b/lite/kernels/intelfpga/conv_compute.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/intelfpga/conv_compute.h"
+#include <utility>
+#include "lite/core/op_registry.h"
+#include "lite/core/type_system.h"
+#include "lite/kernels/intelfpga/conv_depthwise.h"
+#include "lite/kernels/intelfpga/conv_gemmlike.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace intelfpga {
+#define PARAM_INIT                                                           \
+  auto& param = this->Param<param_t>();                                      \
+  auto w_dims = param.filter->dims();                                        \
+  auto paddings = *param.paddings;                                           \
+  auto dilations = *param.dilations;                                         \
+  int ic = w_dims[1] * param.groups;                                         \
+  int oc = w_dims[0];                                                        \
+  int kh = w_dims[2];                                                        \
+  int kw = w_dims[3];                                                        \
+  int pad_h = paddings[0];                                                   \
+  int pad_w = paddings[2];                                                   \
+  int stride = param.strides[0];                                             \
+  int sh = param.strides[1];                                                 \
+  int sw = param.strides[0];                                                 \
+  int chin = param.x->dims()[1];                                             \
+  int hin = param.x->dims()[2];                                              \
+  int win = param.x->dims()[3];                                              \
+  int chout = param.output->dims()[1];                                       \
+  int hout = param.output->dims()[2];                                        \
+  int wout = param.output->dims()[3];                                        \
+  bool pads_equal =                                                          \
+      ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));        \
+  bool pads_all_equal = (pads_equal && pad_h == pad_w);                      \
+  bool ks_equal = (sw == sh) && (kw == kh);                                  \
+  bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1);             \
+  bool kps_equal = (pad_h == pad_w) && ks_equal;                             \
+  bool flag_dw_3x3 = (kw == 3) && (kh == 3) && (stride == 1 || stride == 2); \
+  bool flag_dw_5x5 = (kw == 5) && (kh == 5) && (stride == 1 || stride == 2); \
+  bool flag_dw = flag_dw_3x3 || flag_dw_5x5;
+
+template <>
+void ConvCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
+  PARAM_INIT
+  /// select conv impl
+  if (param.groups == ic && ic == oc && ks_equal && no_dilation && flag_dw) {
+    impl_ = new DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>;
+    // VLOG(3) << "invoking dw conv";
+  } else {
+    impl_ = new GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>;
+    // VLOG(3) << "invoking gemm like conv";
+  }
+  if (!arm_cxt_) {
+    arm_cxt_ = ContextScheduler::Global().NewContext(TargetType::kARM);
+  }
+  impl_->SetContext(std::move(arm_cxt_));
+  impl_->SetParam(param);
+  impl_->PrepareForRun();
+  is_first_epoch_ = false;
+}
+
+}  // namespace intelfpga
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+typedef paddle::lite::kernels::intelfpga::ConvCompute<PRECISION(kFloat),
+                                                      PRECISION(kFloat)>
+    ConvFp32;
+
+REGISTER_LITE_KERNEL(conv2d, kIntelFPGA, kFloat, kNCHW, ConvFp32, def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindPaddleOpVersion("conv2d", 1)
+    .Finalize();
+
+REGISTER_LITE_KERNEL(depthwise_conv2d, kIntelFPGA, kFloat, kNCHW, ConvFp32, def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindPaddleOpVersion("depthwise_conv2d", 1)
+    .Finalize();
diff --git a/lite/kernels/intelfpga/conv_compute.h b/lite/kernels/intelfpga/conv_compute.h
new file mode 100644
index 00000000000..a9fd135e431
--- /dev/null
+++ b/lite/kernels/intelfpga/conv_compute.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include "lite/backends/arm/math/funcs.h"
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace intelfpga {
+
+template <PrecisionType Ptype, PrecisionType OutType>
+class ConvCompute : public KernelLite<TARGET(kIntelFPGA), Ptype> {
+ public:
+  virtual void PrepareForRun();
+
+  virtual void ReInitWhenNeeded() {
+    CHECK(impl_);
+    impl_->ReInitWhenNeeded();
+  }
+
+  virtual void Run() {
+    CHECK(impl_);
+    impl_->Run();
+  }
+
+  ~ConvCompute() {
+    if (impl_ != nullptr) {
+      delete impl_;
+    }
+  }
+
+ private:
+  using param_t = operators::ConvParam;
+  std::unique_ptr<KernelContext> arm_cxt_{nullptr};
+  KernelLite<TARGET(kARM), Ptype>* impl_{nullptr};
+};
+
+}  // namespace intelfpga
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/intelfpga/conv_depthwise.cc b/lite/kernels/intelfpga/conv_depthwise.cc
new file mode 100644
index 00000000000..80cab07e848
--- /dev/null
+++ b/lite/kernels/intelfpga/conv_depthwise.cc
@@ -0,0 +1,128 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/intelfpga/conv_depthwise.h"
+#include "lite/backends/arm/math/conv_block_utils.h"
+#include "lite/backends/arm/math/conv_impl.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace intelfpga {
+
+template <>
+void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::ReInitWhenNeeded() {}
+
+template <>
+void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
+  auto& param = this->Param<param_t>();
+  CHECK(this->ctx_);
+  auto& ctx = this->ctx_->template As<ARMContext>();
+  auto w_dims = param.filter->dims();
+  auto kw = w_dims[3];
+  auto channel = w_dims[0];
+  auto hin = param.x->dims()[2];
+  auto win = param.x->dims()[3];
+  auto paddings = *param.paddings;
+  // select dw conv kernel
+  if (kw == 3) {
+    bool pads_less = ((paddings[1] < 2) && (paddings[3] < 2));
+    if (pads_less && paddings[0] == paddings[2] &&
+        (paddings[0] == 0 || paddings[0] == 1)) {
+      flag_trans_weights_ = false;
+    } else {
+      // trans weights
+      constexpr int cblock = 4;
+      auto oc = w_dims[0];
+      auto kh = w_dims[2];
+      auto cround = ROUNDUP(oc, cblock);
+      weights_.Resize({cround, 1, kh, kw});
+      auto w_data = weights_.mutable_data<float>();
+      auto w_data_in = param.filter->data<float>();
+      lite::arm::math::conv_trans_weights_numc(
+          w_data_in, w_data, oc, 1, cblock, kh * kw);
+      flag_trans_weights_ = true;
+    }
+    impl_ = lite::arm::math::conv_depthwise_3x3_fp32;
+  } else if (kw == 5) {
+    auto strides = param.strides;
+    if ((strides[0] == 1 && strides[1] == 1) ||
+        (strides[0] == 2 && strides[1] == 2)) {
+      // trans weights
+      constexpr int cblock = 4;
+      auto oc = w_dims[0];
+      auto kh = w_dims[2];
+      auto cround = ROUNDUP(oc, cblock);
+      weights_.Resize({cround, 1, kh, kw});
+      auto w_data = weights_.mutable_data<float>();
+      auto w_data_in = param.filter->data<float>();
+      lite::arm::math::conv_trans_weights_numc(
+          w_data_in, w_data, oc, 1, cblock, kh * kw);
+      flag_trans_weights_ = true;
+      impl_ = lite::arm::math::conv_depthwise_5x5_fp32;
+    } else {
+      LOG(FATAL)
+          << "5x5 depthwise conv only support stride == 1 or stride == 2";
+    }
+  } else {
+    LOG(FATAL) << "this type dw conv not impl";
+  }
+}
+
+template <>
+void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
+  auto& param = this->Param<param_t>();
+  CHECK(this->ctx_);
+  auto& ctx = this->ctx_->template As<ARMContext>();
+  const auto* i_data = param.x->data<float>();
+  const auto* w_data = flag_trans_weights_ ? weights_.data<float>()
+                                           : param.filter->data<float>();
+  const auto* b_data = param.bias ? param.bias->data<float>() : nullptr;
+  if (flag_trans_bias_) {
+    b_data = bias_.data<float>();
+  }
+  auto* o_data = param.output->mutable_data<float>();
+
+  auto x_dims = param.x->dims();
+  auto w_dims = param.filter->dims();
+  auto o_dims = param.output->dims();
+
+  int iw = x_dims[3];  // nchw
+  int ih = x_dims[2];
+  int ic = x_dims[1];
+  int bs = x_dims[0];
+  int oh = o_dims[2];
+  int ow = o_dims[3];
+  int oc = o_dims[1];
+
+  impl_(i_data,
+        o_data,
+        bs,
+        oc,
+        oh,
+        ow,
+        ic,
+        ih,
+        iw,
+        w_data,
+        b_data,
+        param,
+        &ctx,
+        w_scale_.data());
+}
+
+}  // namespace intelfpga
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/intelfpga/conv_depthwise.h b/lite/kernels/intelfpga/conv_depthwise.h
new file mode 100644
index 00000000000..3f9bf657e02
--- /dev/null
+++ b/lite/kernels/intelfpga/conv_depthwise.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cmath>
+#include <string>
+#include <vector>
+#include "lite/backends/arm/math/conv_impl.h"
+#include "lite/core/context.h"
+#include "lite/core/kernel.h"
+#include "lite/core/target_wrapper.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace intelfpga {
+
+template <PrecisionType Ptype, PrecisionType Otype>
+class DepthwiseConv : public KernelLite<TARGET(kARM), Ptype> {
+ public:
+  typedef void (*conv_dw_impl)(const void* din,
+                               void* dout,
+                               int num,
+                               int ch_out,
+                               int h_out,
+                               int w_out,
+                               int ch_in,
+                               int h_in,
+                               int w_in,
+                               const void* weights,
+                               const float* bias,
+                               const operators::ConvParam& param,
+                               ARMContext* ctx,
+                               const float* scale);
+  DepthwiseConv() = default;
+  ~DepthwiseConv() {}
+  virtual void PrepareForRun();
+  virtual void ReInitWhenNeeded();
+  virtual void Run();
+
+ private:
+  using param_t = operators::ConvParam;
+  Tensor weights_;
+  Tensor bias_;
+  DDim last_shape_;
+  bool flag_trans_weights_{false};
+  bool flag_trans_bias_{false};
+  conv_dw_impl impl_{nullptr};
+  std::vector<float> w_scale_;
+};
+
+}  // namespace intelfpga
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/intelfpga/conv_gemmlike.cc b/lite/kernels/intelfpga/conv_gemmlike.cc
new file mode 100644
index 00000000000..2131d2c032f
--- /dev/null
+++ b/lite/kernels/intelfpga/conv_gemmlike.cc
@@ -0,0 +1,185 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/intelfpga/conv_gemmlike.h"
+#include <vector>
+#include "lite/backends/arm/math/gemm_prepacked_int8.h"
+#include "lite/backends/arm/math/packed_sgemm.h"
+#include "lite/backends/intelfpga/lldrv/intelfpgadrv.h"
+#include "lite/backends/intelfpga/lldrv/utils.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace intelfpga {
+
+template <>
+void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
+  ReInitWhenNeeded();
+}
+
+template <>
+void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->template As<ARMContext>();
+  ctx.ExtendWorkspace(workspace_size_);
+  auto weights = param.filter->data<float>();
+  if (flag_trans_weights_) {
+    weights = weights_.data<float>();
+  }
+  const float* b_data = param.bias ? param.bias->data<float>() : nullptr;
+  if (flag_trans_bias_) {
+    b_data = bias_.data<float>();
+  }
+  auto i_data = param.x->data<float>();
+  auto w_data = param.filter->data<float>();
+  auto o_data = param.output->mutable_data<float>();
+  auto i_dims = param.x->dims();
+  auto w_dims = param.filter->dims();
+  auto o_dims = param.output->dims();
+  auto paddings = *param.paddings;
+  auto dilations = *param.dilations;
+
+  int iw, ih, ic, bs, ow, oh, oc;
+  float alpha;
+
+  iw = i_dims[3];  // nchw
+  ih = i_dims[2];
+  ic = i_dims[1];
+  bs = i_dims[0];
+  oh = o_dims[2];
+  ow = o_dims[3];
+  oc = o_dims[1];
+
+  int kh = w_dims[2];
+  int kw = w_dims[3];
+  // Kernels with kh > 1 and kw > 1 use the FPGA; the rest use ARM GEMM.
+  if (kh > 1 && kw > 1) {
+    int i, j, il, kl, ol, l, m, n, k;
+    lite::intelfpga::intelfpga_conv_s conv;
+
+    conv.at = static_cast<uint32_t>(param.activation_param.active_type);
+    if (conv.at == 4) {
+      alpha = param.activation_param.Leaky_relu_alpha;
+    }
+    conv.ng = param.groups;
+
+    conv.i.in = i_dims[0];
+    conv.i.ic = i_dims[1];
+    conv.i.ih = i_dims[2];
+    conv.i.iw = i_dims[3];
+    conv.i.pl = paddings[2];  // left
+    conv.i.pr = paddings[3];  // right
+    conv.i.pt = paddings[0];  // top
+    conv.i.pb = paddings[1];  // bottom
+    conv.i.dy = dilations[0];
+    conv.i.dx = dilations[1];
+
+    conv.k.kh = w_dims[2];
+    conv.k.kw = w_dims[3];
+    conv.k.hs = param.strides[0];
+    conv.k.ws = param.strides[1];
+
+    conv.o.on = o_dims[0];
+    conv.o.oc = o_dims[1];
+    conv.o.oh = o_dims[2];
+    conv.o.ow = o_dims[3];
+    // NOTE(review): kl = oc*ic*kh*kw exceeds the filter size when groups > 1.
+    il = conv.i.in * conv.i.ic * conv.i.ih * conv.i.iw;
+    kl = conv.o.oc * conv.i.ic * conv.k.kh * conv.k.kw;
+    ol = conv.o.on * conv.o.oc * conv.o.oh * conv.o.ow;
+    conv.ia = static_cast<int8_t*>(
+        lite::intelfpga::intelfpga_minput(il * sizeof(int8_t)));
+    conv.ka = static_cast<int8_t*>(
+        lite::intelfpga::intelfpga_mkernel(kl * sizeof(int8_t)));
+    conv.oa = static_cast<int32_t*>(
+        lite::intelfpga::intelfpga_moutput(ol * sizeof(int32_t)));
+    if (conv.ia && conv.ka && conv.oa) {
+      float fd = lite::intelfpga::find_max(i_data, il);
+      float fw = lite::intelfpga::find_max(w_data, kl);
+      // scale = 127 / max|x|; an all-zero tensor has max 0, so fall back to a
+      // scale of 1 instead of dividing by zero (inf scale, NaN products).
+      fd = fd > 0.0f ? 127.0 / fd : 1.0;
+      fw = fw > 0.0f ? 127.0 / fw : 1.0;
+
+      // Quantize input and filter to int8: q = round(x * scale).
+      lite::intelfpga::quantize_s8(i_data, conv.ia, il, fd);
+      lite::intelfpga::quantize_s8(w_data, conv.ka, kl, fw);
+
+      // perform conv2d
+      if (lite::intelfpga::intelfpga_conv(&conv)) {
+        std::cout << "intelfpga_conv error" << std::endl;
+      }
+      // Convert int32 back to fp32, [n,c,h,w]
+      // 1. y = x / scale
+      // 2. y = x + b
+      // 3. y = f(x)
+      int hw = conv.o.oh * conv.o.ow;
+      for (i = 0; i < conv.o.on; i++) {
+        for (j = 0; j < conv.o.oc; j++) {
+          m = i * conv.o.oc + j;
+          n = m * hw;
+          for (l = 0; l < hw; l++) {
+            k = n + l;
+            o_data[k] = static_cast<float>(conv.oa[k] / fd / fw);
+            if (b_data) o_data[k] += b_data[j];
+            if (conv.at == 1) {  // relu
+              o_data[k] = o_data[k] > 0.0 ? o_data[k] : 0.0;
+            } else if (conv.at == 2) {  // relu6
+              o_data[k] = o_data[k] > 0.0 ? o_data[k] : 0.0;
+              o_data[k] = o_data[k] > 6.0 ? 6.0 : o_data[k];
+            } else if (conv.at == 4) {  // leakyRelu
+              if (o_data[k] < 0.0) o_data[k] = o_data[k] * alpha;
+            }
+          }
+        }
+      }
+    }
+  } else {
+    if (flag_1x1gemm_) {
+      lite::arm::math::conv1x1s1_gemm(i_data,
+                                      o_data,
+                                      bs,
+                                      oc,
+                                      oh,
+                                      ow,
+                                      ic,
+                                      ih,
+                                      iw,
+                                      weights,
+                                      b_data,
+                                      param,
+                                      &ctx);
+    } else {
+      lite::arm::math::conv_im2col_gemm(i_data,
+                                        o_data,
+                                        bs,
+                                        oc,
+                                        oh,
+                                        ow,
+                                        ic,
+                                        ih,
+                                        iw,
+                                        weights,
+                                        b_data,
+                                        param,
+                                        &ctx);
+    }
+  }
+}
+
+}  // namespace intelfpga
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/intelfpga/conv_gemmlike.h b/lite/kernels/intelfpga/conv_gemmlike.h
new file mode 100644
index 00000000000..812271010c7
--- /dev/null
+++ b/lite/kernels/intelfpga/conv_gemmlike.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cmath>
+#include <string>
+#include <vector>
+#include "lite/backends/arm/math/conv_impl.h"
+#include "lite/backends/arm/math/funcs.h"
+#include "lite/core/context.h"
+#include "lite/core/kernel.h"
+#include "lite/core/target_wrapper.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace intelfpga {
+
+template <PrecisionType Ptype, PrecisionType Otype>
+class GemmLikeConv : public KernelLite<TARGET(kARM), Ptype> {
+ public:
+  GemmLikeConv() = default;
+  ~GemmLikeConv() {}
+  // Re-selects the 1x1 or im2col path when the input shape has changed.
+  virtual void ReInitWhenNeeded() {
+    auto& param = this->template Param<param_t>();
+    CHECK(this->ctx_);
+    auto& ctx = this->ctx_->template As<ARMContext>();
+    auto x_dims = param.x->dims();
+    auto w_dims = param.filter->dims();
+    auto o_dims = param.output->dims();
+    if (last_shape_ == x_dims) {  // same input shape as the previous call
+      return;
+    }
+
+    int iw = x_dims[3];  // nchw
+    int ih = x_dims[2];
+    int ic = x_dims[1];
+    int ow = o_dims[3];
+    int oh = o_dims[2];
+    int oc = o_dims[1];
+    int kw = w_dims[3];
+    int kh = w_dims[2];
+
+    auto paddings = *param.paddings;
+    auto dilations = *param.dilations;
+    // NOTE(review): paddings looks like {h0, h1, w0, w1} - confirm.
+    int sw = param.strides[1];
+    int sh = param.strides[0];
+    int pw = paddings[2];
+    int ph = paddings[0];
+    int dw = dilations[1];  // dw/dh are not read again in this function
+    int dh = dilations[0];
+
+    bool pads_equal =
+        ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
+    // m/k/n: per-group output channels, kernel elements and output pixels.
+    int m = oc / param.groups;
+    int k = ic * kh * kw / param.groups;
+    int n = oh * ow;
+
+    bool kps_equal = (pw == ph) && (sw == sh) && (kw == kh);
+    bool ks_equal = (sw == sh) && (kw == kh);  // unused below
+    //! select conv gemmlike kernel
+    if (kw == 1 && sw == 1 && pw == 0 && kps_equal && pads_equal) {
+      //! 1x1s1p0 gemmlike conv
+      flag_1x1gemm_ = true;
+    } else {
+      //! im2col gemmlike conv
+      flag_1x1gemm_ = false;
+      workspace_size_ = k * n * sizeof(float);  // bytes for k * n floats
+    }
+    if (!flag_trans_weights_ && n > 1 && m > 1) {  // skipped once flag is set
+      lite::arm::math::trans_gemm_weights<Ptype>(
+          *(param.filter), weights_, param.groups, &ctx);
+      flag_trans_weights_ = true;
+    } else if (n == 1 || m == 1) {
+      flag_trans_weights_ = false;
+    }
+    last_shape_ = x_dims;  // remembered for the early return above
+  }
+  virtual void PrepareForRun();
+  virtual void Run();
+
+ protected:
+  using param_t = operators::ConvParam;
+  DDim last_shape_;  // input dims stored by ReInitWhenNeeded()
+  std::vector<float> w_scale_;
+  bool flag_1x1gemm_{true};  // true: 1x1s1p0 path, false: im2col path
+  bool flag_trans_weights_{false};
+  bool flag_trans_bias_{false};
+  Tensor weights_;
+  Tensor bias_;
+  int workspace_size_{0};
+};
+
+}  // namespace intelfpga
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle

From 99756c67660e5dbfefe3ac8f44999e6ea83a4e99 Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Wed, 17 Mar 2021 08:04:49 +0000
Subject: [PATCH 02/19] test=develop

---
 lite/tools/build_intel_fpga.sh | 324 +++++++++++++++++++++++++++++++++
 1 file changed, 324 insertions(+)
 create mode 100755 lite/tools/build_intel_fpga.sh

diff --git a/lite/tools/build_intel_fpga.sh b/lite/tools/build_intel_fpga.sh
new file mode 100755
index 00000000000..ef647df315c
--- /dev/null
+++ b/lite/tools/build_intel_fpga.sh
@@ -0,0 +1,324 @@
+#!/bin/bash
+set -e
+
+#####################################################################################################
+# 1. global variables, you can change them according to your requirements
+#####################################################################################################
+# armv7hf.
+ARCH=armv7hf
+# gcc or clang, default gcc.
+TOOLCHAIN=gcc
+# ON or OFF, default ON.
+WITH_EXTRA=ON
+# controls whether to compile python lib, default is OFF.
+WITH_PYTHON=OFF
+PY_VERSION=""
+# controls whether to compile cv functions into lib, default is OFF.
+WITH_CV=OFF
+# controls whether to print log information, default is OFF.
+WITH_LOG=OFF
+# controls whether to throw the exception when error occurs, default is OFF 
+WITH_EXCEPTION=OFF
+# options of striping lib according to input model.
+WITH_STRIP=OFF
+OPTMODEL_DIR=""
+# options of compiling OPENCL lib.
+WITH_OPENCL=OFF
+# options of compiling intel fpga.
+WITH_INTELFPGA=ON
+# options of adding training ops
+WITH_TRAIN=OFF
+# num of threads used during compiling..
+readonly NUM_PROC=${LITE_BUILD_THREADS:-4}
+#####################################################################################################
+
+#####################################################################################################
+# 2. local variables, these variables should not be changed.
+#####################################################################################################
+# url of the third-party tarball, used to accelerate third-party lib installation
+readonly THIRDPARTY_TAR=https://paddlelite-data.bj.bcebos.com/third_party_libs/third-party-ea5576.tar.gz
+# absolute path of Paddle-Lite, resolved from the script location so absolute invocations work too.
+readonly workspace=$(cd "$(dirname "$0")/../.." && pwd)
+# basic options for linux compiling.
+readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \
+                            -DLITE_WITH_ARM=ON \
+                            -DLITE_WITH_X86=OFF \
+                            -DARM_TARGET_OS=armlinux \
+                            -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
+                            -DWITH_TESTING=OFF"
+# mutable options for linux compiling; rebuilds $cmake_mutable_options from the current global settings.
+function init_cmake_mutable_options {
+    cmake_mutable_options="-DARM_TARGET_ARCH_ABI=$ARCH \
+                        -DARM_TARGET_LANG=$TOOLCHAIN \
+                        -DLITE_BUILD_EXTRA=$WITH_EXTRA \
+                        -DLITE_WITH_PYTHON=$WITH_PYTHON \
+                        -DPY_VERSION=$PY_VERSION \
+                        -DLITE_WITH_CV=$WITH_CV \
+                        -DLITE_WITH_LOG=$WITH_LOG \
+                        -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \
+                        -DLITE_BUILD_TAILOR=$WITH_STRIP \
+                        -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \
+                        -DLITE_WITH_OPENCL=$WITH_OPENCL \
+                        -DLITE_WITH_INTELFPGA=$WITH_INTELFPGA \
+                        -DLITE_WITH_TRAIN=$WITH_TRAIN"
+}
+#####################################################################################################
+
+####################################################################################################
+# 3. functions of prepare workspace before compiling
+####################################################################################################
+
+# 3.1 generate `__generated_code__.cc`, which some cmake targets depend on.
+# here we fake an empty file to make cmake work.
+function prepare_workspace {
+    local root_dir=$1
+    local build_dir=$2
+    # everything below is created inside $build_dir
+    # 1. Prepare gen_code file
+    GEN_CODE_PATH_PREFIX=$build_dir/lite/gen_code
+    mkdir -p ${GEN_CODE_PATH_PREFIX}
+    touch ${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
+    # 2. Prepare debug tool: copy analysis_tool.py from the source tree
+    DEBUG_TOOL_PATH_PREFIX=$build_dir/lite/tools/debug
+    mkdir -p ${DEBUG_TOOL_PATH_PREFIX}
+    cp $root_dir/lite/tools/debug/analysis_tool.py ${DEBUG_TOOL_PATH_PREFIX}/
+}
+
+# 3.2 prepare source code of opencl lib
+# here we bundle all cl files into a cc file to bundle all opencl kernels into a single lib
+function prepare_opencl_source_code {
+    local root_dir=$1
+    local build_dir=$2
+    # the generated file is written into the source tree ($root_dir); $build_dir is not used here
+    # Prepare opencl_kernels_source.cc file
+    GEN_CODE_PATH_OPENCL=$root_dir/lite/backends/opencl
+    rm -f $GEN_CODE_PATH_OPENCL/opencl_kernels_source.cc
+    OPENCL_KERNELS_PATH=$root_dir/lite/backends/opencl/cl_kernel
+    mkdir -p ${GEN_CODE_PATH_OPENCL}
+    touch $GEN_CODE_PATH_OPENCL/opencl_kernels_source.cc
+    python $root_dir/lite/tools/cmake_tools/gen_opencl_code.py $OPENCL_KERNELS_PATH $GEN_CODE_PATH_OPENCL/opencl_kernels_source.cc
+}
+
+# 3.3 prepare third_party libraries for compiling
+# here we store third_party libraries into Paddle-Lite/third-party
+function prepare_thirdparty {
+    if [ ! -d $workspace/third-party -o -f $workspace/third-party-ea5576.tar.gz ]; then
+        rm -rf $workspace/third-party
+        if [ ! -f $workspace/third-party-ea5576.tar.gz ]; then
+            wget $THIRDPARTY_TAR
+        fi
+        tar xzf third-party-ea5576.tar.gz
+    else
+        git submodule update --init --recursive
+    fi
+}
+####################################################################################################
+
+####################################################################################################
+# 4. compiling functions
+####################################################################################################
+
+# 4.1 function of tiny_publish compiling
+# here we only compile light_api lib; pass OFF as $1 to build full_publish instead
+function make_tiny_publish_so {
+    is_tiny=${1:-ON}
+    if [ "$WITH_PYTHON" = "ON" -a "$is_tiny" = "ON" ]; then
+        echo "Warning: build full_publish to use python."
+        is_tiny=OFF
+    fi
+    if [ "$WITH_TRAIN" = "ON" -a "$is_tiny" = "ON" ]; then
+        echo "Warning: build full_publish to add training ops."
+        is_tiny=OFF
+    fi
+    WITH_STRIP=${BUILD_TAILOR:-$WITH_STRIP}  # main stores --with_strip in BUILD_TAILOR; cmake reads WITH_STRIP
+    if [ "$WITH_STRIP" = "ON" -a "$OPTMODEL_DIR" = "" ]; then
+        echo "Error: set --opt_model_dir if --with_strip is ON."
+        exit 1
+    fi
+
+    if [ "$is_tiny" = "OFF" ]; then
+        prepare_thirdparty
+    fi
+
+    build_dir=$workspace/build.lite.linux.$ARCH.$TOOLCHAIN
+    if [ "${WITH_OPENCL}" = "ON" ]; then
+       build_dir=${build_dir}.opencl
+    fi
+
+    rm -rf $build_dir  # always start from a clean build directory
+    mkdir -p $build_dir
+    cd $build_dir
+
+    prepare_workspace $workspace $build_dir
+
+    if [ "${WITH_OPENCL}" = "ON" ]; then
+       prepare_opencl_source_code $workspace $build_dir
+    fi
+    if [ "${WITH_STRIP}" == "ON" ]; then
+        WITH_EXTRA=ON
+    fi
+
+    init_cmake_mutable_options
+    cmake $workspace \
+       ${CMAKE_COMMON_OPTIONS} \
+       ${cmake_mutable_options} \
+       -DLITE_ON_TINY_PUBLISH=$is_tiny
+
+    if [ "${WITH_OPENCL}" = "ON" ]; then
+       make opencl_clhpp -j$NUM_PROC
+    fi
+
+    make publish_inference -j$NUM_PROC
+    cd - > /dev/null
+}
+####################################################################################################
+
+# 4.2 function of full_publish compiling
+# here we compile both light_api lib and full_api lib
+function make_full_publish_so {
+    make_tiny_publish_so OFF
+}
+####################################################################################################
+
+function print_usage {  # prints the help text for this script (build_intel_fpga.sh)
+    echo "----------------------------------------------------------------------------------------------------"
+    echo "Methods of compiling Paddle-Lite Intel FPGA (armlinux) library:"
+    echo "----------------------------------------------------------------------------------------------------"
+    echo "  compile intel fpga library: (armv7hf, gcc)"
+    echo "     ./lite/tools/build_intel_fpga.sh"
+    echo "  print help information:"
+    echo "     ./lite/tools/build_intel_fpga.sh help"
+    echo ""
+    echo "  optional argument:"
+    echo "     --arch: (armv8|armv7hf|armv7), default is armv7hf"
+    echo "     --toolchain: (gcc|clang), default is gcc"
+    echo "     --with_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP), default is ON"
+    echo "     --with_python: (OFF|ON); controls whether to build python lib or whl, default is OFF"
+    echo "     --python_version: (2.7|3.5|3.7); controls python version to compile whl, default is None"
+    echo "     --with_cv: (OFF|ON); controls whether to compile cv functions into lib, default is OFF"
+    echo "     --with_log: (OFF|ON); controls whether to print log information, default is OFF"
+    echo "     --with_exception: (OFF|ON); controls whether to throw the exception when error occurs, default is OFF"
+    echo ""
+    echo "  arguments of striping lib according to input model:"
+    echo "     ./lite/tools/build_intel_fpga.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir"
+    echo "     --with_strip: (OFF|ON); controls whether to strip lib according to input model, default is OFF"
+    echo "     --opt_model_dir: (absolute path to optimized model dir) required when compiling striped library"
+    echo "  detailed information about striping lib:  https://paddle-lite.readthedocs.io/zh/latest/user_guides/library_tailoring.html"
+    echo ""
+    echo "  arguments of opencl library compiling:"
+    echo "     ./lite/tools/build_intel_fpga.sh --with_opencl=ON"
+    echo "     --with_opencl: (OFF|ON); controls whether to compile lib for opencl, default is OFF"
+    echo ""
+    echo "  arguments of training ops:"
+    echo "     ./lite/tools/build_intel_fpga.sh --with_train=ON"
+    echo "     --with_train: (OFF|ON); controls whether to add training ops (builds full_publish), default is OFF"
+    echo ""
+    echo "  compile both light_api and cxx_api lib:"
+    echo "     ./lite/tools/build_intel_fpga.sh [optional arguments] full_publish"
+    echo "     optional arguments must be placed before full_publish, later ones are ignored"
+    echo ""
+    echo "  detailed information about Paddle-Lite Intel FPGA:"
+    echo "     docs/demo_guides/intel_fpga.md"
+    echo ""
+    echo "----------------------------------------------------------------------------------------------------"
+    echo
+}
+
+function main {
+    if [ -z "$1" ]; then
+        # compiling result contains light_api lib only, recommended.
+        make_tiny_publish_so
+        exit 0
+    fi
+
+    # Parse command line; options only take effect when they precede full_publish/help.
+    for i in "$@"; do
+        case $i in
+            # armv8 or armv7hf or armv7, default armv7hf
+            --arch=*)
+                ARCH="${i#*=}"
+                shift
+                ;;
+            # gcc or clang, default gcc
+            --toolchain=*)
+                TOOLCHAIN="${i#*=}"
+                shift
+                ;;
+            # ON or OFF, default ON
+            --with_extra=*)
+                WITH_EXTRA="${i#*=}"
+                shift
+                ;;
+            # ON or OFF, default OFF
+            --with_python=*)
+                WITH_PYTHON="${i#*=}"
+                shift
+                ;;
+            # 2.7 or 3.5 or 3.7, default is None
+            --python_version=*)
+                PY_VERSION="${i#*=}"
+                shift
+                ;;
+            # ON or OFF, default OFF
+            --with_cv=*)
+                WITH_CV="${i#*=}"
+                shift
+                ;;
+            # ON or OFF, default OFF
+            --with_log=*)
+                WITH_LOG="${i#*=}"
+                shift
+                ;;
+            # ON or OFF, default OFF
+            --with_exception=*)
+                WITH_EXCEPTION="${i#*=}"
+                shift
+                ;;
+            # ON or OFF, default OFF
+            --with_strip=*)
+                BUILD_TAILOR="${i#*=}"
+                shift
+                ;;
+            # string, absolute path to optimized model dir
+            --opt_model_dir=*)
+                OPTMODEL_DIR="${i#*=}"
+                shift
+                ;;
+            # compiling lib which can operate on opencl and cpu.
+            --with_opencl=*)
+                WITH_OPENCL="${i#*=}"
+                shift
+                ;;
+            # compiling lib which can operate on intel fpga.
+            --with_intelfpga=*)
+                WITH_INTELFPGA="${i#*=}"
+                shift
+                ;;
+            # ON or OFF, default OFF
+            --with_train=*)
+                WITH_TRAIN="${i#*=}"
+                shift
+                ;;
+            # compiling result contains both light_api and cxx_api lib.
+            full_publish)
+                make_full_publish_so
+                exit 0
+                ;;
+            # print help info
+            help)
+                print_usage
+                exit 0
+                ;;
+            # unknown option: report the whole argument, not only the part after '='
+            *)
+                echo "Error: unsupported argument \"$i\""
+                print_usage
+                exit 1
+                ;;
+        esac
+    done
+    # compiling result contains light_api lib only, recommended.
+    make_tiny_publish_so
+}
+
+main "$@"

From 680a34891c16429061cac6e24fdc3336023d1d39 Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Wed, 17 Mar 2021 09:42:50 +0000
Subject: [PATCH 03/19] test=develop

---
 lite/api/paddle_place.cc | 9 ++++++---
 lite/api/paddle_place.h  | 3 ++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc
index a04d632bdc4..d47f2a92a6f 100644
--- a/lite/api/paddle_place.cc
+++ b/lite/api/paddle_place.cc
@@ -82,7 +82,8 @@ const std::string& TargetToStr(TargetType target) {
                                               "rknpu",
                                               "apu",
                                               "huawei_ascend_npu",
-                                              "imagination_nna"};
+                                              "imagination_nna",
+                                              "intelfpga"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
   return target2string[x];
@@ -129,7 +130,8 @@ const std::string& TargetRepr(TargetType target) {
                                               "kRKNPU",
                                               "kAPU",
                                               "kHuaweiAscendNPU",
-                                              "kImaginationNNA"};
+                                              "kImaginationNNA",
+                                              "kIntelFPGA"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
   return target2string[x];
@@ -190,7 +192,8 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
                                                TARGET(kRKNPU),
                                                TARGET(kFPGA),
                                                TARGET(kHuaweiAscendNPU),
-                                               TARGET(kImaginationNNA)});
+                                               TARGET(kImaginationNNA),
+                                               TARGET(kIntelFPGA)});
   if (target == TARGET(kAny)) {
     return valid_set;
   }
diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h
index f0563f63006..62d82398744 100644
--- a/lite/api/paddle_place.h
+++ b/lite/api/paddle_place.h
@@ -59,7 +59,8 @@ enum class TargetType : int {
   kAPU = 13,
   kHuaweiAscendNPU = 14,
   kImaginationNNA = 15,
-  NUM = 16,  // number of fields.
+  kIntelFPGA = 16,
+  NUM = 17,  // number of fields.
 };
 enum class PrecisionType : int {
   kUnk = 0,

From a14cbfe803c195aa26e68b24cc74ba86c74a027e Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Thu, 18 Mar 2021 01:58:32 +0000
Subject: [PATCH 04/19] test=develop

---
 CMakeLists.txt                                |   2 +-
 cmake/configure.cmake                         |   4 +-
 cmake/lite.cmake                              |  34 ++---
 docs/demo_guides/intel_fpga.md                |  12 +-
 lite/api/CMakeLists.txt                       |  38 +++---
 lite/api/paddle_place.cc                      |   2 +-
 lite/backends/CMakeLists.txt                  |   2 +-
 lite/backends/intel_fpga/CMakeLists.txt       |  23 ++++
 .../lldrv/intelfpgadrv.cpp                    |  76 +++++------
 .../lldrv/intelfpgadrv.h                      | 122 +++++++++---------
 .../{intelfpga => intel_fpga}/lldrv/utils.cpp |   6 +-
 .../{intelfpga => intel_fpga}/lldrv/utils.h   |   4 +-
 .../target_wrapper.cpp                        |   8 +-
 .../target_wrapper.h                          |   0
 lite/backends/intelfpga/CMakeLists.txt        |  23 ----
 lite/core/CMakeLists.txt                      |   2 +-
 lite/core/context.h                           |   6 +-
 lite/kernels/CMakeLists.txt                   |   2 +-
 lite/kernels/intel_fpga/CMakeLists.txt        |   9 ++
 .../{intelfpga => intel_fpga}/conv_compute.cc |  14 +-
 .../{intelfpga => intel_fpga}/conv_compute.h  |   4 +-
 .../conv_depthwise.cc                         |   6 +-
 .../conv_depthwise.h                          |   4 +-
 .../conv_gemmlike.cc                          |  30 ++---
 .../{intelfpga => intel_fpga}/conv_gemmlike.h |   4 +-
 lite/kernels/intelfpga/CMakeLists.txt         |   9 --
 lite/tools/build_intel_fpga.sh                |   8 +-
 27 files changed, 227 insertions(+), 227 deletions(-)
 create mode 100644 lite/backends/intel_fpga/CMakeLists.txt
 rename lite/backends/{intelfpga => intel_fpga}/lldrv/intelfpgadrv.cpp (58%)
 rename lite/backends/{intelfpga => intel_fpga}/lldrv/intelfpgadrv.h (50%)
 rename lite/backends/{intelfpga => intel_fpga}/lldrv/utils.cpp (93%)
 rename lite/backends/{intelfpga => intel_fpga}/lldrv/utils.h (94%)
 rename lite/backends/{intelfpga => intel_fpga}/target_wrapper.cpp (85%)
 rename lite/backends/{intelfpga => intel_fpga}/target_wrapper.h (100%)
 delete mode 100644 lite/backends/intelfpga/CMakeLists.txt
 create mode 100755 lite/kernels/intel_fpga/CMakeLists.txt
 rename lite/kernels/{intelfpga => intel_fpga}/conv_compute.cc (92%)
 rename lite/kernels/{intelfpga => intel_fpga}/conv_compute.h (96%)
 rename lite/kernels/{intelfpga => intel_fpga}/conv_depthwise.cc (97%)
 rename lite/kernels/{intelfpga => intel_fpga}/conv_depthwise.h (97%)
 rename lite/kernels/{intelfpga => intel_fpga}/conv_gemmlike.cc (86%)
 rename lite/kernels/{intelfpga => intel_fpga}/conv_gemmlike.h (98%)
 delete mode 100755 lite/kernels/intelfpga/CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9dd5a87d7ee..12deaf69752 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,7 +99,7 @@ lite_option(LITE_WITH_TRAIN     "Enable training operators and kernels in lite"
 lite_option(LITE_WITH_OPENMP    "Enable OpenMP in lite framework" ON)
 lite_option(LITE_WITH_OPENCL    "Enable OpenCL support in lite" OFF)
 lite_option(LITE_WITH_FPGA      "Enable FPGA support in lite" OFF)
-lite_option(LITE_WITH_INTELFPGA      "Enable IntelFPGA support in lite" OFF)
+lite_option(LITE_WITH_INTEL_FPGA      "Enable Intel FPGA support in lite" OFF)
 lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK  "Enable light-weight framework" OFF)
 lite_option(LITE_WITH_PROFILE   "Enable profile mode in lite framework"  OFF)
 lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index d1467704ac9..3e25a41a3ed 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -173,8 +173,8 @@ if (LITE_WITH_FPGA)
 add_definitions("-DLITE_WITH_FPGA")
 endif()
 
-if (LITE_WITH_INTELFPGA)
-add_definitions("-DLITE_WITH_INTELFPGA")
+if (LITE_WITH_INTEL_FPGA)
+add_definitions("-DLITE_WITH_INTEL_FPGA")
 endif()
 
 if (LITE_WITH_BM)
diff --git a/cmake/lite.cmake b/cmake/lite.cmake
index b5f115ca973..40d75eb530b 100644
--- a/cmake/lite.cmake
+++ b/cmake/lite.cmake
@@ -22,7 +22,7 @@ endfunction()
 function (lite_deps TARGET)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS IMAGINATION_NNA_DEPS APU_DEPS CV_DEPS ARGS)
+  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS INTEL_FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS IMAGINATION_NNA_DEPS APU_DEPS CV_DEPS ARGS)
   cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
   set(deps ${lite_deps_DEPS})
@@ -82,8 +82,8 @@ function (lite_deps TARGET)
     endforeach(var)
   endif()
   
-  if (LITE_WITH_INTELFPGA)
-    foreach(var ${lite_deps_INTELFPGA_DEPS})
+  if (LITE_WITH_INTEL_FPGA)
+    foreach(var ${lite_deps_INTEL_FPGA_DEPS})
       set(deps ${deps} ${var})
     endforeach(var)
   endif()
@@ -161,7 +161,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 function(lite_cc_library TARGET)
     set(options SHARED shared STATIC static MODULE module)
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTEL_FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
       HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
     cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
@@ -177,7 +177,7 @@ function(lite_cc_library TARGET)
             ARM_DEPS ${args_ARM_DEPS}
             CV_DEPS ${args_CV_DEPS}
             FPGA_DEPS ${args_FPGA_DEPS}
-            INTELFPGA_DEPS ${args_INTELFPGA_DEPS}
+            INTEL_FPGA_DEPS ${args_INTEL_FPGA_DEPS}
             NPU_DEPS ${args_NPU_DEPS}
             APU_DEPS ${args_APU_DEPS}
             XPU_DEPS ${args_XPU_DEPS}
@@ -214,7 +214,7 @@ function(lite_cc_binary TARGET)
         set(options " -g ")
     endif()
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTEL_FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
       LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
     cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
@@ -226,7 +226,7 @@ function(lite_cc_binary TARGET)
             CL_DEPS ${args_CL_DEPS}
             ARM_DEPS ${args_ARM_DEPS}
             FPGA_DEPS ${args_FPGA_DEPS}
-            INTELFPGA_DEPS ${args_INTELFPGA_DEPS}
+            INTEL_FPGA_DEPS ${args_INTEL_FPGA_DEPS}
             NPU_DEPS ${args_NPU_DEPS}
             APU_DEPS ${args_APU_DEPS}
             XPU_DEPS ${args_XPU_DEPS}
@@ -270,7 +270,7 @@ function(lite_cc_test TARGET)
     endif()
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTEL_FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
         LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
         ARGS
         COMPILE_LEVEL # (basic|extra)
@@ -290,7 +290,7 @@ function(lite_cc_test TARGET)
               CL_DEPS ${args_CL_DEPS}
               ARM_DEPS ${args_ARM_DEPS}
               FPGA_DEPS ${args_FPGA_DEPS}
-              INTELFPGA_DEPS ${args_INTELFPGA_DEPS}
+              INTEL_FPGA_DEPS ${args_INTEL_FPGA_DEPS}
               NPU_DEPS ${args_NPU_DEPS}
               APU_DEPS ${args_APU_DEPS}
               XPU_DEPS ${args_XPU_DEPS}
@@ -327,7 +327,7 @@ set(arm_kernels CACHE INTERNAL "arm kernels")
 set(x86_kernels CACHE INTERNAL "x86 kernels")
 set(cuda_kernels CACHE INTERNAL "cuda kernels")
 set(fpga_kernels CACHE INTERNAL "fpga kernels")
-set(intelfpga_kernels CACHE INTERNAL "intelfpga kernels")
+set(intel_fpga_kernels CACHE INTERNAL "intel_fpga kernels")
 set(npu_kernels CACHE INTERNAL "npu kernels")
 set(apu_kernels CACHE INTERNAL "apu kernels")
 set(xpu_kernels CACHE INTERNAL "xpu kernels")
@@ -356,7 +356,7 @@ endif()
 function(add_kernel TARGET device level)
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTEL_FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
         LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
         ARGS)
     cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -441,14 +441,14 @@ function(add_kernel TARGET device level)
         endif()
         set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "")
     endif()
-    if ("${device}" STREQUAL "INTELFPGA")
-        if (NOT LITE_WITH_INTELFPGA)
+    if ("${device}" STREQUAL "INTEL_FPGA")
+        if (NOT LITE_WITH_INTEL_FPGA)
             foreach(src ${args_SRCS})
                 file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
             endforeach()
             return()
         endif()
-        set(intelfpga_kernels "${intelfpga_kernels};${TARGET}" CACHE INTERNAL "")
+        set(intel_fpga_kernels "${intel_fpga_kernels};${TARGET}" CACHE INTERNAL "")
     endif()
     if ("${device}" STREQUAL "BM")
         if (NOT LITE_WITH_BM)
@@ -533,7 +533,7 @@ function(add_kernel TARGET device level)
               CL_DEPS ${args_CL_DEPS}
               ARM_DEPS ${args_ARM_DEPS}
               FPGA_DEPS ${args_FPGA_DEPS}
-              INTELFPGA_DEPS ${args_INTELFPGA_DEPS}
+              INTEL_FPGA_DEPS ${args_INTEL_FPGA_DEPS}
               NPU_DEPS ${args_NPU_DEPS}
               APU_DEPS ${args_APU_DEPS}
               XPU_DEPS ${args_XPU_DEPS}
@@ -560,7 +560,7 @@ endif()
 function(add_operator TARGET level)
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTEL_FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
         LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
         ARGS)
     cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -592,7 +592,7 @@ function(add_operator TARGET level)
               CL_DEPS ${args_CL_DEPS}
               ARM_DEPS ${args_ARM_DEPS}
               FPGA_DEPS ${args_FPGA_DEPS}
-              INTELFPGA_DEPS ${args_INTELFPGA_DEPS}
+              INTEL_FPGA_DEPS ${args_INTEL_FPGA_DEPS}
               NPU_DEPS ${args_NPU_DEPS}
               APU_DEPS ${args_APU_DEPS}
               XPU_DEPS ${args_XPU_DEPS}
diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md
index b875ff80e5b..b76920bd134 100644
--- a/docs/demo_guides/intel_fpga.md
+++ b/docs/demo_guides/intel_fpga.md
@@ -1,6 +1,6 @@
 # PaddleLite使用IntelFPGA预测部署
 
-Paddle Lite支持基于arm的IntelFPGA c5的模型预测，提供armv7hf的交叉编译
+Paddle Lite支持基于arm的IntelFPGA C5的模型预测，提供armv7hf的交叉编译
 
 PaddleLite通过调用底层驱动实现对FPGA硬件的调度，以及对应的API接口。
 
@@ -27,11 +27,11 @@ Lite支持IntelFPGA作为后端硬件进行模型推理，其主要特性如下
 
 ## 编译
 
-需要提前准备带有IntelFPGAdrv.ko的IntelFPGA开发板C5MB/C5TB和Lite代码
+需要提前准备带有intelfpgadrv.ko的IntelFPGA开发板C5MB/C5TB和Lite代码
 
 CMAKE编译选项：
 
-- 设置`LITE_WITH_INTELFPGA=ON`和`LITE_WITH_ARM=ON`
+- 设置`LITE_WITH_INTEL_FPGA=ON`和`LITE_WITH_ARM=ON`
 
 其他编译选项与ARM编译相同，可以参考[“Paddle Lite在Docker下的ARM编译”](../source_compile/compile_linux)。
 
@@ -47,11 +47,11 @@ CMAKE编译选项：
         -DLITE_WITH_OPENMP=ON   \
         -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
         -DWITH_TESTING=OFF \
-        -DLITE_WITH_INTELFPGA=ON \
+        -DLITE_WITH_INTEL_FPGA=ON \
         -DARM_TARGET_OS=armlinux 
     make publish_inference -j2
 ```
-Lite提供FPGA编译脚本，位于lite/tools/build_intel_fpga.sh full_publish，在Lite根目录执行该脚本即可编译
+Lite提供IntelFPGA编译脚本，位于lite/tools/build_intel_fpga.sh full_publish，在Lite根目录执行该脚本即可编译
 
 ## 运行示例
 
@@ -68,7 +68,7 @@ Password: #密码：Awcloud@123
 #进入/opt目录[开发板执行]
 cd /opt
 #在运行模型前需要加载FPGA驱动[开发板执行]
-insmod driver/IntelFPGAdrv.ko
+insmod driver/intelfpgadrv.ko
 ```
 
 - **使用IntelFPGA进行模型预测**
diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt
index 73a921b4b20..64b68cc0c02 100644
--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -177,9 +177,9 @@ if(LITE_WITH_FPGA)
     set(light_api_deps ${light_api_deps} ${fpga_deps})
     set(cxx_api_deps ${cxx_api_deps} ${fpga_deps})
 endif()
-if(LITE_WITH_INTELFPGA)
-    set(light_api_deps ${light_api_deps} ${intelfpga_deps})
-    set(cxx_api_deps ${cxx_api_deps} ${intelfpga_deps})
+if(LITE_WITH_INTEL_FPGA)
+    set(light_api_deps ${light_api_deps} ${intel_fpga_deps})
+    set(cxx_api_deps ${cxx_api_deps} ${intel_fpga_deps})
 endif()
 if(LITE_WITH_BM)
     set(light_api_deps ${light_api_deps} ${bm_deps})
@@ -212,7 +212,7 @@ list(LENGTH apu_kernels num_apu_kernels)
 list(LENGTH xpu_kernels num_xpu_kernels)
 list(LENGTH rknpu_kernels num_rknpu_kernels)
 list(LENGTH fpga_kernels num_fpga_kernels)
-list(LENGTH intelfpga_kernels num_intelfpga_kernels)
+list(LENGTH intel_fpga_kernels num_intel_fpga_kernels)
 list(LENGTH bm_kernels num_bm_kernels)
 list(LENGTH mlu_kernels num_mlu_kernels)
 list(LENGTH huawei_ascend_npu_kernels num_huawei_ascend_npu_kernels)
@@ -229,7 +229,7 @@ message(STATUS "Collected ${num_apu_kernels} APU kernels")
 message(STATUS "Collected ${num_xpu_kernels} XPU kernels")
 message(STATUS "Collected ${num_rknpu_kernels} RKNPU kernels")
 message(STATUS "Collected ${num_fpga_kernels} FPGA kernels")
-message(STATUS "Collected ${num_intelfpga_kernels} INTELFPGA kernels")
+message(STATUS "Collected ${num_intel_fpga_kernels} INTEL_FPGA kernels")
 message(STATUS "Collected ${num_bm_kernels} BM kernels")
 message(STATUS "Collected ${num_mlu_kernels} MLU kernels")
 message(STATUS "Collected ${num_huawei_ascend_npu_kernels} HUAWEI_ASCEND_NPU kernels")
@@ -254,7 +254,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
                         IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
                         CL_DEPS ${opencl_kernels}
                         FPGA_DEPS ${fpga_kernels}
-                        INTELFPGA_DEPS ${intelfpga_kernels}
+                        INTEL_FPGA_DEPS ${intel_fpga_kernels}
                         HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
 endif()
 
@@ -278,7 +278,7 @@ lite_cc_library(light_api SRCS light_api.cc
         RKNPU_DEPS ${rknpu_kernels}
         CL_DEPS ${opencl_kernels}
         FPGA_DEPS ${fpga_kernels}
-        INTELFPGA_DEPS ${intelfpga_kernels}
+        INTEL_FPGA_DEPS ${intel_fpga_kernels}
         BM_DEPS ${bm_kernels}
         IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
         MLU_DEPS ${mlu_kernels}
@@ -303,7 +303,7 @@ if(WITH_TESTING)
            RKNPU_DEPS ${rknpu_kernels}
            CL_DEPS ${opencl_kernels}
            FPGA_DEPS ${fpga_kernels}
-           INTELFPGA_DEPS ${intelfpga_kernels}
+           INTEL_FPGA_DEPS ${intel_fpga_kernels}
            BM_DEPS ${bm_kernels}
            MLU_DEPS ${mlu_kernels}
            IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
@@ -360,7 +360,7 @@ if(WITH_TESTING)
 endif()
 
 if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
-    set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${apu_kernels} ${fpga_kernels} ${intelfpga_kernels})
+    set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${apu_kernels} ${fpga_kernels} ${intel_fpga_kernels})
 
     lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc
        DEPS ${lite_model_test_DEPS}
@@ -459,7 +459,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
         APU_DEPS ${apu_kernels}
         CL_DEPS ${opencl_kernels}
         FPGA_DEPS ${fpga_kernels}
-        INTELFPGA_DEPS ${intelfpga_kernels}
+        INTEL_FPGA_DEPS ${intel_fpga_kernels}
         IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
         BM_DEPS ${bm_kernels}
         HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
@@ -479,7 +479,7 @@ if(NOT WITH_COVERAGE)
         DEPS light_api program mir_passes paddle_api_light
         CL_DEPS ${opencl_kernels}
         FPGA_DEPS ${fpga_kernels}
-        INTELFPGA_DEPS ${intelfpga_kernels}
+        INTEL_FPGA_DEPS ${intel_fpga_kernels}
         RKNPU_DEPS ${rknpu_kernels}
         BM_DEPS ${bm_kernels}
         ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -490,7 +490,7 @@ if(NOT WITH_COVERAGE)
         X86_DEPS ${x86_kernels}
         XPU_DEPS ${xpu_kernels}
         FPGA_DEPS ${fpga_kernels}
-        INTELFPGA_DEPS ${intelfpga_kernels}
+        INTEL_FPGA_DEPS ${intel_fpga_kernels}
         RKNPU_DEPS ${rknpu_kernels}
         BM_DEPS ${bm_kernels}
         MLU_DEPS ${mlu_kernels}
@@ -535,7 +535,7 @@ if(NOT WITH_COVERAGE)
       CL_DEPS ${opencl_kernels}
       X86_DEPS ${x86_kernels}
       FPGA_DEPS ${fpga_kernels}
-      INTELFPGA_DEPS ${intelfpga_kernels}
+      INTEL_FPGA_DEPS ${intel_fpga_kernels}
       BM_DEPS ${bm_kernels}
       MLU_DEPS ${mlu_kernels}
       IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
@@ -561,7 +561,7 @@ if(NOT IOS)
         RKNPU_DEPS ${rknpu_kernels}
         IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
         FPGA_DEPS ${fpga_kernels}
-        INTELFPGA_DEPS ${intelfpga_kernels}
+        INTEL_FPGA_DEPS ${intel_fpga_kernels}
         X86_DEPS ${x86_kernels}
         CUDA_DEPS ${cuda_kernels}
         HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
@@ -579,7 +579,7 @@ if(NOT IOS)
         RKNPU_DEPS ${rknpu_kernels}
         IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
         FPGA_DEPS ${fpga_kernels}
-        INTELFPGA_DEPS ${intelfpga_kernels}
+        INTEL_FPGA_DEPS ${intel_fpga_kernels}
         X86_DEPS ${x86_kernels}
         CUDA_DEPS ${cuda_kernels}
         HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
@@ -597,7 +597,7 @@ if(NOT IOS)
         RKNPU_DEPS ${rknpu_kernels}
         IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
         FPGA_DEPS ${fpga_kernels}
-        INTELFPGA_DEPS ${intelfpga_kernels}
+        INTEL_FPGA_DEPS ${intel_fpga_kernels}
         X86_DEPS ${x86_kernels}
         CUDA_DEPS ${cuda_kernels}
         HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
@@ -614,7 +614,7 @@ if(NOT IOS)
         APU_DEPS ${apu_kernels}
         CL_DEPS ${opencl_kernels}
         FPGA_DEPS ${fpga_kernels}
-        INTELFPGA_DEPS ${intelfpga_kernels}
+        INTEL_FPGA_DEPS ${intel_fpga_kernels}
         X86_DEPS ${x86_kernels}
         CUDA_DEPS ${cuda_kernels}
         HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
@@ -633,7 +633,7 @@ if(NOT IOS)
         CL_DEPS ${opencl_kernels}
         BM_DEPS ${bm_kernels}
         FPGA_DEPS ${fpga_kernels}
-        INTELFPGA_DEPS ${intelfpga_kernels}
+        INTEL_FPGA_DEPS ${intel_fpga_kernels}
         X86_DEPS ${x86_kernels}
         CUDA_DEPS ${cuda_kernels})
 
@@ -648,7 +648,7 @@ if(NOT IOS)
         APU_DEPS ${apu_kernels}
         CL_DEPS ${opencl_kernels}
         FPGA_DEPS ${fpga_kernels}
-        INTELFPGA_DEPS ${intelfpga_kernels}
+        INTEL_FPGA_DEPS ${intel_fpga_kernels}
         X86_DEPS ${x86_kernels}
         CUDA_DEPS ${cuda_kernels}
         HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc
index d47f2a92a6f..8853baae836 100644
--- a/lite/api/paddle_place.cc
+++ b/lite/api/paddle_place.cc
@@ -83,7 +83,7 @@ const std::string& TargetToStr(TargetType target) {
                                               "apu",
                                               "huawei_ascend_npu",
                                               "imagination_nna",
-                                              "intelfpga"};
+                                              "intel_fpga"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
   return target2string[x];
diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt
index 848bf47fc3f..7c05e6138f1 100644
--- a/lite/backends/CMakeLists.txt
+++ b/lite/backends/CMakeLists.txt
@@ -12,4 +12,4 @@ add_subdirectory(apu)
 add_subdirectory(rknpu)
 add_subdirectory(huawei_ascend_npu)
 add_subdirectory(imagination_nna)
-add_subdirectory(intelfpga)
+add_subdirectory(intel_fpga)
diff --git a/lite/backends/intel_fpga/CMakeLists.txt b/lite/backends/intel_fpga/CMakeLists.txt
new file mode 100644
index 00000000000..c47a33be007
--- /dev/null
+++ b/lite/backends/intel_fpga/CMakeLists.txt
@@ -0,0 +1,23 @@
+if (NOT LITE_WITH_INTEL_FPGA)  # Define no Intel FPGA targets unless the backend is enabled.
+    return()
+endif()
+
+set(LITE_INTEL_FPGA_PATH "${PADDLE_SOURCE_DIR}/lite/backends/intel_fpga")
+set(LITE_INTEL_FPGA_LLDRV_PATH "${PADDLE_SOURCE_DIR}/lite/backends/intel_fpga/lldrv")
+
+message("intel_fpga_path ${LITE_INTEL_FPGA_PATH}")
+file(GLOB INTEL_FPGA_CPP "${LITE_INTEL_FPGA_PATH}/*.cpp")
+file(GLOB LLDRV_CPP "${LITE_INTEL_FPGA_LLDRV_PATH}/*.cpp")
+message("intel_fpga cpp: ${INTEL_FPGA_CPP}")
+set(INTEL_FPGA_ALL_CPP "")
+FOREACH(FILE_PATH ${LLDRV_CPP})  # Driver sources, recorded relative to this directory as lldrv/<name>.
+    STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
+    list(APPEND INTEL_FPGA_ALL_CPP lldrv/${FILE_NAME})
+ENDFOREACH(FILE_PATH)
+FOREACH(FILE_PATH ${INTEL_FPGA_CPP})  # Iterate the list globbed above; INTELFPGA_CPP (old name) is never set here.
+    STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
+    list(APPEND INTEL_FPGA_ALL_CPP ${FILE_NAME})
+ENDFOREACH(FILE_PATH)
+message("intel_fpga src: ${INTEL_FPGA_ALL_CPP}")
+cc_library(kernel_intel_fpga SRCS ${INTEL_FPGA_ALL_CPP})
+cc_library(intel_fpga_target_wrapper SRCS target_wrapper.cpp DEPS kernel_intel_fpga)
diff --git a/lite/backends/intelfpga/lldrv/intelfpgadrv.cpp b/lite/backends/intel_fpga/lldrv/intelfpgadrv.cpp
similarity index 58%
rename from lite/backends/intelfpga/lldrv/intelfpgadrv.cpp
rename to lite/backends/intel_fpga/lldrv/intelfpgadrv.cpp
index 55e4bf92f0d..cc188e483cf 100644
--- a/lite/backends/intelfpga/lldrv/intelfpgadrv.cpp
+++ b/lite/backends/intel_fpga/lldrv/intelfpgadrv.cpp
@@ -24,22 +24,22 @@ limitations under the License. */
 #include <map>
 #include <utility>
 
-#include "lite/backends/intelfpga/lldrv/intelfpgadrv.h"
+#include "lite/backends/intel_fpga/lldrv/intelfpgadrv.h"
 
 namespace paddle {
 namespace lite {
-namespace intelfpga {
+namespace intel_fpga {
 
-/// FD of intelfpga
-static int intelfpga_fd = -1;
+/// FD of intel_fpga
+static int intel_fpga_fd = -1;
 
 /// Memory blocks
-static struct intelfpga_memblk_s mb, ms, mi, mk, mo;
+static struct intel_fpga_memblk_s mb, ms, mi, mk, mo;
 
-int intelfpga_open() {
-  if (intelfpga_fd < 0) {
-    intelfpga_fd = open("/dev/intelfpgadrv0", O_RDWR);
-    if (intelfpga_fd < 0) {
+int intel_fpga_open() {
+  if (intel_fpga_fd < 0) {
+    intel_fpga_fd = open("/dev/intelfpgadrv0", O_RDWR);
+    if (intel_fpga_fd < 0) {
       return -1;
     }
     memset(&mb, 0, sizeof(mb));
@@ -52,8 +52,8 @@ int intelfpga_open() {
   return 0;
 }
 
-void intelfpga_close() {
-  if (intelfpga_fd < 0) return;
+void intel_fpga_close() {
+  if (intel_fpga_fd < 0) return;
 
   if (mb.addr) {
     free(mb.addr);
@@ -70,16 +70,16 @@ void intelfpga_close() {
   if (mo.addr) {
     free(mo.addr);
   }
-  close(intelfpga_fd);
-  intelfpga_fd = -1;
+  close(intel_fpga_fd);
+  intel_fpga_fd = -1;
 }
 
 /// memory management;
-void* intelfpga_malloc(size_t size) { return malloc(size); }
+void* intel_fpga_malloc(size_t size) { return malloc(size); }
 
-void intelfpga_free(void* ptr) { free(ptr); }
+void intel_fpga_free(void* ptr) { free(ptr); }
 
-void* intelfpga_mbias(size_t size) {
+void* intel_fpga_mbias(size_t size) {
   if (mb.addr) {
     if (mb.size >= size) {
       return mb.addr;
@@ -93,7 +93,7 @@ void* intelfpga_mbias(size_t size) {
   return mb.addr;
 }
 
-void* intelfpga_mscale(size_t size) {
+void* intel_fpga_mscale(size_t size) {
   if (ms.addr) {
     if (ms.size >= size) {
       return ms.addr;
@@ -108,7 +108,7 @@ void* intelfpga_mscale(size_t size) {
   return ms.addr;
 }
 
-void* intelfpga_minput(size_t size) {
+void* intel_fpga_minput(size_t size) {
   if (mi.addr) {
     if (mi.size >= size) {
       return mi.addr;
@@ -123,7 +123,7 @@ void* intelfpga_minput(size_t size) {
   return mi.addr;
 }
 
-void* intelfpga_mkernel(size_t size) {
+void* intel_fpga_mkernel(size_t size) {
   if (mk.addr) {
     if (mk.size >= size) {
       return mk.addr;
@@ -138,7 +138,7 @@ void* intelfpga_mkernel(size_t size) {
   return mk.addr;
 }
 
-void* intelfpga_moutput(size_t size) {
+void* intel_fpga_moutput(size_t size) {
   if (mo.addr) {
     if (mo.size >= size) {
       return mo.addr;
@@ -153,40 +153,40 @@ void* intelfpga_moutput(size_t size) {
   return mo.addr;
 }
 
-void intelfpga_copy(void* dst, void* src, int size) { memcpy(dst, src, size); }
+void intel_fpga_copy(void* dst, void* src, int size) { memcpy(dst, src, size); }
 
-int intelfpga_info(struct intelfpga_info_s* args) {
-  int cmd = INTELFPGA_IOCTL_MAKE(INTELFPGA_CMD_INFO);
+int intel_fpga_info(struct intel_fpga_info_s* args) {
+  int cmd = INTEL_FPGA_IOCTL_MAKE(INTEL_FPGA_CMD_INFO);
 
-  if (intelfpga_open()) return -1;
+  if (intel_fpga_open()) return -1;
 
-  return ioctl(intelfpga_fd, cmd, args);
+  return ioctl(intel_fpga_fd, cmd, args);
 }
 
-int intelfpga_conv(struct intelfpga_conv_s* args) {
-  int cmd = INTELFPGA_IOCTL_MAKE(INTELFPGA_CMD_CONV);
+int intel_fpga_conv(struct intel_fpga_conv_s* args) {
+  int cmd = INTEL_FPGA_IOCTL_MAKE(INTEL_FPGA_CMD_CONV);
 
-  if (intelfpga_open()) return -1;
+  if (intel_fpga_open()) return -1;
 
-  return ioctl(intelfpga_fd, cmd, args);
+  return ioctl(intel_fpga_fd, cmd, args);
 }
 
-int intelfpga_pooling(struct intelfpga_pool_s* args) {
-  int cmd = INTELFPGA_IOCTL_MAKE(INTELFPGA_CMD_POOL);
+int intel_fpga_pooling(struct intel_fpga_pool_s* args) {
+  int cmd = INTEL_FPGA_IOCTL_MAKE(INTEL_FPGA_CMD_POOL);
 
-  if (intelfpga_open()) return -1;
+  if (intel_fpga_open()) return -1;
 
-  return ioctl(intelfpga_fd, cmd, args);
+  return ioctl(intel_fpga_fd, cmd, args);
 }
 
-int intelfpga_fullconnect(struct intelfpga_fcon_s* args) {
-  int cmd = INTELFPGA_IOCTL_MAKE(INTELFPGA_CMD_FCON);
+int intel_fpga_fullconnect(struct intel_fpga_fcon_s* args) {
+  int cmd = INTEL_FPGA_IOCTL_MAKE(INTEL_FPGA_CMD_FCON);
 
-  if (intelfpga_open()) return -1;
+  if (intel_fpga_open()) return -1;
 
-  return ioctl(intelfpga_fd, cmd, args);
+  return ioctl(intel_fpga_fd, cmd, args);
 }
 
-}  // namespace intelfpga
+}  // namespace intel_fpga
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/backends/intelfpga/lldrv/intelfpgadrv.h b/lite/backends/intel_fpga/lldrv/intelfpgadrv.h
similarity index 50%
rename from lite/backends/intelfpga/lldrv/intelfpgadrv.h
rename to lite/backends/intel_fpga/lldrv/intelfpgadrv.h
index f35c343e030..0a162e7af9d 100644
--- a/lite/backends/intelfpga/lldrv/intelfpgadrv.h
+++ b/lite/backends/intel_fpga/lldrv/intelfpgadrv.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef _LLDRV_INTELFPGA_H_
-#define _LLDRV_INTELFPGA_H_
+#ifndef _LLDRV_INTEL_FPGA_H_
+#define _LLDRV_INTEL_FPGA_H_
 
 #pragma once
 
@@ -24,38 +24,38 @@ limitations under the License. */
 
 namespace paddle {
 namespace lite {
-namespace intelfpga {
+namespace intel_fpga {
 
 // Activation type
-enum intelfpga_act_e {
+enum intel_fpga_act_e {
   ACT_NONE = 0,
   ACT_RELU = 1,
 };
 
 // Device information
-struct intelfpga_info_s {
+struct intel_fpga_info_s {
   uint32_t ver;  // Version, 00.00.0000
 };
 
-struct intelfpga_reset_s {
+struct intel_fpga_reset_s {
   uint32_t val;  // reset command, N/A
 };
 
 // Memory copy
-struct intelfpga_mcopy_s {
+struct intel_fpga_mcopy_s {
   void* src;    // source address
   void* dst;    // destination adddress
   size_t size;  // size in bytes
 };
 
 // Memory block
-struct intelfpga_memblk_s {
+struct intel_fpga_memblk_s {
   void* addr;   // base address
   size_t size;  // size in bytes
 };
 
 // Kernel
-struct intelfpga_kernel_s {
+struct intel_fpga_kernel_s {
   uint32_t kw;  // width
   uint32_t kh;  // height
   uint32_t ws;  // width stride(s)
@@ -63,7 +63,7 @@ struct intelfpga_kernel_s {
 };
 
 // Input parameters, nchw
-struct intelfpga_input_s {
+struct intel_fpga_input_s {
   uint32_t in;  // nbr of batch {1}
   uint32_t ic;  // nbr of channels {1}
   uint32_t iw;  // width
@@ -77,7 +77,7 @@ struct intelfpga_input_s {
 };
 
 // Output parameters, nchw
-struct intelfpga_output_s {
+struct intel_fpga_output_s {
   uint32_t on;  // nbr of batch {1}
   uint32_t oc;  // nbr of channels {1}
   uint32_t ow;  // width
@@ -85,20 +85,20 @@ struct intelfpga_output_s {
 };
 
 // Basic convolution
-struct intelfpga_conv_s {
-  uint32_t at;                  // activation type {0}, None=0, RELU=1
-  uint32_t ng;                  // nbr of groups {1}
-  int8_t* ia;                   // input address, INT8[N,Ci,Hi,Wi]
-  int8_t* ka;                   // kernel address, INT32[Co,Ci,Hk,Wk]
-  int32_t* ba;                  // bias address, INT32[Co,1]
-  int32_t* oa;                  // output address, INT32[N,Co,Ho,Wo]
-  struct intelfpga_input_s i;   // input
-  struct intelfpga_kernel_s k;  // kernel
-  struct intelfpga_output_s o;  // output
+struct intel_fpga_conv_s {
+  uint32_t at;                   // activation type {0}, None=0, RELU=1
+  uint32_t ng;                   // nbr of groups {1}
+  int8_t* ia;                    // input address, INT8[N,Ci,Hi,Wi]
+  int8_t* ka;                    // kernel address, INT8[Co,Ci,Hk,Wk]
+  int32_t* ba;                   // bias address, INT32[Co,1]
+  int32_t* oa;                   // output address, INT32[N,Co,Ho,Wo]
+  struct intel_fpga_input_s i;   // input
+  struct intel_fpga_kernel_s k;  // kernel
+  struct intel_fpga_output_s o;  // output
 };
 
 // Pooling convolution
-struct intelfpga_pool_s {
+struct intel_fpga_pool_s {
   uint32_t gp : 1;         // global pooling {0}
   uint32_t pm : 1;         // pooling mode {0}, Max=0, AVG=1
   uint32_t cm : 1;         // ceil mode {0}, ceil=0, floor=1
@@ -106,13 +106,13 @@ struct intelfpga_pool_s {
   uint32_t reserved : 28;  // reserved {0}
   int32_t* ia;             // input address, INT32[N,Ci,Hi,Wi]
   int32_t* oa;             // output address, INT32[N,Ci,Ho,Wo]
-  struct intelfpga_input_s i;   // input
-  struct intelfpga_kernel_s k;  // kernel
-  struct intelfpga_output_s o;  // output
+  struct intel_fpga_input_s i;   // input
+  struct intel_fpga_kernel_s k;  // kernel
+  struct intel_fpga_output_s o;  // output
 };
 
 // Full connection
-struct intelfpga_fcon_s {
+struct intel_fpga_fcon_s {
   uint32_t at;  // activation type {0}, None=0, RELU=1
   int8_t* ia;   // input address, INT8[M,K]
   int8_t* ka;   // kernel address, INT8[K,N]
@@ -122,65 +122,65 @@ struct intelfpga_fcon_s {
 };
 
 // Regisger access
-struct intelfpga_creg_s {
+struct intel_fpga_creg_s {
   uint32_t addr;
   uint32_t data;
 };
 
-#define INTELFPGA_MAGIC_ID (('A' + 'L' + 'T' + 'R') / 4)
+#define INTEL_FPGA_MAGIC_ID (('A' + 'L' + 'T' + 'R') / 4)
 
 /* Ioctls */
-#define INTELFPGA_IOCTL_MAKE(cmd) (_IO(INTELFPGA_MAGIC_ID, cmd))
-#define INTELFPGA_IOCTL_GET(cmd) (_IOC_NR(cmd))
-#define INTELFPGA_IOCTL_VALID(cmd) \
-  ((_IOC_TYPE(cmd) == INTELFPGA_MAGIC_ID) ? 1 : 0)
+#define INTEL_FPGA_IOCTL_MAKE(cmd) (_IO(INTEL_FPGA_MAGIC_ID, cmd))
+#define INTEL_FPGA_IOCTL_GET(cmd) (_IOC_NR(cmd))
+#define INTEL_FPGA_IOCTL_VALID(cmd) \
+  ((_IOC_TYPE(cmd) == INTEL_FPGA_MAGIC_ID) ? 1 : 0)
 
-#define INTELFPGA_CMD_INFO 0x00   // struct intelfpga_info_s
-#define INTELFPGA_CMD_RESET 0x01  // struct intelfpga_reset_s
+#define INTEL_FPGA_CMD_INFO 0x00   // struct intel_fpga_info_s
+#define INTEL_FPGA_CMD_RESET 0x01  // struct intel_fpga_reset_s
 
-#define INTELFPGA_CMD_MCOPY 0x10  // struct intelfpga_mcopy_s
-#define INTELFPGA_CMD_INVAL 0x11  // struct intelfpga_cache_s
-#define INTELFPGA_CMD_FLUSH 0x12  // struct intelfpga_cache_s
+#define INTEL_FPGA_CMD_MCOPY 0x10  // struct intel_fpga_mcopy_s
+#define INTEL_FPGA_CMD_INVAL 0x11  // struct intel_fpga_cache_s
+#define INTEL_FPGA_CMD_FLUSH 0x12  // struct intel_fpga_cache_s
 
-#define INTELFPGA_CMD_CONV 0x20  // struct intelfpga_conv_s
-#define INTELFPGA_CMD_POOL 0x21  // struct intelfpga_pool_s
-#define INTELFPGA_CMD_FCON 0x22  // struct intelfpga_fcon_s
+#define INTEL_FPGA_CMD_CONV 0x20  // struct intel_fpga_conv_s
+#define INTEL_FPGA_CMD_POOL 0x21  // struct intel_fpga_pool_s
+#define INTEL_FPGA_CMD_FCON 0x22  // struct intel_fpga_fcon_s
 
-#define INTELFPGA_CMD_REGRD 0xC0  // struct intelfpga_register_s
-#define INTELFPGA_CMD_REGWR 0xC1  // struct intelfpga_register_s
+#define INTEL_FPGA_CMD_REGRD 0xC0  // struct intel_fpga_creg_s
+#define INTEL_FPGA_CMD_REGWR 0xC1  // struct intel_fpga_creg_s
 
 //---------------------------------------------------------------------------
 
 // device open/close
-int intelfpga_open();
-void intelfpga_close();
+int intel_fpga_open();
+void intel_fpga_close();
 
-void intelfpga_reset(struct intelfpga_reset_s* args);
+void intel_fpga_reset(struct intel_fpga_reset_s* args);
 
 // memory management
-void* intelfpga_malloc(size_t size);
-void intelfpga_free(void* ptr);
+void* intel_fpga_malloc(size_t size);
+void intel_fpga_free(void* ptr);
 
-void* intelfpga_mbias(size_t size);
-void* intelfpga_mscale(size_t size);
-void* intelfpga_minput(size_t size);
-void* intelfpga_mkernel(size_t size);
-void* intelfpga_moutput(size_t size);
+void* intel_fpga_mbias(size_t size);
+void* intel_fpga_mscale(size_t size);
+void* intel_fpga_minput(size_t size);
+void* intel_fpga_mkernel(size_t size);
+void* intel_fpga_moutput(size_t size);
 
-void intelfpga_copy(void* dst, void* src, int size);
-int intelfpga_flush(void* addr, size_t size);
-int intelfpga_invalidate(void* addr, size_t size);
+void intel_fpga_copy(void* dst, void* src, int size);
+int intel_fpga_flush(void* addr, size_t size);
+int intel_fpga_invalidate(void* addr, size_t size);
 
 // device information
-int intelfpga_info(struct intelfpga_info_s* args);
+int intel_fpga_info(struct intel_fpga_info_s* args);
 
 // convolution process
-int intelfpga_conv(struct intelfpga_conv_s* args);
-int intelfpga_pooling(struct intelfpga_pool_s* args);
-int intelfpga_fullconnect(struct intelfpga_fcon_s* args);
+int intel_fpga_conv(struct intel_fpga_conv_s* args);
+int intel_fpga_pooling(struct intel_fpga_pool_s* args);
+int intel_fpga_fullconnect(struct intel_fpga_fcon_s* args);
 
-}  // namespace intelfpga
+}  // namespace intel_fpga
 }  // namespace lite
 }  // namespace paddle
 
-#endif  // _LLDRV_INTELFPGA_H_
+#endif  // _LLDRV_INTEL_FPGA_H_
diff --git a/lite/backends/intelfpga/lldrv/utils.cpp b/lite/backends/intel_fpga/lldrv/utils.cpp
similarity index 93%
rename from lite/backends/intelfpga/lldrv/utils.cpp
rename to lite/backends/intel_fpga/lldrv/utils.cpp
index 0ad6fb9836d..380e79e4d31 100644
--- a/lite/backends/intelfpga/lldrv/utils.cpp
+++ b/lite/backends/intel_fpga/lldrv/utils.cpp
@@ -17,11 +17,11 @@ limitations under the License. */
 #include <fstream>
 #include <string>
 
-#include "lite/backends/intelfpga/lldrv/utils.h"
+#include "lite/backends/intel_fpga/lldrv/utils.h"
 
 namespace paddle {
 namespace lite {
-namespace intelfpga {
+namespace intel_fpga {
 
 float find_max(const float* data, int size) {
   float max = 0.0;
@@ -67,6 +67,6 @@ void quantize_s32(const float* src, int32_t* dst, int size, float factor) {
     dst[i] = (int32_t)fdata;
   }
 }
-}  // namespace intelfpga
+}  // namespace intel_fpga
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/backends/intelfpga/lldrv/utils.h b/lite/backends/intel_fpga/lldrv/utils.h
similarity index 94%
rename from lite/backends/intelfpga/lldrv/utils.h
rename to lite/backends/intel_fpga/lldrv/utils.h
index d3883cc3e07..ad8e403afd8 100644
--- a/lite/backends/intelfpga/lldrv/utils.h
+++ b/lite/backends/intel_fpga/lldrv/utils.h
@@ -21,13 +21,13 @@ limitations under the License. */
 
 namespace paddle {
 namespace lite {
-namespace intelfpga {
+namespace intel_fpga {
 
 float find_max(const float* data, int size);
 
 void quantize_s8(const float* src, int8_t* dst, int size, float factor);
 void quantize_s32(const float* src, int32_t* dst, int size, float factor);
 
-}  // namespace intelfpga
+}  // namespace intel_fpga
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/backends/intelfpga/target_wrapper.cpp b/lite/backends/intel_fpga/target_wrapper.cpp
similarity index 85%
rename from lite/backends/intelfpga/target_wrapper.cpp
rename to lite/backends/intel_fpga/target_wrapper.cpp
index c2de3ff6bfb..0d567016c91 100644
--- a/lite/backends/intelfpga/target_wrapper.cpp
+++ b/lite/backends/intel_fpga/target_wrapper.cpp
@@ -12,19 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/backends/intelfpga/target_wrapper.h"
-#include "lite/backends/intelfpga/lldrv/intelfpgadrv.h"
+#include "lite/backends/intel_fpga/target_wrapper.h"
+#include "lite/backends/intel_fpga/lldrv/intelfpgadrv.h"
 #include "lite/utils/all.h"
 
 namespace paddle {
 namespace lite {
 
 void* TargetWrapper<TARGET(kIntelFPGA)>::Malloc(size_t size) {
-  return intelfpga::intelfpga_malloc(size);
+  return intel_fpga::intel_fpga_malloc(size);
 }
 
 void TargetWrapper<TARGET(kIntelFPGA)>::Free(void* ptr) {
-  intelfpga::intelfpga_free(ptr);
+  intel_fpga::intel_fpga_free(ptr);
 }
 
 void TargetWrapper<TARGET(kIntelFPGA)>::MemcpySync(void* dst,
diff --git a/lite/backends/intelfpga/target_wrapper.h b/lite/backends/intel_fpga/target_wrapper.h
similarity index 100%
rename from lite/backends/intelfpga/target_wrapper.h
rename to lite/backends/intel_fpga/target_wrapper.h
diff --git a/lite/backends/intelfpga/CMakeLists.txt b/lite/backends/intelfpga/CMakeLists.txt
deleted file mode 100644
index 1ee8eccae05..00000000000
--- a/lite/backends/intelfpga/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-if (NOT LITE_WITH_INTELFPGA)
-    return()
-endif()
-
-set(LITE_INTELFPGA_PATH "${PADDLE_SOURCE_DIR}/lite/backends/intelfpga")
-set(LITE_INTELFPGA_LLDRV_PATH "${PADDLE_SOURCE_DIR}/lite/backends/intelfpga/lldrv")
-
-message("intelfpga_path ${LITE_INTELFPGA_PATH}")
-file(GLOB INTELFPGA_CPP "${LITE_INTELFPGA_PATH}/*.cpp")
-file(GLOB LLDRV_CPP "${LITE_INTELFPGA_LLDRV_PATH}/*.cpp")
-message("intelfpga cpp: ${INTELFPGA_CPP}")
-set(INTELFPGA_ALL_CPP "")
-FOREACH(FILE_PATH ${LLDRV_CPP})
-    STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
-    list(APPEND INTELFPGA_ALL_CPP lldrv/${FILE_NAME})
-ENDFOREACH(FILE_PATH)
-FOREACH(FILE_PATH ${INTELFPGA_CPP})
-    STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
-    list(APPEND INTELFPGA_ALL_CPP ${FILE_NAME})
-ENDFOREACH(FILE_PATH)
-message("intelfpga src: ${INTELFPGA_ALL_CPP}")
-cc_library(kernel_intelfpga SRCS ${INTELFPGA_ALL_CPP})
-cc_library(intelfpga_target_wrapper SRCS target_wrapper.cpp DEPS kernel_intelfpga)
diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt
index 18ed6d7f9a8..08881d6f523 100644
--- a/lite/core/CMakeLists.txt
+++ b/lite/core/CMakeLists.txt
@@ -8,7 +8,7 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
   XPU_DEPS target_wrapper_xpu
   CL_DEPS cl_target_wrapper
   FPGA_DEPS fpga_target_wrapper
-  INTELFPGA_DEPS intelfpga_target_wrapper
+  INTEL_FPGA_DEPS intel_fpga_target_wrapper
   BM_DEPS target_wrapper_bm
   MLU_DEPS target_wrapper_mlu)
 
diff --git a/lite/core/context.h b/lite/core/context.h
index e8789d16ea7..6455a8b972a 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -328,7 +328,7 @@ class Context<TargetType::kFPGA> {
 };
 #endif
 
-#ifdef LITE_WITH_INTELFPGA
+#ifdef LITE_WITH_INTEL_FPGA
 // TODO(xbeu): add needed implementation to context
 template <>
 class Context<TargetType::kIntelFPGA> {
@@ -563,7 +563,7 @@ class ContextScheduler {
             &ctx->As<FPGAContext>());
         break;
 #endif
-#ifdef LITE_WITH_INTELFPGA
+#ifdef LITE_WITH_INTEL_FPGA
       case TARGET(kIntelFPGA):
         kernel_contexts_[TargetType::kIntelFPGA]
             .As<IntelFPGAContext>()
@@ -625,7 +625,7 @@ class ContextScheduler {
 #ifdef LITE_WITH_FPGA
     InitContext<TargetType::kFPGA, FPGAContext>();
 #endif
-#ifdef LITE_WITH_INTELFPGA
+#ifdef LITE_WITH_INTEL_FPGA
     InitContext<TargetType::kIntelFPGA, IntelFPGAContext>();
 #endif
 #ifdef LITE_WITH_NPU
diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt
index 52649cdc520..343da4968ae 100644
--- a/lite/kernels/CMakeLists.txt
+++ b/lite/kernels/CMakeLists.txt
@@ -16,4 +16,4 @@ add_subdirectory(bm)
 add_subdirectory(rknpu)
 add_subdirectory(huawei_ascend_npu)
 add_subdirectory(imagination_nna)
-add_subdirectory(intelfpga)
+add_subdirectory(intel_fpga)
diff --git a/lite/kernels/intel_fpga/CMakeLists.txt b/lite/kernels/intel_fpga/CMakeLists.txt
new file mode 100755
index 00000000000..276f4cb7e54
--- /dev/null
+++ b/lite/kernels/intel_fpga/CMakeLists.txt
@@ -0,0 +1,9 @@
+if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_INTEL_FPGA))
+    return()
+endif()
+
+set(intel_fpga_deps intel_fpga_target_wrapper kernel_intel_fpga)
+
+add_kernel(conv_depthwise_intel_fpga INTEL_FPGA basic SRCS conv_depthwise.cc DEPS ${intel_fpga_deps})
+add_kernel(conv_gemmlike_intel_fpga INTEL_FPGA basic SRCS conv_gemmlike.cc DEPS ${intel_fpga_deps})
+add_kernel(conv_compute_intel_fpga INTEL_FPGA basic SRCS conv_compute.cc DEPS ${intel_fpga_deps} conv_depthwise_intel_fpga conv_gemmlike_intel_fpga)
diff --git a/lite/kernels/intelfpga/conv_compute.cc b/lite/kernels/intel_fpga/conv_compute.cc
similarity index 92%
rename from lite/kernels/intelfpga/conv_compute.cc
rename to lite/kernels/intel_fpga/conv_compute.cc
index e0c75367bd2..763ca83c7a2 100644
--- a/lite/kernels/intelfpga/conv_compute.cc
+++ b/lite/kernels/intel_fpga/conv_compute.cc
@@ -12,17 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/kernels/intelfpga/conv_compute.h"
+#include "lite/kernels/intel_fpga/conv_compute.h"
 #include <utility>
 #include "lite/core/op_registry.h"
 #include "lite/core/type_system.h"
-#include "lite/kernels/intelfpga/conv_depthwise.h"
-#include "lite/kernels/intelfpga/conv_gemmlike.h"
+#include "lite/kernels/intel_fpga/conv_depthwise.h"
+#include "lite/kernels/intel_fpga/conv_gemmlike.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace intelfpga {
+namespace intel_fpga {
 #define PARAM_INIT                                                           \
   auto& param = this->Param<param_t>();                                      \
   auto w_dims = param.filter->dims();                                        \
@@ -73,13 +73,13 @@ void ConvCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
   is_first_epoch_ = false;
 }
 
-}  // namespace intelfpga
+}  // namespace intel_fpga
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
 
-typedef paddle::lite::kernels::intelfpga::ConvCompute<PRECISION(kFloat),
-                                                      PRECISION(kFloat)>
+typedef paddle::lite::kernels::intel_fpga::ConvCompute<PRECISION(kFloat),
+                                                       PRECISION(kFloat)>
     ConvFp32;
 
 REGISTER_LITE_KERNEL(conv2d, kIntelFPGA, kFloat, kNCHW, ConvFp32, def)
diff --git a/lite/kernels/intelfpga/conv_compute.h b/lite/kernels/intel_fpga/conv_compute.h
similarity index 96%
rename from lite/kernels/intelfpga/conv_compute.h
rename to lite/kernels/intel_fpga/conv_compute.h
index a9fd135e431..604972c2914 100644
--- a/lite/kernels/intelfpga/conv_compute.h
+++ b/lite/kernels/intel_fpga/conv_compute.h
@@ -20,7 +20,7 @@
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace intelfpga {
+namespace intel_fpga {
 
 template <PrecisionType Ptype, PrecisionType OutType>
 class ConvCompute : public KernelLite<TARGET(kIntelFPGA), Ptype> {
@@ -49,7 +49,7 @@ class ConvCompute : public KernelLite<TARGET(kIntelFPGA), Ptype> {
   KernelLite<TARGET(kARM), Ptype>* impl_{nullptr};
 };
 
-}  // namespace intelfpga
+}  // namespace intel_fpga
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/kernels/intelfpga/conv_depthwise.cc b/lite/kernels/intel_fpga/conv_depthwise.cc
similarity index 97%
rename from lite/kernels/intelfpga/conv_depthwise.cc
rename to lite/kernels/intel_fpga/conv_depthwise.cc
index 80cab07e848..96f5f3512a0 100644
--- a/lite/kernels/intelfpga/conv_depthwise.cc
+++ b/lite/kernels/intel_fpga/conv_depthwise.cc
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/kernels/intelfpga/conv_depthwise.h"
+#include "lite/kernels/intel_fpga/conv_depthwise.h"
 #include "lite/backends/arm/math/conv_block_utils.h"
 #include "lite/backends/arm/math/conv_impl.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace intelfpga {
+namespace intel_fpga {
 
 template <>
 void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::ReInitWhenNeeded() {}
@@ -122,7 +122,7 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
         w_scale_.data());
 }
 
-}  // namespace intelfpga
+}  // namespace intel_fpga
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/kernels/intelfpga/conv_depthwise.h b/lite/kernels/intel_fpga/conv_depthwise.h
similarity index 97%
rename from lite/kernels/intelfpga/conv_depthwise.h
rename to lite/kernels/intel_fpga/conv_depthwise.h
index 3f9bf657e02..7f1e2f31b47 100644
--- a/lite/kernels/intelfpga/conv_depthwise.h
+++ b/lite/kernels/intel_fpga/conv_depthwise.h
@@ -25,7 +25,7 @@
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace intelfpga {
+namespace intel_fpga {
 
 template <PrecisionType Ptype, PrecisionType Otype>
 class DepthwiseConv : public KernelLite<TARGET(kARM), Ptype> {
@@ -61,7 +61,7 @@ class DepthwiseConv : public KernelLite<TARGET(kARM), Ptype> {
   std::vector<float> w_scale_;
 };
 
-}  // namespace intelfpga
+}  // namespace intel_fpga
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/kernels/intelfpga/conv_gemmlike.cc b/lite/kernels/intel_fpga/conv_gemmlike.cc
similarity index 86%
rename from lite/kernels/intelfpga/conv_gemmlike.cc
rename to lite/kernels/intel_fpga/conv_gemmlike.cc
index 2131d2c032f..bc9b6f68014 100644
--- a/lite/kernels/intelfpga/conv_gemmlike.cc
+++ b/lite/kernels/intel_fpga/conv_gemmlike.cc
@@ -12,17 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/kernels/intelfpga/conv_gemmlike.h"
+#include "lite/kernels/intel_fpga/conv_gemmlike.h"
 #include <vector>
 #include "lite/backends/arm/math/gemm_prepacked_int8.h"
 #include "lite/backends/arm/math/packed_sgemm.h"
-#include "lite/backends/intelfpga/lldrv/intelfpgadrv.h"
-#include "lite/backends/intelfpga/lldrv/utils.h"
+#include "lite/backends/intel_fpga/lldrv/intelfpgadrv.h"
+#include "lite/backends/intel_fpga/lldrv/utils.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace intelfpga {
+namespace intel_fpga {
 
 template <>
 void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
@@ -67,7 +67,7 @@ void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
 
   if (kh > 1 && kw > 1) {
     int i, j, il, kl, ol, l, m, n, k;
-    lite::intelfpga::intelfpga_conv_s conv;
+    lite::intel_fpga::intel_fpga_conv_s conv;
 
     conv.at = static_cast<uint32_t>(param.activation_param.active_type);
     if (conv.at == 4) {
@@ -100,26 +100,26 @@ void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
     kl = conv.o.oc * conv.i.ic * conv.k.kh * conv.k.kw;
     ol = conv.o.on * conv.o.oc * conv.o.oh * conv.o.ow;
     conv.ia = static_cast<int8_t*>(
-        lite::intelfpga::intelfpga_minput(il * sizeof(int8_t)));
+        lite::intel_fpga::intel_fpga_minput(il * sizeof(int8_t)));
     conv.ka = static_cast<int8_t*>(
-        lite::intelfpga::intelfpga_mkernel(kl * sizeof(int8_t)));
+        lite::intel_fpga::intel_fpga_mkernel(kl * sizeof(int8_t)));
     conv.oa = static_cast<int32_t*>(
-        lite::intelfpga::intelfpga_moutput(ol * sizeof(int32_t)));
+        lite::intel_fpga::intel_fpga_moutput(ol * sizeof(int32_t)));
     if (conv.ia && conv.ka && conv.oa) {
-      float fd = lite::intelfpga::find_max(i_data, il);
-      float fw = lite::intelfpga::find_max(w_data, kl);
+      float fd = lite::intel_fpga::find_max(i_data, il);
+      float fw = lite::intel_fpga::find_max(w_data, kl);
 
       fd = 127.0 / fd;
       fw = 127.0 / fw;
 
       // y = 127.0 / fmax
       // y = x * scale;
-      lite::intelfpga::quantize_s8(i_data, conv.ia, il, fd);
-      lite::intelfpga::quantize_s8(w_data, conv.ka, kl, fw);
+      lite::intel_fpga::quantize_s8(i_data, conv.ia, il, fd);
+      lite::intel_fpga::quantize_s8(w_data, conv.ka, kl, fw);
 
       // perform conv2d
-      if (lite::intelfpga::intelfpga_conv(&conv)) {
-        std::cout << "intelfpga_conv error" << std::endl;
+      if (lite::intel_fpga::intel_fpga_conv(&conv)) {
+        std::cout << "intel_fpga_conv error" << std::endl;
       }
       // Convert int32 back to fp32, [n,c,h,w]
       // 1. y = x / scale
@@ -179,7 +179,7 @@ void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
   }
 }
 
-}  // namespace intelfpga
+}  // namespace intel_fpga
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/kernels/intelfpga/conv_gemmlike.h b/lite/kernels/intel_fpga/conv_gemmlike.h
similarity index 98%
rename from lite/kernels/intelfpga/conv_gemmlike.h
rename to lite/kernels/intel_fpga/conv_gemmlike.h
index 812271010c7..338a711983c 100644
--- a/lite/kernels/intelfpga/conv_gemmlike.h
+++ b/lite/kernels/intel_fpga/conv_gemmlike.h
@@ -26,7 +26,7 @@
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace intelfpga {
+namespace intel_fpga {
 
 template <PrecisionType Ptype, PrecisionType Otype>
 class GemmLikeConv : public KernelLite<TARGET(kARM), Ptype> {
@@ -106,7 +106,7 @@ class GemmLikeConv : public KernelLite<TARGET(kARM), Ptype> {
   int workspace_size_{0};
 };
 
-}  // namespace intelfpga
+}  // namespace intel_fpga
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/kernels/intelfpga/CMakeLists.txt b/lite/kernels/intelfpga/CMakeLists.txt
deleted file mode 100755
index 4f2fbe6d5d2..00000000000
--- a/lite/kernels/intelfpga/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_INTELFPGA))
-    return()
-endif()
-
-set(intelfpga_deps intelfpga_target_wrapper kernel_intelfpga)
-
-add_kernel(conv_depthwise_intelfpga INTELFPGA basic SRCS conv_depthwise.cc DEPS ${intelfpga_deps})
-add_kernel(conv_gemmlike_intelfpga INTELFPGA basic SRCS conv_gemmlike.cc DEPS ${intelfpga_deps})
-add_kernel(conv_compute_intelfpga INTELFPGA basic SRCS conv_compute.cc DEPS ${intelfpga_deps} conv_depthwise_intelfpga conv_gemmlike_intelfpga)
diff --git a/lite/tools/build_intel_fpga.sh b/lite/tools/build_intel_fpga.sh
index ef647df315c..53b22b0b085 100755
--- a/lite/tools/build_intel_fpga.sh
+++ b/lite/tools/build_intel_fpga.sh
@@ -25,7 +25,7 @@ OPTMODEL_DIR=""
 # options of compiling OPENCL lib.
 WITH_OPENCL=OFF
 # options of compiling intel fpga.
-WITH_INTELFPGA=ON
+WITH_INTEL_FPGA=ON
 # options of adding training ops
 WITH_TRAIN=OFF
 # num of threads used during compiling..
@@ -59,7 +59,7 @@ function init_cmake_mutable_options {
                         -DLITE_BUILD_TAILOR=$WITH_STRIP \
                         -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \
                         -DLITE_WITH_OPENCL=$WITH_OPENCL \
-                        -DLITE_WITH_INTELFPGA=$WITH_INTELFPGA \
+                        -DLITE_WITH_INTEL_FPGA=$WITH_INTEL_FPGA \
                         -DLITE_WITH_TRAIN=$WITH_TRAIN"
 }
 #####################################################################################################
@@ -290,8 +290,8 @@ function main {
                 shift
                 ;;
             # compiling lib which can operate on intel fpga.
-            --with_intelfpga=*)
-                WITH_INTELFPGA="${i#*=}"
+            --with_intel_fpga=*)
+                WITH_INTEL_FPGA="${i#*=}"
                 shift
                 ;;
             # ON or OFF, default OFF

From 9f0def80bdbadcb0892b592b5d447e88b5c7874f Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Thu, 18 Mar 2021 03:18:27 +0000
Subject: [PATCH 05/19] test=develop

---
 lite/tools/build_intel_fpga.sh | 324 ---------------------------------
 lite/tools/build_linux.sh      |   8 +
 2 files changed, 8 insertions(+), 324 deletions(-)
 delete mode 100755 lite/tools/build_intel_fpga.sh

diff --git a/lite/tools/build_intel_fpga.sh b/lite/tools/build_intel_fpga.sh
deleted file mode 100755
index 53b22b0b085..00000000000
--- a/lite/tools/build_intel_fpga.sh
+++ /dev/null
@@ -1,324 +0,0 @@
-#!/bin/bash
-set -e
-
-#####################################################################################################
-# 1. global variables, you can change them according to your requirements
-#####################################################################################################
-# armv7hf.
-ARCH=armv7hf
-# gcc or clang, default gcc.
-TOOLCHAIN=gcc
-# ON or OFF, default OFF.
-WITH_EXTRA=ON
-# controls whether to compile python lib, default is OFF.
-WITH_PYTHON=OFF
-PY_VERSION=""
-# controls whether to compile cv functions into lib, default is OFF.
-WITH_CV=OFF
-# controls whether to print log information, default is ON.
-WITH_LOG=OFF
-# controls whether to throw the exception when error occurs, default is OFF 
-WITH_EXCEPTION=OFF
-# options of striping lib according to input model.
-WITH_STRIP=OFF
-OPTMODEL_DIR=""
-# options of compiling OPENCL lib.
-WITH_OPENCL=OFF
-# options of compiling intel fpga.
-WITH_INTEL_FPGA=ON
-# options of adding training ops
-WITH_TRAIN=OFF
-# num of threads used during compiling..
-readonly NUM_PROC=${LITE_BUILD_THREADS:-4}
-#####################################################################################################
-
-#####################################################################################################
-# 2. local variables, these variables should not be changed.
-#####################################################################################################
-# url that stores third-party zip file to accelerate third-paty lib installation
-readonly THIRDPARTY_TAR=https://paddlelite-data.bj.bcebos.com/third_party_libs/third-party-ea5576.tar.gz
-# absolute path of Paddle-Lite.
-readonly workspace=$PWD/$(dirname $0)/../../
-# basic options for linux compiling.
-readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \
-                            -DLITE_WITH_ARM=ON \
-                            -DLITE_WITH_X86=OFF \
-                            -DARM_TARGET_OS=armlinux \
-                            -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
-                            -DWITH_TESTING=OFF"
-# mutable options for linux compiling.
-function init_cmake_mutable_options {
-    cmake_mutable_options="-DARM_TARGET_ARCH_ABI=$ARCH \
-                        -DARM_TARGET_LANG=$TOOLCHAIN \
-                        -DLITE_BUILD_EXTRA=$WITH_EXTRA \
-                        -DLITE_WITH_PYTHON=$WITH_PYTHON \
-                        -DPY_VERSION=$PY_VERSION \
-                        -DLITE_WITH_CV=$WITH_CV \
-                        -DLITE_WITH_LOG=$WITH_LOG \
-                        -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \
-                        -DLITE_BUILD_TAILOR=$WITH_STRIP \
-                        -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \
-                        -DLITE_WITH_OPENCL=$WITH_OPENCL \
-                        -DLITE_WITH_INTEL_FPGA=$WITH_INTEL_FPGA \
-                        -DLITE_WITH_TRAIN=$WITH_TRAIN"
-}
-#####################################################################################################
-
-####################################################################################################
-# 3. functions of prepare workspace before compiling
-####################################################################################################
-
-# 3.1 generate `__generated_code__.cc`, which is dependended by some targets in cmake.
-# here we fake an empty file to make cmake works.
-function prepare_workspace {
-    local root_dir=$1
-    local build_dir=$2
-    # in build directory
-    # 1. Prepare gen_code file
-    GEN_CODE_PATH_PREFIX=$build_dir/lite/gen_code
-    mkdir -p ${GEN_CODE_PATH_PREFIX}
-    touch ${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
-    # 2.Prepare debug tool
-    DEBUG_TOOL_PATH_PREFIX=$build_dir/lite/tools/debug
-    mkdir -p ${DEBUG_TOOL_PATH_PREFIX}
-    cp $root_dir/lite/tools/debug/analysis_tool.py ${DEBUG_TOOL_PATH_PREFIX}/
-}
-
-# 3.2 prepare source code of opencl lib
-# here we bundle all cl files into a cc file to bundle all opencl kernels into a single lib
-function prepare_opencl_source_code {
-    local root_dir=$1
-    local build_dir=$2
-    # in build directory
-    # Prepare opencl_kernels_source.cc file
-    GEN_CODE_PATH_OPENCL=$root_dir/lite/backends/opencl
-    rm -f GEN_CODE_PATH_OPENCL/opencl_kernels_source.cc
-    OPENCL_KERNELS_PATH=$root_dir/lite/backends/opencl/cl_kernel
-    mkdir -p ${GEN_CODE_PATH_OPENCL}
-    touch $GEN_CODE_PATH_OPENCL/opencl_kernels_source.cc
-    python $root_dir/lite/tools/cmake_tools/gen_opencl_code.py $OPENCL_KERNELS_PATH $GEN_CODE_PATH_OPENCL/opencl_kernels_source.cc 
-}
-
-# 3.3 prepare third_party libraries for compiling
-# here we store third_party libraries into Paddle-Lite/third-party
-function prepare_thirdparty {
-    if [ ! -d $workspace/third-party -o -f $workspace/third-party-ea5576.tar.gz ]; then
-        rm -rf $workspace/third-party
-        if [ ! -f $workspace/third-party-ea5576.tar.gz ]; then
-            wget $THIRDPARTY_TAR
-        fi
-        tar xzf third-party-ea5576.tar.gz
-    else
-        git submodule update --init --recursive
-    fi
-}
-####################################################################################################
-
-####################################################################################################
-# 4. compiling functions
-####################################################################################################
-
-# 4.1 function of tiny_publish compiling
-# here we only compile light_api lib
-function make_tiny_publish_so {
-    is_tiny=${1:-ON}
-    if [ "$WITH_PYTHON" = "ON" -a "$is_tiny" = "ON" ]; then
-        echo "Warning: build full_publish to use python."
-        is_tiny=OFF
-    fi
-    if [ "$WITH_TRAIN" = "ON" -a "$is_tiny" = "ON" ]; then
-        echo "Warning: build full_publish to add training ops."
-        is_tiny=OFF
-    fi
-    if [ "$BUILD_TAILOR" = "ON" -a "$OPTMODEL_DIR" = "" ]; then
-        echo "Error: set OPTMODEL_DIR if BUILD_TAILOR is ON."
-    fi
-
-    if [ "$is_tiny" = "OFF" ]; then
-        prepare_thirdparty
-    fi
-
-    build_dir=$workspace/build.lite.linux.$ARCH.$TOOLCHAIN
-    if [ "${WITH_OPENCL}" = "ON" ]; then
-       build_dir=${build_dir}.opencl
-    fi
-
-    if [ -d $build_dir ]; then
-        rm -rf $build_dir
-    fi
-    mkdir -p $build_dir
-    cd $build_dir
-
-    prepare_workspace $workspace $build_dir
-
-    if [ "${WITH_OPENCL}" = "ON" ]; then
-       prepare_opencl_source_code $workspace $build_dir
-    fi
-    if [ "${WITH_STRIP}" == "ON" ]; then
-        WITH_EXTRA=ON
-    fi
-
-    init_cmake_mutable_options
-    cmake $workspace \
-       ${CMAKE_COMMON_OPTIONS} \
-       ${cmake_mutable_options} \
-       -DLITE_ON_TINY_PUBLISH=$is_tiny
-
-    if [ "${WITH_OPENCL}" = "ON" ]; then
-       make opencl_clhpp -j$NUM_PROC 
-    fi
-
-    make publish_inference -j$NUM_PROC
-    cd - > /dev/null
-}
-####################################################################################################
-
-# 4.2 function of full_publish compiling
-# here we compile both light_api lib and full_api lib
-function make_full_publish_so {
-    make_tiny_publish_so OFF
-}
-####################################################################################################
-
-function print_usage {
-    echo "--------------------------------------------------------------------------------------------------------------------------------------------------------"
-    echo -e "| Methods of compiling Padddle-Lite Linux library:                                                                                                     |"
-    echo "--------------------------------------------------------------------------------------------------------------------------------------------------------"
-    echo -e "|  compile linux library: (armv8, gcc)                                                                                                                 |"
-    echo -e "|     ./lite/tools/build_linux.sh                                                                                                                      |"
-    echo -e "|  print help information:                                                                                                                             |"
-    echo -e "|     ./lite/tools/build_linux.sh help                                                                                                                 |"
-    echo -e "|                                                                                                                                                      |"
-    echo -e "|  optional argument:                                                                                                                                  |"
-    echo -e "|     --arch: (armv8|armv7hf|armv7), default is armv8                                                                                                  |"
-    echo -e "|     --toolchain: (gcc|clang), defalut is gcc                                                                                                         |"
-    echo -e "|     --with_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP), default is OFF  |"
-    echo -e "|     --with_python: (OFF|ON); controls whether to build python lib or whl, default is OFF                                                             |"
-    echo -e "|     --python_version: (2.7|3.5|3.7); controls python version to compile whl, default is None                                                         |"
-    echo -e "|     --with_cv: (OFF|ON); controls whether to compile cv functions into lib, default is OFF                                                           |"
-    echo -e "|     --with_log: (OFF|ON); controls whether to print log information, default is ON                                                                   |"
-    echo -e "|     --with_exception: (OFF|ON); controls whether to throw the exception when error occurs, default is OFF                                            |"
-    echo -e "|                                                                                                                                                      |"
-    echo -e "|  arguments of striping lib according to input model:                                                                                                 |"
-    echo -e "|     ./lite/tools/build_linux.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir                                                                |"
-    echo -e "|     --with_strip: (OFF|ON); controls whether to strip lib accrding to input model, default is OFF                                                    |"
-    echo -e "|     --opt_model_dir: (absolute path to optimized model dir) required when compiling striped library                                                  |"
-    echo -e "|  detailed information about striping lib:  https://paddle-lite.readthedocs.io/zh/latest/user_guides/library_tailoring.html                           |"
-    echo -e "|                                                                                                                                                      |"
-    echo -e "|  arguments of opencl library compiling:                                                                                                              |"
-    echo -e "|     ./lite/tools/build_linux.sh --with_opencl=ON                                                                                                     |"
-    echo -e "|     --with_opencl: (OFF|ON); controls whether to compile lib for opencl, default is OFF                                                              |"
-    echo -e "|                                                                                                                                                      |"
-    echo -e "|  arguments of rockchip npu library compiling:                                                                                                        |"
-    echo -e "|     ./lite/tools/build_linux.sh --with_rockchip_npu=ON --rockchip_npu_sdk_root=YourRockchipNpuSdkPath                                                |"
-    echo -e "|     --with_rockchip_npu: (OFF|ON); controls whether to compile lib for rockchip_npu, default is OFF                                                  |"
-    echo -e "|     --rockchip_npu_sdk_root: (path to rockchip_npu DDK file) required when compiling rockchip_npu library                                            |"
-    echo -e "|             you can download rockchip NPU SDK from:  https://github.com/airockchip/rknpu_ddk.git                                                     |"
-    echo -e "|  detailed information about Paddle-Lite RKNPU:  https://paddle-lite.readthedocs.io/zh/latest/demo_guides/rockchip_npu.html                           |"
-    echo -e "|                                                                                                                                                      |"
-    echo -e "|  arguments of baidu xpu library compiling:                                                                                                           |"
-    echo -e "|     ./lite/tools/build_linux.sh --with_baidu_xpu=ON --baidu_xpu_sdk_root=YourBaiduXpuSdkPath                                                         |"
-    echo -e "|     --with_baidu_xpu: (OFF|ON); controls whether to compile lib for baidu_xpu, default is OFF                                                        |"
-    echo -e "|     --baidu_xpu_sdk_root: (path to baidu_xpu DDK file) required when compiling baidu_xpu library                                                     |"
-    echo "--------------------------------------------------------------------------------------------------------------------------------------------------------"
-    echo
-}
-
-function main {
-    if [ -z "$1" ]; then
-        # compiling result contains light_api lib only, recommanded.
-        make_tiny_publish_so
-        exit 0
-    fi
-
-    # Parse command line.
-    for i in "$@"; do
-        case $i in
-            # armv8 or armv7hf or armv7, default armv8
-            --arch=*)
-                ARCH="${i#*=}"
-                shift
-                ;;
-            # gcc or clang, default gcc
-            --toolchain=*)
-                TOOLCHAIN="${i#*=}"
-                shift
-                ;;
-            # ON or OFF, default OFF
-            --with_extra=*)
-                WITH_EXTRA="${i#*=}"
-                shift
-                ;;
-            # ON or OFF, default OFF
-            --with_python=*)
-                WITH_PYTHON="${i#*=}"
-                shift
-                ;;
-            # 2.7 or 3.5 or 3.7, default is None
-            --python_version=*)
-                PY_VERSION="${i#*=}"
-                shift
-                ;;
-            # ON or OFF, default OFF
-            --with_cv=*)
-                WITH_CV="${i#*=}"
-                shift
-                ;;
-            # ON or OFF, default ON
-            --with_log=*)
-                WITH_LOG="${i#*=}"
-                shift
-                ;;
-            # ON or OFF, default OFF
-            --with_exception=*)
-                WITH_EXCEPTION="${i#*=}"
-                shift
-                ;;
-            # ON or OFF, default OFF
-            --with_strip=*)
-                BUILD_TAILOR="${i#*=}"
-                shift
-                ;;
-            # string, absolute path to optimized model dir
-            --opt_model_dir=*)
-                OPTMODEL_DIR="${i#*=}"
-                shift
-                ;;
-            # compiling lib which can operate on opencl and cpu.
-            --with_opencl=*)
-                WITH_OPENCL="${i#*=}"
-                shift
-                ;;
-            # compiling lib which can operate on intel fpga.
-            --with_intel_fpga=*)
-                WITH_INTEL_FPGA="${i#*=}"
-                shift
-                ;;
-            # ON or OFF, default OFF
-            --with_train=*)
-                WITH_TRAIN="${i#*=}"
-                shift
-                ;;
-            # compiling result contains both light_api and cxx_api lib.
-            full_publish)
-                make_full_publish_so
-                exit 0
-                ;;
-            # print help info
-            help)
-                print_usage
-                exit 0
-                ;;
-            # unknown option
-            *)
-                echo "Error: unsupported argument \"${i#*=}\""
-                print_usage
-                exit 1
-                ;;
-        esac
-    done
-    # compiling result contains light_api lib only, recommanded.
-    make_tiny_publish_so
-}
-
-main $@
diff --git a/lite/tools/build_linux.sh b/lite/tools/build_linux.sh
index 68745c8ca34..0857df30bbd 100755
--- a/lite/tools/build_linux.sh
+++ b/lite/tools/build_linux.sh
@@ -33,6 +33,8 @@ IMAGINATION_NNA_SDK_ROOT="$(pwd)/imagination_nna_sdk"
 # options of compiling baidu XPU lib.
 WITH_BAIDU_XPU=OFF
 BAIDU_XPU_SDK_ROOT=""
+# options of compiling intel fpga.
+WITH_INTEL_FPGA=OFF
 # options of adding training ops
 WITH_TRAIN=OFF
 # num of threads used during compiling..
@@ -75,6 +77,7 @@ function init_cmake_mutable_options {
                         -DXPU_SDK_ROOT=$BAIDU_XPU_SDK_ROOT \
                         -DLITE_WITH_TRAIN=$WITH_TRAIN  \
                         -DLITE_WITH_IMAGINATION_NNA=$WITH_IMAGINATION_NNA \
+                        -DLITE_WITH_INTEL_FPGA=$WITH_INTEL_FPGA \
                         -DIMAGINATION_NNA_SDK_ROOT=${IMAGINATION_NNA_SDK_ROOT}"
 
 }
@@ -341,6 +344,11 @@ function main {
                 BAIDU_XPU_SDK_ROOT="${i#*=}"
                 shift
                 ;;
+            # compiling lib which can operate on intel fpga.
+            --with_intel_fpga=*)
+                WITH_INTEL_FPGA="${i#*=}"
+                shift
+                ;;
             # ON or OFF, default OFF
             --with_train=*)
                 WITH_TRAIN="${i#*=}"

From a9685c4c54ea0eeaaefed2d487d60dc97db35cd4 Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Mon, 22 Mar 2021 09:12:50 +0000
Subject: [PATCH 06/19] test=develop

---
 CMakeLists.txt                                |   4 +
 cmake/device/intel_fpga.cmake                 |  48 +++++
 lite/backends/intel_fpga/CMakeLists.txt       |  20 +-
 .../intel_fpga/lldrv/intelfpgadrv.cpp         | 192 ------------------
 lite/backends/intel_fpga/lldrv/intelfpgadrv.h | 186 -----------------
 lite/backends/intel_fpga/lldrv/utils.cpp      |  72 -------
 lite/backends/intel_fpga/lldrv/utils.h        |  33 ---
 lite/backends/intel_fpga/target_wrapper.cpp   |   7 +-
 lite/backends/intel_fpga/target_wrapper.h     |   1 +
 lite/kernels/intel_fpga/CMakeLists.txt        |   6 +-
 lite/tools/build_linux.sh                     |  14 +-
 11 files changed, 70 insertions(+), 513 deletions(-)
 create mode 100644 cmake/device/intel_fpga.cmake
 delete mode 100644 lite/backends/intel_fpga/lldrv/intelfpgadrv.cpp
 delete mode 100644 lite/backends/intel_fpga/lldrv/intelfpgadrv.h
 delete mode 100644 lite/backends/intel_fpga/lldrv/utils.cpp
 delete mode 100644 lite/backends/intel_fpga/lldrv/utils.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 12deaf69752..2960fb0b44d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -185,6 +185,10 @@ if(LITE_WITH_IMAGINATION_NNA)
 	include(device/imagination_nna)
 endif()
 
+if(LITE_WITH_INTEL_FPGA)
+	include(device/intel_fpga)
+endif()
+
 # flatbuffer module for loading model
 if(LITE_UPDATE_FBS_HEAD)
     include(external/flatbuffers)
diff --git a/cmake/device/intel_fpga.cmake b/cmake/device/intel_fpga.cmake
new file mode 100644
index 00000000000..498f58bfbdc
--- /dev/null
+++ b/cmake/device/intel_fpga.cmake
@@ -0,0 +1,48 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT LITE_WITH_INTEL_FPGA)
+  return()
+endif()
+
+if(NOT DEFINED INTEL_FPGA_SDK_ROOT)
+  set(INTEL_FPGA_SDK_ROOT $ENV{INTEL_FPGA_SDK_ROOT})
+  if(NOT INTEL_FPGA_SDK_ROOT)
+    message(FATAL_ERROR "Must set INTEL_FPGA_SDK_ROOT or env INTEL_FPGA_SDK_ROOT when LITE_WITH_INTEL_FPGA=ON")
+  endif()
+endif()
+
+message(STATUS "INTEL_FPGA_SDK_ROOT: ${INTEL_FPGA_SDK_ROOT}")
+
+set(INTEL_FPGA_SDK_INC  "${INTEL_FPGA_SDK_ROOT}/include")
+set(INTEL_FPGA_SDK_LIB  "${INTEL_FPGA_SDK_ROOT}/lib/libvnna.so")
+
+include_directories("${INTEL_FPGA_SDK_INC}")
+
+find_library(INTEL_FPGA_LIB_FILE NAMES vnna
+  PATHS ${INTEL_FPGA_SDK_ROOT}/lib)
+
+if(NOT INTEL_FPGA_LIB_FILE)
+  message(FATAL_ERROR "Can not find INTEL_FPGA_LIB_FILE in ${INTEL_FPGA_SDK_ROOT}")
+else()
+  message(STATUS "Found INTEL_FPGA VNNA Library: ${INTEL_FPGA_LIB_FILE}")
+  add_library(vnna SHARED IMPORTED)
+  set_property(TARGET vnna PROPERTY IMPORTED_LOCATION ${INTEL_FPGA_LIB_FILE})
+endif()
+
+#link_directories("${INTEL_FPGA_SDK_ROOT}/lib")
+#add_library(vnna SHARED IMPORTED GLOBAL)
+#set_property(TARGET vnna PROPERTY IMPORTED_LOCATION ${INTEL_FPGA_SDK_LIB})
+
+set(intel_fpga_runtime_libs vnna CACHE INTERNAL "intel fpga sdk runtime libs")
diff --git a/lite/backends/intel_fpga/CMakeLists.txt b/lite/backends/intel_fpga/CMakeLists.txt
index c47a33be007..24a8044d240 100644
--- a/lite/backends/intel_fpga/CMakeLists.txt
+++ b/lite/backends/intel_fpga/CMakeLists.txt
@@ -2,22 +2,4 @@ if (NOT LITE_WITH_INTEL_FPGA)
     return()
 endif()
 
-set(LITE_INTEL_FPGA_PATH "${PADDLE_SOURCE_DIR}/lite/backends/intel_fpga")
-set(LITE_INTEL_FPGA_LLDRV_PATH "${PADDLE_SOURCE_DIR}/lite/backends/intel_fpga/lldrv")
-
-message("intel_fpga_path ${LITE_INTEL_FPGA_PATH}")
-file(GLOB INTEL_FPGA_CPP "${LITE_INTEL_FPGA_PATH}/*.cpp")
-file(GLOB LLDRV_CPP "${LITE_INTEL_FPGA_LLDRV_PATH}/*.cpp")
-message("intel_fpga cpp: ${INTEL_FPGA_CPP}")
-set(INTEL_FPGA_ALL_CPP "")
-FOREACH(FILE_PATH ${LLDRV_CPP})
-    STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
-    list(APPEND INTEL_FPGA_ALL_CPP lldrv/${FILE_NAME})
-ENDFOREACH(FILE_PATH)
-FOREACH(FILE_PATH ${INTELFPGA_CPP})
-    STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
-    list(APPEND INTEL_FPGA_ALL_CPP ${FILE_NAME})
-ENDFOREACH(FILE_PATH)
-message("intel_fpga src: ${INTEL_FPGA_ALL_CPP}")
-cc_library(kernel_intel_fpga SRCS ${INTEL_FPGA_ALL_CPP})
-cc_library(intel_fpga_target_wrapper SRCS target_wrapper.cpp DEPS kernel_intel_fpga)
+lite_cc_library(intel_fpga_target_wrapper SRCS target_wrapper.cpp DEPS ${intel_fpga_runtime_libs})
diff --git a/lite/backends/intel_fpga/lldrv/intelfpgadrv.cpp b/lite/backends/intel_fpga/lldrv/intelfpgadrv.cpp
deleted file mode 100644
index cc188e483cf..00000000000
--- a/lite/backends/intel_fpga/lldrv/intelfpgadrv.cpp
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Copyright (c) 2020 AWCloud. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fcntl.h>
-#include <stdio.h>
-#include <sys/ioctl.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <algorithm>
-#include <cstring>
-#include <map>
-#include <utility>
-
-#include "lite/backends/intel_fpga/lldrv/intelfpgadrv.h"
-
-namespace paddle {
-namespace lite {
-namespace intel_fpga {
-
-/// FD of intel_fpga
-static int intel_fpga_fd = -1;
-
-/// Memory blocks
-static struct intel_fpga_memblk_s mb, ms, mi, mk, mo;
-
-int intel_fpga_open() {
-  if (intel_fpga_fd < 0) {
-    intel_fpga_fd = open("/dev/intelfpgadrv0", O_RDWR);
-    if (intel_fpga_fd < 0) {
-      return -1;
-    }
-    memset(&mb, 0, sizeof(mb));
-    memset(&ms, 0, sizeof(ms));
-    memset(&mi, 0, sizeof(mi));
-    memset(&mk, 0, sizeof(mk));
-    memset(&mo, 0, sizeof(mo));
-  }
-
-  return 0;
-}
-
-void intel_fpga_close() {
-  if (intel_fpga_fd < 0) return;
-
-  if (mb.addr) {
-    free(mb.addr);
-  }
-  if (ms.addr) {
-    free(ms.addr);
-  }
-  if (mi.addr) {
-    free(mi.addr);
-  }
-  if (mk.addr) {
-    free(mk.addr);
-  }
-  if (mo.addr) {
-    free(mo.addr);
-  }
-  close(intel_fpga_fd);
-  intel_fpga_fd = -1;
-}
-
-/// memory management;
-void* intel_fpga_malloc(size_t size) { return malloc(size); }
-
-void intel_fpga_free(void* ptr) { free(ptr); }
-
-void* intel_fpga_mbias(size_t size) {
-  if (mb.addr) {
-    if (mb.size >= size) {
-      return mb.addr;
-    }
-    free(mb.addr);
-  }
-  mb.addr = malloc(size);
-  if (mb.addr) {
-    mb.size = size;
-  }
-  return mb.addr;
-}
-
-void* intel_fpga_mscale(size_t size) {
-  if (ms.addr) {
-    if (ms.size >= size) {
-      return ms.addr;
-    }
-    free(ms.addr);
-  }
-  ms.addr = malloc(size);
-  if (ms.addr) {
-    ms.size = size;
-  }
-
-  return ms.addr;
-}
-
-void* intel_fpga_minput(size_t size) {
-  if (mi.addr) {
-    if (mi.size >= size) {
-      return mi.addr;
-    }
-    free(mi.addr);
-  }
-  mi.addr = malloc(size);
-  if (mi.addr) {
-    mi.size = size;
-  }
-
-  return mi.addr;
-}
-
-void* intel_fpga_mkernel(size_t size) {
-  if (mk.addr) {
-    if (mk.size >= size) {
-      return mk.addr;
-    }
-    free(mk.addr);
-  }
-  mk.addr = malloc(size);
-  if (mk.addr) {
-    mk.size = size;
-  }
-
-  return mk.addr;
-}
-
-void* intel_fpga_moutput(size_t size) {
-  if (mo.addr) {
-    if (mo.size >= size) {
-      return mo.addr;
-    }
-    free(mo.addr);
-  }
-  mo.addr = malloc(size);
-  if (mo.addr) {
-    mo.size = size;
-  }
-
-  return mo.addr;
-}
-
-void intel_fpga_copy(void* dst, void* src, int size) { memcpy(dst, src, size); }
-
-int intel_fpga_info(struct intel_fpga_info_s* args) {
-  int cmd = INTEL_FPGA_IOCTL_MAKE(INTEL_FPGA_CMD_INFO);
-
-  if (intel_fpga_open()) return -1;
-
-  return ioctl(intel_fpga_fd, cmd, args);
-}
-
-int intel_fpga_conv(struct intel_fpga_conv_s* args) {
-  int cmd = INTEL_FPGA_IOCTL_MAKE(INTEL_FPGA_CMD_CONV);
-
-  if (intel_fpga_open()) return -1;
-
-  return ioctl(intel_fpga_fd, cmd, args);
-}
-
-int intel_fpga_pooling(struct intel_fpga_pool_s* args) {
-  int cmd = INTEL_FPGA_IOCTL_MAKE(INTEL_FPGA_CMD_POOL);
-
-  if (intel_fpga_open()) return -1;
-
-  return ioctl(intel_fpga_fd, cmd, args);
-}
-
-int intel_fpga_fullconnect(struct intel_fpga_fcon_s* args) {
-  int cmd = INTEL_FPGA_IOCTL_MAKE(INTEL_FPGA_CMD_FCON);
-
-  if (intel_fpga_open()) return -1;
-
-  return ioctl(intel_fpga_fd, cmd, args);
-}
-
-}  // namespace intel_fpga
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/intel_fpga/lldrv/intelfpgadrv.h b/lite/backends/intel_fpga/lldrv/intelfpgadrv.h
deleted file mode 100644
index 0a162e7af9d..00000000000
--- a/lite/backends/intel_fpga/lldrv/intelfpgadrv.h
+++ /dev/null
@@ -1,186 +0,0 @@
-/* Copyright (c) 2020 AWCloud. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef _LLDRV_INTEL_FPGA_H_
-#define _LLDRV_INTEL_FPGA_H_
-
-#pragma once
-
-#include <stdint.h>
-#include <cstddef>
-#include <iostream>
-#include <limits>
-
-namespace paddle {
-namespace lite {
-namespace intel_fpga {
-
-// Activation type
-enum intel_fpga_act_e {
-  ACT_NONE = 0,
-  ACT_RELU = 1,
-};
-
-// Device information
-struct intel_fpga_info_s {
-  uint32_t ver;  // Version, 00.00.0000
-};
-
-struct intel_fpga_reset_s {
-  uint32_t val;  // reset command, N/A
-};
-
-// Memory copy
-struct intel_fpga_mcopy_s {
-  void* src;    // source address
-  void* dst;    // destination adddress
-  size_t size;  // size in bytes
-};
-
-// Memory block
-struct intel_fpga_memblk_s {
-  void* addr;   // base address
-  size_t size;  // size in bytes
-};
-
-// Kernel
-struct intel_fpga_kernel_s {
-  uint32_t kw;  // width
-  uint32_t kh;  // height
-  uint32_t ws;  // width stride(s)
-  uint32_t hs;  // height stride(s)
-};
-
-// Input parameters, nchw
-struct intel_fpga_input_s {
-  uint32_t in;  // nbr of batch {1}
-  uint32_t ic;  // nbr of channels {1}
-  uint32_t iw;  // width
-  uint32_t ih;  // height
-  uint32_t pl;  // padding x in bytes {0}
-  uint32_t pr;  // padding x in bytes {0}
-  uint32_t pt;  // padding y in bytes {0}
-  uint32_t pb;  // padding y in bytes {0}
-  uint32_t dx;  // dilation for x {1}
-  uint32_t dy;  // dilation for y {1}
-};
-
-// Output parameters, nchw
-struct intel_fpga_output_s {
-  uint32_t on;  // nbr of batch {1}
-  uint32_t oc;  // nbr of channels {1}
-  uint32_t ow;  // width
-  uint32_t oh;  // height
-};
-
-// Basic convolution
-struct intel_fpga_conv_s {
-  uint32_t at;                   // activation type {0}, None=0, RELU=1
-  uint32_t ng;                   // nbr of groups {1}
-  int8_t* ia;                    // input address, INT8[N,Ci,Hi,Wi]
-  int8_t* ka;                    // kernel address, INT32[Co,Ci,Hk,Wk]
-  int32_t* ba;                   // bias address, INT32[Co,1]
-  int32_t* oa;                   // output address, INT32[N,Co,Ho,Wo]
-  struct intel_fpga_input_s i;   // input
-  struct intel_fpga_kernel_s k;  // kernel
-  struct intel_fpga_output_s o;  // output
-};
-
-// Pooling convolution
-struct intel_fpga_pool_s {
-  uint32_t gp : 1;         // global pooling {0}
-  uint32_t pm : 1;         // pooling mode {0}, Max=0, AVG=1
-  uint32_t cm : 1;         // ceil mode {0}, ceil=0, floor=1
-  uint32_t ex : 1;         // exclusive {1}, if ignore padding in avg pooling
-  uint32_t reserved : 28;  // reserved {0}
-  int32_t* ia;             // input address, INT32[N,Ci,Hi,Wi]
-  int32_t* oa;             // output address, INT32[N,Ci,Ho,Wo]
-  struct intel_fpga_input_s i;   // input
-  struct intel_fpga_kernel_s k;  // kernel
-  struct intel_fpga_output_s o;  // output
-};
-
-// Full connection
-struct intel_fpga_fcon_s {
-  uint32_t at;  // activation type {0}, None=0, RELU=1
-  int8_t* ia;   // input address, INT8[M,K]
-  int8_t* ka;   // kernel address, INT8[K,N]
-  int32_t* ba;  // bias address, INT32[M,N]
-  int32_t* oa;  // output address, INT32[M,N] = ia[M,K] * wa[K,N] + ba[M,N]
-  int m, n, k;  // dims
-};
-
-// Regisger access
-struct intel_fpga_creg_s {
-  uint32_t addr;
-  uint32_t data;
-};
-
-#define INTEL_FPGA_MAGIC_ID (('A' + 'L' + 'T' + 'R') / 4)
-
-/* Ioctls */
-#define INTEL_FPGA_IOCTL_MAKE(cmd) (_IO(INTEL_FPGA_MAGIC_ID, cmd))
-#define INTEL_FPGA_IOCTL_GET(cmd) (_IOC_NR(cmd))
-#define INTEL_FPGA_IOCTL_VALID(cmd) \
-  ((_IOC_TYPE(cmd) == INTEL_FPGA_MAGIC_ID) ? 1 : 0)
-
-#define INTEL_FPGA_CMD_INFO 0x00   // struct intel_fpga_info_s
-#define INTEL_FPGA_CMD_RESET 0x01  // struct intel_fpga_reset_s
-
-#define INTEL_FPGA_CMD_MCOPY 0x10  // struct intel_fpga_mcopy_s
-#define INTEL_FPGA_CMD_INVAL 0x11  // struct intel_fpga_cache_s
-#define INTEL_FPGA_CMD_FLUSH 0x12  // struct intel_fpga_cache_s
-
-#define INTEL_FPGA_CMD_CONV 0x20  // struct intel_fpga_conv_s
-#define INTEL_FPGA_CMD_POOL 0x21  // struct intel_fpga_pool_s
-#define INTEL_FPGA_CMD_FCON 0x22  // struct intel_fpga_fcon_s
-
-#define INTEL_FPGA_CMD_REGRD 0xC0  // struct intel_fpga_register_s
-#define INTEL_FPGA_CMD_REGWR 0xC1  // struct intel_fpga_register_s
-
-//---------------------------------------------------------------------------
-
-// device open/close
-int intel_fpga_open();
-void intel_fpga_close();
-
-void intel_fpga_reset(struct intel_fpga_reset_s* args);
-
-// memory management
-void* intel_fpga_malloc(size_t size);
-void intel_fpga_free(void* ptr);
-
-void* intel_fpga_mbias(size_t size);
-void* intel_fpga_mscale(size_t size);
-void* intel_fpga_minput(size_t size);
-void* intel_fpga_mkernel(size_t size);
-void* intel_fpga_moutput(size_t size);
-
-void intel_fpga_copy(void* dst, void* src, int size);
-int intel_fpga_flush(void* addr, size_t size);
-int intel_fpga_invalidate(void* addr, size_t size);
-
-// device information
-int intel_fpga_info(struct intel_fpga_info_s* args);
-
-// convolution process
-int intel_fpga_conv(struct intel_fpga_conv_s* args);
-int intel_fpga_pooling(struct intel_fpga_pool_s* args);
-int intel_fpga_fullconnect(struct intel_fpga_fcon_s* args);
-
-}  // namespace intel_fpga
-}  // namespace lite
-}  // namespace paddle
-
-#endif  // _LLDRV_INTEL_FPGA_H_
diff --git a/lite/backends/intel_fpga/lldrv/utils.cpp b/lite/backends/intel_fpga/lldrv/utils.cpp
deleted file mode 100644
index 380e79e4d31..00000000000
--- a/lite/backends/intel_fpga/lldrv/utils.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright (c) 2020 AWCloud. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory.h>
-#include <algorithm>
-#include <fstream>
-#include <string>
-
-#include "lite/backends/intel_fpga/lldrv/utils.h"
-
-namespace paddle {
-namespace lite {
-namespace intel_fpga {
-
-float find_max(const float* data, int size) {
-  float max = 0.0;
-
-  for (size_t i = 0; i < size; ++i) {
-    float value = data[i];
-    float abs = value > 0.0 ? value : -value;
-
-    max = std::max(max, abs);
-  }
-
-  return max;
-}
-
-void quantize_s8(const float* src, int8_t* dst, int size, float factor) {
-  float fdata;
-
-  for (size_t i = 0; i < size; i++) {
-    fdata = src[i] * factor;
-
-    if (fdata < 0.0) {
-      fdata -= 0.5;
-    } else {
-      fdata += 0.5;
-    }
-
-    dst[i] = (int8_t)fdata;
-  }
-}
-
-void quantize_s32(const float* src, int32_t* dst, int size, float factor) {
-  float fdata;
-
-  for (size_t i = 0; i < size; i++) {
-    fdata = src[i] * factor;
-
-    if (fdata < 0.0) {
-      fdata -= 0.5;
-    } else {
-      fdata += 0.5;
-    }
-
-    dst[i] = (int32_t)fdata;
-  }
-}
-}  // namespace intel_fpga
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/intel_fpga/lldrv/utils.h b/lite/backends/intel_fpga/lldrv/utils.h
deleted file mode 100644
index ad8e403afd8..00000000000
--- a/lite/backends/intel_fpga/lldrv/utils.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) 2020 AWCloud. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cstdint>
-#include <cstdlib>
-#include <cwchar>
-#include <vector>
-
-namespace paddle {
-namespace lite {
-namespace intel_fpga {
-
-float find_max(const float* data, int size);
-
-void quantize_s8(const float* src, int8_t* dst, int size, float factor);
-void quantize_s32(const float* src, int32_t* dst, int size, float factor);
-
-}  // namespace intel_fpga
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/intel_fpga/target_wrapper.cpp b/lite/backends/intel_fpga/target_wrapper.cpp
index 0d567016c91..89d6bee61d5 100644
--- a/lite/backends/intel_fpga/target_wrapper.cpp
+++ b/lite/backends/intel_fpga/target_wrapper.cpp
@@ -13,19 +13,16 @@
 // limitations under the License.
 
 #include "lite/backends/intel_fpga/target_wrapper.h"
-#include "lite/backends/intel_fpga/lldrv/intelfpgadrv.h"
 #include "lite/utils/all.h"
 
 namespace paddle {
 namespace lite {
 
 void* TargetWrapper<TARGET(kIntelFPGA)>::Malloc(size_t size) {
-  return intel_fpga::intel_fpga_malloc(size);
+  return intelfpga_malloc(size);
 }
 
-void TargetWrapper<TARGET(kIntelFPGA)>::Free(void* ptr) {
-  intel_fpga::intel_fpga_free(ptr);
-}
+void TargetWrapper<TARGET(kIntelFPGA)>::Free(void* ptr) { intelfpga_free(ptr); }
 
 void TargetWrapper<TARGET(kIntelFPGA)>::MemcpySync(void* dst,
                                                    const void* src,
diff --git a/lite/backends/intel_fpga/target_wrapper.h b/lite/backends/intel_fpga/target_wrapper.h
index ee60348f10f..e91bc7c5f6e 100644
--- a/lite/backends/intel_fpga/target_wrapper.h
+++ b/lite/backends/intel_fpga/target_wrapper.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <map>
+#include "intelfpga.h"  // NOLINT
 #include "lite/core/target_wrapper.h"
 
 namespace paddle {
diff --git a/lite/kernels/intel_fpga/CMakeLists.txt b/lite/kernels/intel_fpga/CMakeLists.txt
index 276f4cb7e54..f7747dddeb6 100755
--- a/lite/kernels/intel_fpga/CMakeLists.txt
+++ b/lite/kernels/intel_fpga/CMakeLists.txt
@@ -2,8 +2,10 @@ if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_W
     return()
 endif()
 
-set(intel_fpga_deps intel_fpga_target_wrapper kernel_intel_fpga)
+set(intel_fpga_deps intel_fpga_target_wrapper)
+
+#lite_cc_library(kernel_intel_fpga_vnna SRCS conv_depthwise.cc conv_gemmlike.cc DEPS ${intel_fpga_runtime_libs})
 
 add_kernel(conv_depthwise_intel_fpga INTEL_FPGA basic SRCS conv_depthwise.cc DEPS ${intel_fpga_deps})
-add_kernel(conv_gemmlike_intel_fpga INTEL_FPGA basic SRCS conv_gemmlike.cc DEPS ${intel_fpga_deps})
+add_kernel(conv_gemmlike_intel_fpga INTEL_FPGA basic SRCS conv_gemmlike.cc DEPS ${intel_fpga_deps} ${intel_fpga_runtime_libs})
 add_kernel(conv_compute_intel_fpga INTEL_FPGA basic SRCS conv_compute.cc DEPS ${intel_fpga_deps} conv_depthwise_intel_fpga conv_gemmlike_intel_fpga)
diff --git a/lite/tools/build_linux.sh b/lite/tools/build_linux.sh
index 0857df30bbd..0a62f90793a 100755
--- a/lite/tools/build_linux.sh
+++ b/lite/tools/build_linux.sh
@@ -5,11 +5,11 @@ set -e
 # 1. global variables, you can change them according to your requirements
 #####################################################################################################
 # armv8 or armv7hf or armv7, default armv8.
-ARCH=armv8
+ARCH=armv7hf
 # gcc or clang, default gcc.
 TOOLCHAIN=gcc
 # ON or OFF, default OFF.
-WITH_EXTRA=OFF
+WITH_EXTRA=ON
 # controls whether to compile python lib, default is OFF.
 WITH_PYTHON=OFF
 PY_VERSION=""
@@ -34,7 +34,8 @@ IMAGINATION_NNA_SDK_ROOT="$(pwd)/imagination_nna_sdk"
 WITH_BAIDU_XPU=OFF
 BAIDU_XPU_SDK_ROOT=""
 # options of compiling intel fpga.
-WITH_INTEL_FPGA=OFF
+WITH_INTEL_FPGA=ON
+INTEL_FPGA_SDK_ROOT="$(pwd)/intelfpga_sdk" 
 # options of adding training ops
 WITH_TRAIN=OFF
 # num of threads used during compiling..
@@ -77,8 +78,9 @@ function init_cmake_mutable_options {
                         -DXPU_SDK_ROOT=$BAIDU_XPU_SDK_ROOT \
                         -DLITE_WITH_TRAIN=$WITH_TRAIN  \
                         -DLITE_WITH_IMAGINATION_NNA=$WITH_IMAGINATION_NNA \
+                        -DIMAGINATION_NNA_SDK_ROOT=${IMAGINATION_NNA_SDK_ROOT} \
                         -DLITE_WITH_INTEL_FPGA=$WITH_INTEL_FPGA \
-                        -DIMAGINATION_NNA_SDK_ROOT=${IMAGINATION_NNA_SDK_ROOT}"
+                        -DINTEL_FPGA_SDK_ROOT=${INTEL_FPGA_SDK_ROOT}"
 
 }
 #####################################################################################################
@@ -349,6 +351,10 @@ function main {
                 WITH_INTEL_FPGA="${i#*=}"
                 shift
                 ;;
+            --intel_fpga_sdk_root=*)
+                INTEL_FPGA_SDK_ROOT="${i#*=}"
+                shift
+                ;;
             # ON or OFF, default OFF
             --with_train=*)
                 WITH_TRAIN="${i#*=}"

From 4c3196ef6deebeb696ef6b536fb3a1b9ad951fb8 Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Tue, 23 Mar 2021 12:42:10 +0000
Subject: [PATCH 07/19] test=develop

---
 cmake/device/intel_fpga.cmake            |  23 +++--
 lite/CMakeLists.txt                      |   4 +
 lite/api/CMakeLists.txt                  |  11 ++-
 lite/backends/CMakeLists.txt             |   2 +-
 lite/kernels/intel_fpga/CMakeLists.txt   |  10 +-
 lite/kernels/intel_fpga/conv_gemmlike.cc | 119 ++++++++---------------
 lite/kernels/intel_fpga/conv_gemmlike.h  |   1 +
 7 files changed, 71 insertions(+), 99 deletions(-)

diff --git a/cmake/device/intel_fpga.cmake b/cmake/device/intel_fpga.cmake
index 498f58bfbdc..afde0d8796c 100644
--- a/cmake/device/intel_fpga.cmake
+++ b/cmake/device/intel_fpga.cmake
@@ -25,24 +25,23 @@ endif()
 
 message(STATUS "INTEL_FPGA_SDK_ROOT: ${INTEL_FPGA_SDK_ROOT}")
 
-set(INTEL_FPGA_SDK_INC  "${INTEL_FPGA_SDK_ROOT}/include")
-set(INTEL_FPGA_SDK_LIB  "${INTEL_FPGA_SDK_ROOT}/lib/libvnna.so")
+find_path(INTEL_FPGA_SDK_INC NAMES intelfpga.h
+  PATHS ${INTEL_FPGA_SDK_ROOT}/include NO_DEFAULT_PATH)
+if (NOT INTEL_FPGA_SDK_INC)
+  message(FATAL_ERROR "Can not find intelfpga.h in ${INTEL_FPGA_SDK_ROOT}/include")
+endif()
 
 include_directories("${INTEL_FPGA_SDK_INC}")
 
-find_library(INTEL_FPGA_LIB_FILE NAMES vnna
+find_library(INTEL_FPGA_SDK_LIB NAMES vnna
   PATHS ${INTEL_FPGA_SDK_ROOT}/lib)
 
-if(NOT INTEL_FPGA_LIB_FILE)
-  message(FATAL_ERROR "Can not find INTEL_FPGA_LIB_FILE in ${INTEL_FPGA_SDK_ROOT}")
+if(NOT INTEL_FPGA_SDK_LIB)
+  message(FATAL_ERROR "Can not find vnna library in ${INTEL_FPGA_SDK_ROOT}/lib")
 else()
-  message(STATUS "Found INTEL_FPGA VNNA Library: ${INTEL_FPGA_LIB_FILE}")
-  add_library(vnna SHARED IMPORTED)
-  set_property(TARGET vnna PROPERTY IMPORTED_LOCATION ${INTEL_FPGA_LIB_FILE})
+  message(STATUS "Found INTEL_FPGA_SDK Library: ${INTEL_FPGA_SDK_LIB}")
+  add_library(vnna SHARED IMPORTED GLOBAL)
+  set_property(TARGET vnna PROPERTY IMPORTED_LOCATION ${INTEL_FPGA_SDK_LIB})
 endif()
 
-#link_directories("${INTEL_FPGA_SDK_ROOT}/lib")
-#add_library(vnna SHARED IMPORTED GLOBAL)
-#set_property(TARGET vnna PROPERTY IMPORTED_LOCATION ${INTEL_FPGA_SDK_LIB})
-
 set(intel_fpga_runtime_libs vnna CACHE INTERNAL "intel fpga sdk runtime libs")
diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt
index 732cd2d29d3..4a4f8af5848 100755
--- a/lite/CMakeLists.txt
+++ b/lite/CMakeLists.txt
@@ -11,6 +11,7 @@ message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
 message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}")
 message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
 message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
+message(STATUS "LITE_WITH_INTEL_FPGA:\t${LITE_WITH_INTEL_FPGA}")
 message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
 message(STATUS "LITE_WITH_HUAWEI_ASCEND_NPU:\t${LITE_WITH_HUAWEI_ASCEND_NPU}")
 message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
@@ -133,6 +134,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
     if (LITE_WITH_IMAGINATION_NNA)
         set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.nna")
     endif(LITE_WITH_IMAGINATION_NNA)
+    if (LITE_WITH_INTEL_FPGA)
+        set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.intelfpga")
+    endif(LITE_WITH_INTEL_FPGA)
 else()
     set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib")
 endif()
diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt
index 64b68cc0c02..ee2a40cb691 100644
--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -37,7 +37,6 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
     if(LITE_WITH_CV)
       target_link_libraries(paddle_full_api_shared "-Wl,--whole-archive" paddle_cv_arm "-Wl,--no-whole-archive")
     endif(LITE_WITH_CV)
-
     #light api dynamic library
     lite_cc_library(paddle_light_api_shared SHARED SRCS paddle_api.cc light_api.cc light_api_impl.cc
                   DEPS ${light_lib_DEPS}
@@ -47,6 +46,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
                   APU_DEPS ${apu_kernels}
                   RKNPU_DEPS ${rknpu_kernels}
                   IMAGINATION_NNA_DEPS ${imagination_nna_kernels}
+                  INTEL_FPGA_DEPS ${intel_fpga_kernels}
                   HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
                   )
 
@@ -177,10 +177,6 @@ if(LITE_WITH_FPGA)
     set(light_api_deps ${light_api_deps} ${fpga_deps})
     set(cxx_api_deps ${cxx_api_deps} ${fpga_deps})
 endif()
-if(LITE_WITH_INTEL_FPGA)
-    set(light_api_deps ${light_api_deps} ${intel_fpga_deps})
-    set(cxx_api_deps ${cxx_api_deps} ${intel_fpga_deps})
-endif()
 if(LITE_WITH_BM)
     set(light_api_deps ${light_api_deps} ${bm_deps})
     set(cxx_api_deps ${cxx_api_deps} ${bm_deps})
@@ -196,6 +192,11 @@ if(LITE_WITH_IMAGINATION_NNA)
     set(cxx_api_deps ${cxx_api_deps} ${imagination_nna_deps})
 endif()
 
+if(LITE_WITH_INTEL_FPGA)
+    set(light_api_deps ${light_api_deps} ${intel_fpga_deps})
+    set(cxx_api_deps ${cxx_api_deps} ${intel_fpga_deps})
+endif()
+
 if(LITE_WITH_HUAWEI_ASCEND_NPU)
     set(light_api_deps ${light_api_deps} ${huawei_ascend_npu_deps})
     set(cxx_api_deps ${cxx_api_deps} ${huawei_ascend_npu_deps})
diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt
index 7c05e6138f1..c15d07c0904 100644
--- a/lite/backends/CMakeLists.txt
+++ b/lite/backends/CMakeLists.txt
@@ -12,4 +12,4 @@ add_subdirectory(apu)
 add_subdirectory(rknpu)
 add_subdirectory(huawei_ascend_npu)
 add_subdirectory(imagination_nna)
-add_subdirectory(intel_fpga)
+#add_subdirectory(intel_fpga)
diff --git a/lite/kernels/intel_fpga/CMakeLists.txt b/lite/kernels/intel_fpga/CMakeLists.txt
index f7747dddeb6..748a6f08c9d 100755
--- a/lite/kernels/intel_fpga/CMakeLists.txt
+++ b/lite/kernels/intel_fpga/CMakeLists.txt
@@ -2,10 +2,12 @@ if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_W
     return()
 endif()
 
-set(intel_fpga_deps intel_fpga_target_wrapper)
+set(intel_fpga_deps ${lite_kernel_deps} ${intel_fpga_runtime_libs})
 
-#lite_cc_library(kernel_intel_fpga_vnna SRCS conv_depthwise.cc conv_gemmlike.cc DEPS ${intel_fpga_runtime_libs})
+#lite_cc_library(dwconv_intel_fpga SRCS conv_depthwise.cc DEPS ${lite_kernel_deps})
+#lite_cc_library(gmconv_intel_fpga SRCS conv_gemmlike.cc DEPS ${lite_kernel_deps})
+#set(conv_intel_fpga ${intel_fpga_runtime_libs} dwconv_intel_fpga gmconv_intel_fpga)
 
 add_kernel(conv_depthwise_intel_fpga INTEL_FPGA basic SRCS conv_depthwise.cc DEPS ${intel_fpga_deps})
-add_kernel(conv_gemmlike_intel_fpga INTEL_FPGA basic SRCS conv_gemmlike.cc DEPS ${intel_fpga_deps} ${intel_fpga_runtime_libs})
-add_kernel(conv_compute_intel_fpga INTEL_FPGA basic SRCS conv_compute.cc DEPS ${intel_fpga_deps} conv_depthwise_intel_fpga conv_gemmlike_intel_fpga)
+add_kernel(conv_gemmlike_intel_fpga INTEL_FPGA basic SRCS conv_gemmlike.cc DEPS ${intel_fpga_deps})
+add_kernel(conv_compute_intel_fpga INTEL_FPGA basic SRCS conv_compute.cc DEPS ${intel_fpga_deps} ${conv_depthwise_intel_fpga} ${conv_gemmlike_intel_fpga})
diff --git a/lite/kernels/intel_fpga/conv_gemmlike.cc b/lite/kernels/intel_fpga/conv_gemmlike.cc
index bc9b6f68014..849dabc3dcf 100644
--- a/lite/kernels/intel_fpga/conv_gemmlike.cc
+++ b/lite/kernels/intel_fpga/conv_gemmlike.cc
@@ -16,8 +16,6 @@
 #include <vector>
 #include "lite/backends/arm/math/gemm_prepacked_int8.h"
 #include "lite/backends/arm/math/packed_sgemm.h"
-#include "lite/backends/intel_fpga/lldrv/intelfpgadrv.h"
-#include "lite/backends/intel_fpga/lldrv/utils.h"
 
 namespace paddle {
 namespace lite {
@@ -67,84 +65,51 @@ void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
 
   if (kh > 1 && kw > 1) {
     int i, j, il, kl, ol, l, m, n, k;
-    lite::intel_fpga::intel_fpga_conv_s conv;
+    intelfpga_conv2d_s conv;
 
     conv.at = static_cast<uint32_t>(param.activation_param.active_type);
-    if (conv.at == 4) {
-      alpha = param.activation_param.Leaky_relu_alpha;
-    }
     conv.ng = param.groups;
-
-    conv.i.in = i_dims[0];
-    conv.i.ic = i_dims[1];
-    conv.i.ih = i_dims[2];
-    conv.i.iw = i_dims[3];
-    conv.i.pl = paddings[2];  // left
-    conv.i.pr = paddings[3];  // right
-    conv.i.pt = paddings[0];  // top
-    conv.i.pb = paddings[1];  // bottom
-    conv.i.dy = dilations[0];
-    conv.i.dx = dilations[1];
-
-    conv.k.kh = w_dims[2];
-    conv.k.kw = w_dims[3];
-    conv.k.hs = param.strides[0];
-    conv.k.ws = param.strides[1];
-
-    conv.o.on = o_dims[0];
-    conv.o.oc = o_dims[1];
-    conv.o.oh = o_dims[2];
-    conv.o.ow = o_dims[3];
-
-    il = conv.i.in * conv.i.ic * conv.i.ih * conv.i.iw;
-    kl = conv.o.oc * conv.i.ic * conv.k.kh * conv.k.kw;
-    ol = conv.o.on * conv.o.oc * conv.o.oh * conv.o.ow;
-    conv.ia = static_cast<int8_t*>(
-        lite::intel_fpga::intel_fpga_minput(il * sizeof(int8_t)));
-    conv.ka = static_cast<int8_t*>(
-        lite::intel_fpga::intel_fpga_mkernel(kl * sizeof(int8_t)));
-    conv.oa = static_cast<int32_t*>(
-        lite::intel_fpga::intel_fpga_moutput(ol * sizeof(int32_t)));
-    if (conv.ia && conv.ka && conv.oa) {
-      float fd = lite::intel_fpga::find_max(i_data, il);
-      float fw = lite::intel_fpga::find_max(w_data, kl);
-
-      fd = 127.0 / fd;
-      fw = 127.0 / fw;
-
-      // y = 127.0 / fmax
-      // y = x * scale;
-      lite::intel_fpga::quantize_s8(i_data, conv.ia, il, fd);
-      lite::intel_fpga::quantize_s8(w_data, conv.ka, kl, fw);
-
-      // perform conv2d
-      if (lite::intel_fpga::intel_fpga_conv(&conv)) {
-        std::cout << "intel_fpga_conv error" << std::endl;
-      }
-      // Convert int32 back to fp32, [n,c,h,w]
-      // 1. y = x / scale
-      // 2. y = x + b
-      // 3. y = f(x)
-      int hw = conv.o.oh * conv.o.ow;
-      for (i = 0; i < conv.o.on; i++) {
-        for (j = 0; j < conv.o.oc; j++) {
-          m = i * conv.o.oc + j;
-          n = m * hw;
-          for (l = 0; l < hw; l++) {
-            k = n + l;
-            o_data[k] = static_cast<float>(conv.oa[k] / fd / fw);
-            if (b_data) o_data[k] += b_data[j];
-            if (conv.at == 1) {  // relu
-              o_data[k] = o_data[k] > 0.0 ? o_data[k] : 0.0;
-            } else if (conv.at == 2) {  // relu6
-              o_data[k] = o_data[k] > 0.0 ? o_data[k] : 0.0;
-              o_data[k] = o_data[k] > 6.0 ? 6.0 : o_data[k];
-            } else if (conv.at == 4) {  // leakyRelu
-              if (o_data[k] < 0.0) o_data[k] = o_data[k] * alpha;
-            }
-          }
-        }
-      }
+    switch (conv.at) {
+      case 1:
+        conv.at = INTELFPGA_ACT_RELU;
+        break;
+      case 2:
+        conv.at = INTELFPGA_ACT_RELU6;
+        break;
+      case 4:
+        conv.at = INTELFPGA_ACT_LEAKYRELU;
+        conv.alpha = param.activation_param.Leaky_relu_alpha;
+        break;
+      default:
+        conv.at = INTELFPGA_ACT_NONE;
+        break;
+    }
+    conv.ia = const_cast<float*>(i_data);
+    conv.ka = const_cast<float*>(w_data);
+    conv.ba = const_cast<float*>(b_data);
+    conv.oa = const_cast<float*>(o_data);
+    conv.ip.in = i_dims[0];
+    conv.ip.ic = i_dims[1];
+    conv.ip.ih = i_dims[2];
+    conv.ip.iw = i_dims[3];
+    conv.ip.pl = paddings[2];  // left
+    conv.ip.pr = paddings[3];  // right
+    conv.ip.pt = paddings[0];  // top
+    conv.ip.pb = paddings[1];  // bottom
+    conv.ip.dy = dilations[0];
+    conv.ip.dx = dilations[1];
+
+    conv.kp.kh = w_dims[2];
+    conv.kp.kw = w_dims[3];
+    conv.kp.hs = param.strides[0];
+    conv.kp.ws = param.strides[1];
+
+    conv.op.on = o_dims[0];
+    conv.op.oc = o_dims[1];
+    conv.op.oh = o_dims[2];
+    conv.op.ow = o_dims[3];
+    if (intelfpga_conv2d(&conv)) {
+      std::cout << "intel_fpga_conv error" << std::endl;
     }
   } else {
     if (flag_1x1gemm_) {
diff --git a/lite/kernels/intel_fpga/conv_gemmlike.h b/lite/kernels/intel_fpga/conv_gemmlike.h
index 338a711983c..bad897c3800 100644
--- a/lite/kernels/intel_fpga/conv_gemmlike.h
+++ b/lite/kernels/intel_fpga/conv_gemmlike.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <intelfpga.h>
 #include <cmath>
 #include <string>
 #include <vector>

From 1428da369c39525b9e6aa8d203149ca392f9aa49 Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Wed, 24 Mar 2021 03:35:30 +0000
Subject: [PATCH 08/19] test=develop

---
 cmake/device/intel_fpga.cmake          |  3 +--
 lite/api/CMakeLists.txt                | 16 ++++++++++++++++
 lite/backends/CMakeLists.txt           |  2 +-
 lite/kernels/intel_fpga/CMakeLists.txt |  6 +-----
 4 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/cmake/device/intel_fpga.cmake b/cmake/device/intel_fpga.cmake
index afde0d8796c..e753b88a5ca 100644
--- a/cmake/device/intel_fpga.cmake
+++ b/cmake/device/intel_fpga.cmake
@@ -40,8 +40,7 @@ if(NOT INTEL_FPGA_SDK_LIB)
   message(FATAL_ERROR "Can not find INTEL_FPGA_LIB_FILE in ${INTEL_FPGA_SDK_ROOT}/lib")
 else()
   message(STATUS "Found INTEL_FPGA_SDK Library: ${INTEL_FPGA_SDK_LIB}")
-  add_library(vnna SHARED IMPORTED GLOBAL)
-  set_property(TARGET vnna PROPERTY IMPORTED_LOCATION ${INTEL_FPGA_SDK_LIB})
+  link_directories(${INTEL_FPGA_SDK_ROOT}/lib)
 endif()
 
 set(intel_fpga_runtime_libs vnna CACHE INTERNAL "intel fpga sdk runtime libs")
diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt
index ee2a40cb691..a061fa0234d 100644
--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -37,6 +37,10 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
     if(LITE_WITH_CV)
       target_link_libraries(paddle_full_api_shared "-Wl,--whole-archive" paddle_cv_arm "-Wl,--no-whole-archive")
     endif(LITE_WITH_CV)
+    if (LITE_WITH_INTEL_FPGA)
+      # Need to add INTEL_FPGA runtime libs  dependency
+      target_link_libraries(paddle_full_api_shared ${intel_fpga_runtime_libs})
+    endif(LITE_WITH_INTEL_FPGA)
     #light api dynamic library
     lite_cc_library(paddle_light_api_shared SHARED SRCS paddle_api.cc light_api.cc light_api_impl.cc
                   DEPS ${light_lib_DEPS}
@@ -138,6 +142,10 @@ else()
             #target_link_libraries(paddle_light_api_shared ${nna_builder_libs} ${nna_runtime_libs})
         endif()
 
+        if (LITE_WITH_INTEL_FPGA)
+            # Need to add INTEL_FPGA runtime libs dependency
+            target_link_libraries(paddle_light_api_shared ${intel_fpga_runtime_libs})
+        endif(LITE_WITH_INTEL_FPGA)
         # 3. produce java lib from `PADDLELITE_OBJS` if LITE_WITH_JAVA=ON
         if (LITE_WITH_JAVA)
           add_library(paddle_lite_jni SHARED $<TARGET_OBJECTS:PADDLELITE_OBJS> android/jni/native/paddle_lite_jni.cc android/jni/native/tensor_jni.cc)
@@ -467,8 +475,16 @@ if (NOT LITE_ON_TINY_PUBLISH)
     # The final inference library for just MobileConfig.
     bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
     target_link_libraries(paddle_api_full ${cuda_deps})
+    if (LITE_WITH_INTEL_FPGA)
+      # Need to add INTEL_FPGA runtime libs dependency
+      target_link_libraries(paddle_api_full ${intel_fpga_runtime_libs})
+    endif()
     get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
     bundle_static_library(paddle_api_light paddle_api_light_bundled bundle_light_api)
+    if (LITE_WITH_INTEL_FPGA)
+      # Need to add INTEL_FPGA runtime libs dependency
+      target_link_libraries(paddle_api_light ${intel_fpga_runtime_libs})
+    endif()
 endif()
 
 #-----------------------------------------------------------------------------------------------------
diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt
index c15d07c0904..7c05e6138f1 100644
--- a/lite/backends/CMakeLists.txt
+++ b/lite/backends/CMakeLists.txt
@@ -12,4 +12,4 @@ add_subdirectory(apu)
 add_subdirectory(rknpu)
 add_subdirectory(huawei_ascend_npu)
 add_subdirectory(imagination_nna)
-#add_subdirectory(intel_fpga)
+add_subdirectory(intel_fpga)
diff --git a/lite/kernels/intel_fpga/CMakeLists.txt b/lite/kernels/intel_fpga/CMakeLists.txt
index 748a6f08c9d..7f1979b4543 100755
--- a/lite/kernels/intel_fpga/CMakeLists.txt
+++ b/lite/kernels/intel_fpga/CMakeLists.txt
@@ -4,10 +4,6 @@ endif()
 
 set(intel_fpga_deps ${lite_kernel_deps} ${intel_fpga_runtime_libs})
 
-#lite_cc_library(dwconv_intel_fpga SRCS conv_depthwise.cc DEPS ${lite_kernel_deps})
-#lite_cc_library(gmconv_intel_fpga SRCS conv_gemmlike.cc DEPS ${lite_kernel_deps})
-#set(conv_intel_fpga ${intel_fpga_runtime_libs} dwconv_intel_fpga gmconv_intel_fpga)
-
 add_kernel(conv_depthwise_intel_fpga INTEL_FPGA basic SRCS conv_depthwise.cc DEPS ${intel_fpga_deps})
 add_kernel(conv_gemmlike_intel_fpga INTEL_FPGA basic SRCS conv_gemmlike.cc DEPS ${intel_fpga_deps})
-add_kernel(conv_compute_intel_fpga INTEL_FPGA basic SRCS conv_compute.cc DEPS ${intel_fpga_deps} ${conv_depthwise_intel_fpga} ${conv_gemmlike_intel_fpga})
+add_kernel(conv_compute_intel_fpga INTEL_FPGA basic SRCS conv_compute.cc DEPS ${intel_fpga_deps} conv_depthwise_intel_fpga conv_gemmlike_intel_fpga)

From 4f11189b99c59ae3114eefa45cde2013c7103fa8 Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Wed, 24 Mar 2021 05:18:36 +0000
Subject: [PATCH 09/19] test=develop

---
 docs/demo_guides/intel_fpga.md | 234 ++++++++++++++++++++-------------
 1 file changed, 141 insertions(+), 93 deletions(-)

diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md
index b76920bd134..2d4dc721e53 100644
--- a/docs/demo_guides/intel_fpga.md
+++ b/docs/demo_guides/intel_fpga.md
@@ -1,107 +1,155 @@
-# PaddleLite使用IntelFPGA预测部署
+# PaddleLite使用英特尔FPGA预测部署
 
-Paddle Lite支持基于arm的IntelFPGA C5的模型预测，提供armv7hf的交叉编译
+PaddleLite已支持英特尔FPGA平台的预测部署，PaddleLite通过调用底层驱动实现对FPGA硬件的调度。
 
-PaddleLite通过调用底层驱动实现对FPGA硬件的调度，以及对应的API接口。
+## PaddleLite实现英特尔FPGA简介
 
-## Lite实现IntelFPGA简介
+PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特性如下：
 
-Lite支持IntelFPGA作为后端硬件进行模型推理，其主要特性如下：
+- PaddleLite中英特尔FPGA的kernel，weights和bias仍为FP32、NCHW的格式，在提升计算速度的同时能做到用户对数据格式无感知
+- 对于英特尔FPGA暂不支持的kernel，均会切回arm端运行，实现arm+FPGA混合部署运行
+- 目前英特尔FPGA成本功耗都较低，可作为边缘设备首选硬件
 
-- Lite中IntelFPGA的kernel均以FP32、NCHW的格式作为输入输出格式
+## 支持现状
 
-- 对于IntelFPGA暂不支持的kernel，均会切回ARM端运行，实现ARM+FPGA混合布署运行
+### 已支持的芯片
 
-## 支持芯片
-- [Cyclone V](https://www.intel.cn/content/dam/altera-www/global/en_US/pdfs/literature/hb/cyclone-v/cv_51002.pdf)
+- 英特尔FPGA Cyclone V系列芯片
 
-### 已支持（或部分支持）的Paddle算子
+### 已支持的设备
 
-- relu/relu6/leakyrelu
-- conv2d
-- depthwise_conv2d
+- 海运捷讯C5MB开发板
 
 ### 已支持的Paddle模型
 
-- [SSD_MobileNet_V1](https://paddlemodels.bj.bcebos.com/object_detection/ssd_mobilenet_v1_coco_pretrained.tar)
-
-## 编译
-
-需要提前准备带有intelfpgadrv.ko的IntelFPGA开发板C5MB/C5TB和Lite代码
-
-CMAKE编译选项：
-
-- 设置`LITE_WITH_INTEL_FPGA=ON`和`LITE_WITH_ARM=ON`
-
-其他编译选项与ARM编译相同，可以参考[“Paddle Lite在Docker下的ARM编译”](../source_compile/compile_linux)。
-
-示例如下：
-```shell
-    cmake .. \
-        -DWITH_GPU=OFF \
-        -DWITH_MKL=OFF \
-        -DWITH_LITE=ON \
-        -DLITE_WITH_CUDA=OFF \
-        -DLITE_WITH_X86=OFF \
-        -DLITE_WITH_ARM=ON \
-        -DLITE_WITH_OPENMP=ON   \
-        -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
-        -DWITH_TESTING=OFF \
-        -DLITE_WITH_INTEL_FPGA=ON \
-        -DARM_TARGET_OS=armlinux 
-    make publish_inference -j2
-```
-Lite提供IntelFPGA编译脚本，位于lite/tools/build_intel_fpga.sh full_publish，在Lite根目录执行该脚本即可编译
-
-## 运行示例
-
-- **运行文件准备**
-
-下面以SSD模型为例，介绍如何使用C5MB/C5TB开发板实现模型运行
+- [全量化MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/ssd_mobilenet_v1.tar.gz)
 
-```bash
-#打开串口调试工具，如Putty或SecureCRT，选择对应的调试串口，并设置串口属性，
-#波特率：115200，数据位：8，停止位：1，奇偶校验：无[主机上执行]
-#上电C5MB开发板，并在串口调试工具中登录
-awcloud login: root
-Password: #密码：Awcloud@123
-#进入/opt目录[开发板执行]
-cd /opt
-#在运行模型前需要加载FPGA驱动[开发板执行]
-insmod driver/intelfpgadrv.ko
-```
-
-- **使用IntelFPGA进行模型预测**
-
-```bash
-#以下命令均在开发板上运行，在开发板上已经部署了对应的输入图片，模型，驱动程序，执行程序等
-#运行SSD测试程序，输入图片为/opt/images/dog.jpg，输出图片为/opt/dog_result.jpg
-./run_ssd.sh
-```
-
-## 如何在Code中使用
-
-在Lite中使用IntelFPGA与ARM相似，具体的区别如下：
-
-- 由于IntelFPGA运行模式为FP32精度、NCHW布局，所以需要修改相应的`valid_place`
-
-代码示例：
-```cpp
-lite::Predictor predictor;
-std::vector<Place> valid_places(
-      {Place{TARGET(kIntelFPGA), PRECISION(kFloat), DATALAYOUT(kNCHW)},Place{TARGET(kARM)});
-
-predictor.Build(model_dir, "", "", valid_places);
-
-auto* input_tensor = predictor.GetInput(0);
-input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
-auto* data = input_tensor->mutable_data<float>();
-auto item_size = input_tensor->dims().production();
-//假设设置输入数据全为1
-for (int i = 0; i < item_size; i++) {
-  data[i] = 1;
-}
+### 已支持（或部分支持）的Paddle算子
 
-predictor.Run();
-auto* out = predictor.GetOutput(0);
-```
+- relu/relu6/leakyrelu
+- conv2d
+- depthwise_conv2d
+- pool2d
+- fc
+
+## 准备工作
+
+开发板C5MB可以通过串口线进行连接，也可以通过ssh进行连接，初次使用请参考[文档](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/AIGO_C5MB_UG.pdf)
+
+## 参考示例演示
+
+### 测试设备(Roc1开发板)
+
+![c5mb_front](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/c5mb_front.jpg)
+
+![c5mb_back](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/c5mb_back.jpg)
+
+### 准备设备环境
+
+- 提前准备带有intelfpgadrv.ko的英特尔FPGA开发板（如C5MB）；
+- 确定能够通过SSH方式远程登录C5MB开发板；
+- 由于C5MB的ARM能力较弱，示例程序和PaddleLite库的编译均采用交叉编译方式。
+
+### 准备交叉编译环境
+
+- 按照以下两种方式配置交叉编译环境：
+  - Docker交叉编译环境：由于C5MB运行环境为Ubuntu16.04，因此不能直接使用[编译环境准备](../source_compile/compile_env)中的docker image，而需要按照如下方式在Host机器上手动构建Ubuntu16.04的docker image；
+
+    ```
+    $ wget https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/Dockerfile
+    $ docker build --network=host -t paddlepaddle/paddle-lite-ubuntu16_04:1.0 .
+    $ docker run --name paddle-lite-ubuntu16_04 --net=host -it --privileged -v $PWD:/Work -w /Work paddlepaddle/paddle-lite-ubuntu16_04:1.0 /bin/bash
+    ```
+
+- Ubuntu交叉编译环境：要求Host为Ubuntu16.04系统，参考[编译环境准备](../source_compile/compile_env)中的"交叉编译ARM Linux"步骤安装交叉编译工具链。
+- 由于需要通过scp和ssh命令将交叉编译生成的PaddleLite库和示例程序传输到设备上执行，因此，在进入Docker容器后还需要安装如下软件：
+
+  ```
+  # apt-get install openssh-client sshpass
+  ```
+
+### 运行图像检测示例程序
+
+- 下载示例程序[PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/PaddleLite-linux-demo.tar.gz)，解压后清单如下：
+
+  ```shell
+  - PaddleLite-linux-demo
+    - ssd_detection
+      - assets
+        - images 
+          - dog.jpg # 测试图片
+          - dog.raw # 已处理成raw数据的测试图片
+        - labels
+          - pascalvoc_label_list # 检测label文件
+        - models
+          - ssd_mobilenet_v1 # Non-combined格式的、SSD量化模型
+            - __model__ # 已通过opt转好的拓扑信息模型文件
+            - __params__ # 已通过opt转好的参数信息模型文件
+      - shell
+        - CMakeLists.txt # 示例程序CMake脚本
+        - build
+          - ssd_detection # 已编译好的示例程序
+        - ssd_detection.cc # 示例程序源码
+        - convert_to_raw_image.py # 将测试图片保存为raw数据的python脚本
+        - build.sh # 示例程序编译脚本
+        - run.sh # 示例程序运行脚本
+    - libs
+      - PaddleLite
+        - armhf
+          - include # PaddleLite头文件
+          - lib
+            - libvnna.so # 英特尔FPGA接口库
+            - libpaddle_full_api_shared.so # 用于直接加载Paddle模型进行测试和Debug的预编译PaddleLite库（full publish模式下编译生成的库）
+  ```
+
+- 按照以下命令运行转换后的ARM+FPGA模型
+
+  ```shell
+  注意：
+  1）run.sh必须在Host机器上运行，且执行前需要配置目标设备的IP地址、SSH账号和密码；
+  2）build.sh建议在docker环境中执行，目前英特尔FPGA在PaddleLite上只支持armhf。
+
+  运行适用于英特尔FPGA的mobilenetv1全量化模型
+  $ cd PaddleLite-linux-demo/ssd_detection/shell
+  $ vim ./run.sh
+    MODEL_NAME设置为ssd_mobilenet_v1
+  $ ./run.sh
+    iter 0 cost: 3079.443115 ms
+    iter 1 cost: 3072.508057 ms
+    iter 2 cost: 3063.342041 ms
+    warmup: 1 repeat: 3, average: 3071.764404 ms, max: 3079.443115 ms, min: 3063.342041 ms
+    results: 3
+    [0] bicycle - 0.997817 0.163673,0.217786,0.721802,0.786120
+    [1] car - 0.943994 0.597238,0.131665,0.905698,0.297017
+    [2] dog - 0.959329 0.157911,0.334807,0.431497,0.920035
+    Preprocess time: 114.061000 ms
+    Prediction time: 3071.764404 ms
+    Postprocess time: 13.166000 ms
+  ```
+
+- 如果需要更改测试图片，可通过convert_to_raw_image.py工具生成；
+- 如果需要重新编译示例程序，直接运行./build.sh即可，注意：build.sh的执行建议在docker环境中，否则可能编译出错。
+
+### 更新支持英特尔FPGA的PaddleLite库
+
+- 下载PaddleLite源码和英特尔FPGA的SDK
+
+  ```shell
+  $ git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+  $ cd Paddle-Lite
+  $ git checkout <release-version-tag>
+  $ curl -L https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/intelfpga_sdk.tar.gz -o - | tar -zx
+  ```
+
+- 编译并生成PaddleLite+IntelFPGA的部署库
+
+  ```shell
+  For C5MB
+  full_publish
+  $ ./lite/tools/build_linux.sh --arch=armv7hf --with_extra=ON --with_log=ON --with_intel_fpga=ON --intel_fpga_sdk_root=./intelfpga_sdk full_publish
+  ```
+
+- 将编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intelfpga/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/armhf/include目录；
+- 将full_publish模式下编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intelfpga/cxx/lib/libpaddle_full_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_full_api_shared.so文件。
+
+## 其它说明

From d716f590977f0c01dfda06c4278c60c957357f8c Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Wed, 24 Mar 2021 05:28:19 +0000
Subject: [PATCH 10/19] test=develop

---
 docs/demo_guides/intel_fpga.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md
index 2d4dc721e53..4f644e838a6 100644
--- a/docs/demo_guides/intel_fpga.md
+++ b/docs/demo_guides/intel_fpga.md
@@ -93,6 +93,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
         - convert_to_raw_image.py # 将测试图片保存为raw数据的python脚本
         - build.sh # 示例程序编译脚本
         - run.sh # 示例程序运行脚本
+		- intelfpgadrv.ko # 英特尔FPGA启动程序
     - libs
       - PaddleLite
         - armhf

From 51533df13363ffaa74c301e91aca3d90cecb1969 Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Wed, 24 Mar 2021 09:17:28 +0000
Subject: [PATCH 11/19] test=develop

---
 cmake/device/intel_fpga.cmake |  5 +++--
 lite/api/CMakeLists.txt       | 17 +----------------
 2 files changed, 4 insertions(+), 18 deletions(-)

diff --git a/cmake/device/intel_fpga.cmake b/cmake/device/intel_fpga.cmake
index e753b88a5ca..4665438eb5a 100644
--- a/cmake/device/intel_fpga.cmake
+++ b/cmake/device/intel_fpga.cmake
@@ -40,7 +40,8 @@ if(NOT INTEL_FPGA_SDK_LIB)
   message(FATAL_ERROR "Can not find INTEL_FPGA_LIB_FILE in ${INTEL_FPGA_SDK_ROOT}/lib")
 else()
   message(STATUS "Found INTEL_FPGA_SDK Library: ${INTEL_FPGA_SDK_LIB}")
-  link_directories(${INTEL_FPGA_SDK_ROOT}/lib)
+  add_library(intel_fpga_vnna SHARED IMPORTED GLOBAL)
+  set_property(TARGET intel_fpga_vnna PROPERTY IMPORTED_LOCATION ${INTEL_FPGA_SDK_LIB})
 endif()
 
-set(intel_fpga_runtime_libs vnna CACHE INTERNAL "intel fpga sdk runtime libs")
+set(intel_fpga_runtime_libs intel_fpga_vnna CACHE INTERNAL "intel fpga sdk runtime libs")
diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt
index a061fa0234d..8eda4984f75 100644
--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -37,10 +37,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
     if(LITE_WITH_CV)
       target_link_libraries(paddle_full_api_shared "-Wl,--whole-archive" paddle_cv_arm "-Wl,--no-whole-archive")
     endif(LITE_WITH_CV)
-    if (LITE_WITH_INTEL_FPGA)
-      # Need to add INTEL_FPGA runtime libs  dependency
-      target_link_libraries(paddle_full_api_shared ${intel_fpga_runtime_libs})
-    endif(LITE_WITH_INTEL_FPGA)
+
     #light api dynamic library
     lite_cc_library(paddle_light_api_shared SHARED SRCS paddle_api.cc light_api.cc light_api_impl.cc
                   DEPS ${light_lib_DEPS}
@@ -142,10 +139,6 @@ else()
             #target_link_libraries(paddle_light_api_shared ${nna_builder_libs} ${nna_runtime_libs})
         endif()
 
-        if (LITE_WITH_INTEL_FPGA)
-            # Need to add INTEL_FPGA runtime libs dependency
-            target_link_libraries(paddle_light_api_shared ${intel_fpga_runtime_libs})
-        endif(LITE_WITH_INTEL_FPGA)
         # 3. produce java lib from `PADDLELITE_OBJS` if LITE_WITH_JAVA=ON
         if (LITE_WITH_JAVA)
           add_library(paddle_lite_jni SHARED $<TARGET_OBJECTS:PADDLELITE_OBJS> android/jni/native/paddle_lite_jni.cc android/jni/native/tensor_jni.cc)
@@ -475,16 +468,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
     # The final inference library for just MobileConfig.
     bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
     target_link_libraries(paddle_api_full ${cuda_deps})
-    if (LITE_WITH_INTEL_FPGA)
-      # Need to add INTEL_FPGA runtime libs dependency
-      target_link_libraries(paddle_api_full ${intel_fpga_runtime_libs})
-    endif()
     get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
     bundle_static_library(paddle_api_light paddle_api_light_bundled bundle_light_api)
-    if (LITE_WITH_INTEL_FPGA)
-      # Need to add INTEL_FPGA runtime libs dependency
-      target_link_libraries(paddle_api_light ${intel_fpga_runtime_libs})
-    endif()
 endif()
 
 #-----------------------------------------------------------------------------------------------------

From aaf4dfd26ab87fc23b06db1ea538a143c06bba57 Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Wed, 24 Mar 2021 12:47:37 +0000
Subject: [PATCH 12/19] test=develop

---
 docs/demo_guides/intel_fpga.md                | 36 +++++++++++--------
 lite/CMakeLists.txt                           |  2 +-
 lite/api/opt.cc                               | 11 ++++--
 lite/api/opt_base.cc                          | 11 ++++--
 lite/api/python/pybind/pybind.cc              |  1 +
 lite/core/arena/CMakeLists.txt                |  2 +-
 lite/tools/build_linux.sh                     |  6 ++--
 .../cmake_tools/record_supported_kernel_op.py |  3 +-
 8 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md
index 4f644e838a6..98ff5f1b18d 100644
--- a/docs/demo_guides/intel_fpga.md
+++ b/docs/demo_guides/intel_fpga.md
@@ -22,7 +22,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
 
 ### 已支持的Paddle模型
 
-- [全量化MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/ssd_mobilenet_v1.tar.gz)
+- [全量化MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/ssd_mobilenet_v1.tar.gz)
 
 ### 已支持（或部分支持）的Paddle算子
 
@@ -34,15 +34,15 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
 
 ## 准备工作
 
-开发板C5MB可以通过串口线进行连接，也可以通过ssh进行连接，初次使用请参考[文档](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/AIGO_C5MB_UG.pdf)
+开发板C5MB可以通过串口线进行连接，也可以通过ssh进行连接，初次使用请参考[文档](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/AIGO_C5MB_UG.pdf)
 
 ## 参考示例演示
 
-### 测试设备(Roc1开发板)
+### 测试设备(C5MB开发板)
 
-![c5mb_front](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/c5mb_front.jpg)
+![c5mb_front](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/c5mb_front.jpg)
 
-![c5mb_back](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/c5mb_back.jpg)
+![c5mb_back](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/c5mb_back.jpg)
 
 ### 准备设备环境
 
@@ -53,7 +53,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
 ### 准备交叉编译环境
 
 - 按照以下两种方式配置交叉编译环境：
-  - Docker交叉编译环境：由于C5MB运行环境为Ubuntu16.04，因此不能直接使用[编译环境准备](../source_compile/compile_env)中的docker image，而需要按照如下方式在Host机器上手动构建Ubuntu16.04的docker image；
+  - Docker交叉编译环境：由于C5MB运行环境为Ubuntu，因此不能直接使用[编译环境准备](../source_compile/compile_env)中的docker image，而需要按照如下方式在Host机器上手动构建Ubuntu的docker image；
 
     ```
     $ wget https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/Dockerfile
@@ -70,7 +70,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
 
 ### 运行图像检测示例程序
 
-- 下载示例程序[PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/PaddleLite-linux-demo.tar.gz)，解压后清单如下：
+- 下载示例程序[PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/PaddleLite-linux-demo.tar.gz)，解压后清单如下：
 
   ```shell
   - PaddleLite-linux-demo
@@ -93,13 +93,14 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
         - convert_to_raw_image.py # 将测试图片保存为raw数据的python脚本
         - build.sh # 示例程序编译脚本
         - run.sh # 示例程序运行脚本
-		- intelfpgadrv.ko # 英特尔FPGA启动程序
+        - intelfpgadrv.ko # 英特尔FPGA内核驱动程序
     - libs
       - PaddleLite
         - armhf
           - include # PaddleLite头文件
           - lib
             - libvnna.so # 英特尔FPGA接口库
+            - libpaddle_light_api_shared.so # 用于最终移动端部署的预编译PaddleLite库（tiny publish模式下编译生成的库）
             - libpaddle_full_api_shared.so # 用于直接加载Paddle模型进行测试和Debug的预编译PaddleLite库（full publish模式下编译生成的库）
   ```
 
@@ -139,18 +140,25 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
   $ git clone https://github.com/PaddlePaddle/Paddle-Lite.git
   $ cd Paddle-Lite
   $ git checkout <release-version-tag>
-  $ curl -L https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/intelfpga_sdk.tar.gz -o - | tar -zx
+  $ curl -L https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/intel_fpga_sdk.tar.gz -o - | tar -zx
   ```
 
 - 编译并生成PaddleLite+IntelFPGA的部署库
 
-  ```shell
   For C5MB
-  full_publish
-  $ ./lite/tools/build_linux.sh --arch=armv7hf --with_extra=ON --with_log=ON --with_intel_fpga=ON --intel_fpga_sdk_root=./intelfpga_sdk full_publish
+  - tiny_publish编译方式
+    ```shell
+    $ ./lite/tools/build_linux.sh --arch=armv7hf --with_extra=ON --with_log=ON --with_intel_fpga=ON --intel_fpga_sdk_root=./intel_fpga_sdk
+
+    将tiny_publish模式下编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intel_fpga/cxx/lib/libpaddle_light_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_light_api_shared.so文件；
+    ```
+  - full_publish编译方式
+  ```shell
+  $ ./lite/tools/build_linux.sh --arch=armv7hf --with_extra=ON --with_log=ON --with_intel_fpga=ON --intel_fpga_sdk_root=./intel_fpga_sdk full_publish
+  
+  将full_publish模式下编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intel_fpga/cxx/lib/libpaddle_full_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_full_api_shared.so文件。
   ```
 
-- 将编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intelfpga/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/armhf/include目录；
-- 将full_publish模式下编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intelfpga/cxx/lib/libpaddle_full_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_full_api_shared.so文件。
+  - 将编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intel_fpga/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/armhf/include目录；
 
 ## 其它说明
diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt
index 4a4f8af5848..47911ad5c52 100755
--- a/lite/CMakeLists.txt
+++ b/lite/CMakeLists.txt
@@ -135,7 +135,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
         set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.nna")
     endif(LITE_WITH_IMAGINATION_NNA)
     if (LITE_WITH_INTEL_FPGA)
-        set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.intelfpga")
+        set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.intel_fpga")
     endif(LITE_WITH_INTEL_FPGA)
 else()
     set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib")
diff --git a/lite/api/opt.cc b/lite/api/opt.cc
index 215beddda46..fa8d75d3e87 100644
--- a/lite/api/opt.cc
+++ b/lite/api/opt.cc
@@ -155,6 +155,10 @@ std::vector<Place> ParserValidPlaces(bool enable_fp16) {
       valid_places.emplace_back(TARGET(kImaginationNNA));
       valid_places.emplace_back(
           Place{TARGET(kImaginationNNA), PRECISION(kInt8), DATALAYOUT(kNCHW)});
+    } else if (target_repr == "intel_fpga") {
+      valid_places.emplace_back(TARGET(kIntelFPGA));
+      valid_places.emplace_back(
+          Place{TARGET(kIntelFPGA), PRECISION(kFloat), DATALAYOUT(kNCHW)});
     } else {
       LOG(FATAL) << lite::string_format(
           "Wrong target '%s' found, please check the command flag "
@@ -245,6 +249,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
                                       "kAPU",
                                       "kHuaweiAscendNPU",
                                       "kImaginationNNA",
+                                      "kIntelFPGA",
                                       "kAny",
                                       "kUnk"};
   size_t maximum_optype_length = 0;
@@ -316,7 +321,7 @@ void PrintHelpInfo() {
       "        "
       "`--valid_targets=(arm|opencl|x86|x86_opencl|npu|xpu|rknpu|apu|huawei_"
       "ascend_npu|"
-      "imagination_nna)`\n"
+      "imagination_nna|intel_fpga)`\n"
       "        `--record_tailoring_info=(true|false)`\n"
       "  Arguments of mode quantization in opt:\n"
       "        `--quant_model=(true|false)`\n"
@@ -329,13 +334,13 @@ void PrintHelpInfo() {
       "        `--print_supported_ops=true  "
       "--valid_targets=(arm|opencl|x86|x86_opencl|npu|xpu|rknpu|apu|huawei_"
       "ascend_npu|"
-      "imagination_nna)"
+      "imagination_nna|intel_fpga)"
       "`"
       "  Display valid operators of input targets\n"
       "        `--print_model_ops=true  --model_dir=<model_param_dir> "
       "--valid_targets=(arm|opencl|x86|x86_opencl|npu|xpu|rknpu|apu|huawei_"
       "ascend_npu|"
-      "imagination_nna)"
+      "imagination_nna|intel_fpga)"
       "`"
       "  Display operators in the input model\n";
   std::cout << "opt version:" << opt_version << std::endl
diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc
index 540d5c831c5..f3e4b21269c 100644
--- a/lite/api/opt_base.cc
+++ b/lite/api/opt_base.cc
@@ -107,6 +107,10 @@ void OptBase::SetValidPlaces(const std::string& valid_places,
       valid_places_.emplace_back(TARGET(kImaginationNNA));
       valid_places_.emplace_back(
           Place{TARGET(kImaginationNNA), PRECISION(kInt8), DATALAYOUT(kNCHW)});
+    } else if (target_repr == "intel_fpga") {
+      valid_places_.emplace_back(TARGET(kIntelFPGA));
+      valid_places_.emplace_back(
+          Place{TARGET(kIntelFPGA), PRECISION(kFloat), DATALAYOUT(kNCHW)});
     } else {
       LOG(FATAL) << lite::string_format(
           "Wrong target '%s' found, please check the command flag "
@@ -304,7 +308,7 @@ void OptBase::PrintExecutableBinHelpInfo() {
       "        `--optimize_out=<output_optimize_model_dir>`\n"
       "        "
       "`--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu|imagination_"
-      "nna)`\n"
+      "nna|intel_fpga)`\n"
       "        `--record_tailoring_info=(true|false)`\n"
       "  Arguments of mode quantization in opt:\n"
       "        `--quant_model=(true|false)`\n"
@@ -316,11 +320,11 @@ void OptBase::PrintExecutableBinHelpInfo() {
       "Paddle-Lite\n"
       "        `--print_supported_ops=true  "
       "--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu|imagination_"
-      "nna)`"
+      "nna|intel_fpga)`"
       "  Display valid operators of input targets\n"
       "        `--print_model_ops=true  --model_dir=<model_param_dir> "
       "--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu|imagination_"
-      "nna)`"
+      "nna|intel_fpga)`"
       "  Display operators in the input model\n";
   std::cout << "paddlelite opt version:" << opt_version << std::endl
             << help_info << std::endl;
@@ -340,6 +344,7 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
                                                      "kAPU",
                                                      "kHuaweiAscendNPU",
                                                      "kImaginationNNA",
+                                                     "kIntelFPGA",
                                                      "kAny",
                                                      "kUnk"};
   // Get the lengh of the first column: maximum length of the op_type
diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc
index c4b9ce6d523..68feb551535 100644
--- a/lite/api/python/pybind/pybind.cc
+++ b/lite/api/python/pybind/pybind.cc
@@ -202,6 +202,7 @@ void BindLitePlace(py::module *m) {
       .value("APU", TargetType::kAPU)
       .value("HUAWEI_ASCEND_NPU", TargetType::kHuaweiAscendNPU)
       .value("IMAGINATION_NNA", TargetType::kImaginationNNA)
+      .value("INTEL_FPGA", TargetType::kIntelFPGA)
       .value("Any", TargetType::kAny);
 
   // PrecisionType
diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt
index 9c86bf2649d..6441c1eee25 100644
--- a/lite/core/arena/CMakeLists.txt
+++ b/lite/core/arena/CMakeLists.txt
@@ -6,5 +6,5 @@ endif()
 lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
 
 if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
-  lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${apu_kernels} ${huawei_ascend_npu_kernels} ${imagination_nna_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${apu_kernels} ${huawei_ascend_npu_kernels} ${imagination_nna_kernels} ${intel_fpga_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
diff --git a/lite/tools/build_linux.sh b/lite/tools/build_linux.sh
index 0a62f90793a..7e23cf08baf 100755
--- a/lite/tools/build_linux.sh
+++ b/lite/tools/build_linux.sh
@@ -5,7 +5,7 @@ set -e
 # 1. global variables, you can change them according to your requirements
 #####################################################################################################
 # armv8 or armv7hf or armv7, default armv8.
-ARCH=armv7hf
+ARCH=armv8
 # gcc or clang, default gcc.
 TOOLCHAIN=gcc
 # ON or OFF, default OFF.
@@ -34,8 +34,8 @@ IMAGINATION_NNA_SDK_ROOT="$(pwd)/imagination_nna_sdk"
 WITH_BAIDU_XPU=OFF
 BAIDU_XPU_SDK_ROOT=""
 # options of compiling intel fpga.
-WITH_INTEL_FPGA=ON
-INTEL_FPGA_SDK_ROOT="$(pwd)/intelfpga_sdk" 
+WITH_INTEL_FPGA=OFF
+INTEL_FPGA_SDK_ROOT="$(pwd)/intel_fpga_sdk" 
 # options of adding training ops
 WITH_TRAIN=OFF
 # num of threads used during compiling..
diff --git a/lite/tools/cmake_tools/record_supported_kernel_op.py b/lite/tools/cmake_tools/record_supported_kernel_op.py
index 0e36569f501..36343f3292b 100644
--- a/lite/tools/cmake_tools/record_supported_kernel_op.py
+++ b/lite/tools/cmake_tools/record_supported_kernel_op.py
@@ -56,7 +56,7 @@
 ops_lines = []
 
 # valid targets and valid_ops
-valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU", "kHuaweiAscendNPU", "kImaginationNNA"]
+valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU", "kHuaweiAscendNPU", "kImaginationNNA","KIntelFPGA"]
 valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
 class TargetType:
     kUnk = 0
@@ -75,6 +75,7 @@ class TargetType:
     kAPU = 13
     kHuaweiAscendNPU = 14
     kImaginationNNA = 15
+    kIntelFPGA = 16
 
 
 # record op_info of valid kernels into `valid_ops` according to different target type

From 063e302e01ccabc656a01b8d4cd056b547dd5679 Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Thu, 25 Mar 2021 05:39:36 +0000
Subject: [PATCH 13/19] test=develop

---
 docs/demo_guides/intel_fpga.md | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md
index 98ff5f1b18d..f564fc8707d 100644
--- a/docs/demo_guides/intel_fpga.md
+++ b/docs/demo_guides/intel_fpga.md
@@ -18,11 +18,15 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
 
 ### 已支持的设备
 
-- 海运捷讯C5MB开发板
+- 海运捷讯C5MB（英特尔FPGA Cyclone V）开发板
+- 海运捷讯C5CB（英特尔FPGA Cyclone V）开发板
+- 海运捷讯C5TB（英特尔FPGA Cyclone V）开发板
 
 ### 已支持的Paddle模型
 
-- [全量化MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/ssd_mobilenet_v1.tar.gz)
+- [全量化MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/mobilenet_v1.tar.gz)
+- [全量化SSD_MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/ssd_mobilenet_v1.tar.gz)
+- [全量化YOLOV3](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/yolov3.tar.gz)
 
 ### 已支持（或部分支持）的Paddle算子
 
@@ -56,7 +60,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
   - Docker交叉编译环境：由于C5MB运行环境为Ubuntu，因此不能直接使用[编译环境准备](../source_compile/compile_env)中的docker image，而需要按照如下方式在Host机器上手动构建Ubuntu的docker image；
 
     ```
-    $ wget https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/Dockerfile
+    $ wget https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/Dockerfile
     $ docker build --network=host -t paddlepaddle/paddle-lite-ubuntu16_04:1.0 .
     $ docker run --name paddle-lite-ubuntu16_04 --net=host -it --privileged -v $PWD:/Work -w /Work paddlepaddle/paddle-lite-ubuntu16_04:1.0 /bin/bash
     ```
@@ -82,9 +86,10 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
         - labels
           - pascalvoc_label_list # 检测label文件
         - models
-          - ssd_mobilenet_v1 # Non-combined格式的、SSD量化模型
+          - ssd_mobilenet_v1 # Combined格式的protobuf量化模型
             - __model__ # 已通过opt转好的拓扑信息模型文件
 			- __params__ # 已通过opt转好的参数信息模型文件
+		  - ssd_mobilenet_v1.nb # 已通过opt转好的、适合ARM CPU的naive_buffer量化模型
       - shell
         - CMakeLists.txt # 示例程序CMake脚本
         - build
@@ -99,7 +104,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
         - armhf
           - include # PaddleLite头文件
           - lib
-            - libvnna.so # 英特尔FPGA接口库
+            - libvnna.so # 英特尔FPGA推理运行时库
 			- libpaddle_light_api_shared.so # 用于最终移动端部署的预编译PaddleLite库（tiny publish模式下编译生成的库）
             - libpaddle_full_api_shared.so # 用于直接加载Paddle模型进行测试和Debug的预编译PaddleLite库（full publish模式下编译生成的库）
   ```

From 71568a2388857f3a95b020a82d50075338620f6b Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Thu, 25 Mar 2021 05:54:47 +0000
Subject: [PATCH 14/19] test=develop

---
 docs/demo_guides/intel_fpga.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md
index f564fc8707d..8e0cc037e87 100644
--- a/docs/demo_guides/intel_fpga.md
+++ b/docs/demo_guides/intel_fpga.md
@@ -155,7 +155,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
     ```shell
     $ ./lite/tools/build_linux.sh --arch=armv7hf --with_extra=ON --with_log=ON --with_intel_fpga=ON --intel_fpga_sdk_root=./intel_fpga_sdk
 
-    将tiny_publish模式下编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intel_fpga/cxx/lib/libpaddle_light_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_light_api_shared.so文件；
+    将tiny_publish模式下编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intel_fpga/cxx/lib/libpaddle_light_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_light_api_shared.so文件；
 	```
   - full_publish编译方式
   ```shell

From de84566917250f92f2916452420ab8d6cd957d7d Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Fri, 26 Mar 2021 10:39:50 +0000
Subject: [PATCH 15/19] test=develop

---
 docs/demo_guides/intel_fpga.md                | 53 ++++++++++---------
 lite/api/opt_base.cc                          |  2 +-
 .../cmake_tools/record_supported_kernel_op.py |  4 +-
 3 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md
index 8e0cc037e87..b9f59000874 100644
--- a/docs/demo_guides/intel_fpga.md
+++ b/docs/demo_guides/intel_fpga.md
@@ -24,9 +24,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
 
 ### 已支持的Paddle模型
 
-- [全量化MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/mobilenet_v1.tar.gz)
-- [全量化SSD_MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/ssd_mobilenet_v1.tar.gz)
-- [全量化YOLOV3](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/yolov3.tar.gz)
+- [ssd_mobilenet_v1_pascalvoc](https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz)
 
 ### 已支持（或部分支持）的Paddle算子
 
@@ -38,15 +36,15 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
 
 ## 准备工作
 
-开发板C5MB可以通过串口线进行连接，也可以通过ssh进行连接，初次使用请参考[文档](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/AIGO_C5MB_UG.pdf)
+开发板C5MB可以通过串口线进行连接，也可以通过ssh进行连接，初次使用请参考[文档](https://paddlelite-demo.bj.bcebos.com/devices/intel/AIGO_C5MB_UG.pdf)
 
 ## 参考示例演示
 
 ### 测试设备(C5MB开发板)
 
-![c5mb_front](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/c5mb_front.jpg)
+![c5mb_front](https://paddlelite-demo.bj.bcebos.com/devices/intel/c5mb_front.jpg)
 
-![c5mb_back](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/c5mb_back.jpg)
+![c5mb_back](https://paddlelite-demo.bj.bcebos.com/devices/intel/c5mb_back.jpg)
 
 ### 准备设备环境
 
@@ -56,16 +54,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
 
 ### 准备交叉编译环境
 
-- 按照以下两种方式配置交叉编译环境：
-  - Docker交叉编译环境：由于C5MB运行环境为Ubuntu，因此不能直接使用[编译环境准备](../source_compile/compile_env)中的docker image，而需要按照如下方式在Host机器上手动构建Ubuntu的docker image；
-
-    ```
-    $ wget https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/Dockerfile
-    $ docker build --network=host -t paddlepaddle/paddle-lite-ubuntu16_04:1.0 .
-    $ docker run --name paddle-lite-ubuntu16_04 --net=host -it --privileged -v $PWD:/Work -w /Work paddlepaddle/paddle-lite-ubuntu16_04:1.0 /bin/bash
-    ```
-
-- Ubuntu交叉编译环境：要求Host为Ubuntu16.04系统，参考[编译环境准备](../source_compile/compile_env)中的"交叉编译ARM Linux"步骤安装交叉编译工具链。
+- 为了保证编译环境一致，建议参考[编译环境准备](../source_compile/compile_env)中的Docker开发环境进行配置；
 - 由于需要通过scp和ssh命令将交叉编译生成的PaddleLite库和示例程序传输到设备上执行，因此，在进入Docker容器后还需要安装如下软件：
 
   ```
@@ -74,7 +63,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
 
 ### 运行图像检测示例程序
 
-- 下载示例程序[PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/PaddleLite-linux-demo.tar.gz)，解压后清单如下：
+- 下载示例程序[PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/intel/PaddleLite-linux-demo.tar.gz)，解压后清单如下：
 
   ```shell
   - PaddleLite-linux-demo
@@ -86,10 +75,8 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
         - labels
           - pascalvoc_label_list # 检测label文件
         - models
-          - ssd_mobilenet_v1 # Combined格式的protobuf量化模型
-            - __model__ # 已通过opt转好的拓扑信息模型文件
-			- __params__ # 已通过opt转好的参数信息模型文件
-		  - ssd_mobilenet_v1.nb # 已通过opt转好的、适合ARM CPU的naive_buffer量化模型
+		  - ssd_mobilenet_v1_fp32_300_for_intel_fpga
+		    - model.nb # 已通过opt转好的、适合英特尔FPGA的mobilenetv1量化模型
       - shell
         - CMakeLists.txt # 示例程序CMake脚本
         - build
@@ -116,10 +103,10 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
   1）run.sh必须在Host机器上运行，且执行前需要配置目标设备的IP地址、SSH账号和密码；
   2）build.sh建议在docker环境中执行，目前英特尔FPGA在PaddleLite上只支持armhf。
 
-  运行适用于英特尔FPGA的mobilenetv1全量化模型
+  运行适用于英特尔FPGA的ssd_mobilenet_v1量化模型
   $ cd PaddleLite-linux-demo/ssd_detection/shell
   $ vim ./run.sh
-    MODEL_NAME设置为ssd_mobilenet_v1
+    MODEL_NAME设置为ssd_mobilenet_v1_fp32_300_for_intel_fpga
   $ ./run.sh
     iter 0 cost: 3079.443115 ms
     iter 1 cost: 3072.508057 ms
@@ -137,6 +124,22 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
 - 如果需要更改测试图片，可通过convert_to_raw_image.py工具生成；
 - 如果需要重新编译示例程序，直接运行./build.sh即可，注意：build.sh的执行建议在docker环境中，否则可能编译出错。
 
+### 更新模型
+
+- 通过Paddle Fluid训练，或X2Paddle转换得到MobileNetv1 float32模型[ssd_mobilenet_v1_fp32_300_fluid](https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz)；
+- 参考[模型转化方法](../user_guides/model_optimize_tool)，利用opt工具转换生成英特尔FPGA模型，仅需要将valid_targets设置为intel_fpga,arm即可。
+  ```shell
+  $ ./opt --model_dir=ssd_mobilenet_v1_fp32_300_for_intel_fpga \
+      --optimize_out_type=naive_buffer \
+      --optimize_out=opt_model \
+      --valid_targets=intel_fpga,arm
+  
+  替换自带的英特尔FPGA模型
+  $ cp opt_model.nb ssd_mobilenet_v1_fp32_300_for_intel_fpga/model.nb
+  ```
+
+- 注意：opt生成的模型只是标记了英特尔FPGA支持的Paddle算子，并没有真正生成英特尔FPGA模型，只有在执行时才会将标记的Paddle算子转成英特尔FPGA的APIs，最终生成并执行模型。
+
 ### 更新支持英特尔FPGA的PaddleLite库
 
 - 下载PaddleLite源码和英特尔FPGA的SDK
@@ -145,7 +148,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
   $ git clone https://github.com/PaddlePaddle/Paddle-Lite.git
   $ cd Paddle-Lite
   $ git checkout <release-version-tag>
-  $ curl -L https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/intel_fpga_sdk.tar.gz -o - | tar -zx
+  $ curl -L https://paddlelite-demo.bj.bcebos.com/devices/intel/intel_fpga_sdk.tar.gz -o - | tar -zx
   ```
 
 - 编译并生成PaddleLite+IntelFPGA的部署库
@@ -164,6 +167,6 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
   将full_publish模式下编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intel_fpga/cxx/lib/libpaddle_full_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_full_api_shared.so文件。
   ```
 
-  - 将编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intelfpga/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/armhf/include目录；  
+  - 将编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intel_fpga/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/armhf/include目录；  
 
 ## 其它说明
diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc
index f3e4b21269c..09a4ac30d90 100644
--- a/lite/api/opt_base.cc
+++ b/lite/api/opt_base.cc
@@ -269,7 +269,7 @@ void OptBase::PrintHelpInfo() {
       "        `set_lite_out(output_optimize_model_dir)`\n"
       "        "
       "`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu|"
-      "imagination_nna)`\n"
+      "imagination_nna|intel_fpga)`\n"
       "        `record_model_info(false|true)`: refer to whether to record ops "
       "info for striping lib, false by default`\n"
       "        `run() : start model transformation`\n"
diff --git a/lite/tools/cmake_tools/record_supported_kernel_op.py b/lite/tools/cmake_tools/record_supported_kernel_op.py
index 36343f3292b..4ec0d3a2689 100644
--- a/lite/tools/cmake_tools/record_supported_kernel_op.py
+++ b/lite/tools/cmake_tools/record_supported_kernel_op.py
@@ -56,8 +56,8 @@
 ops_lines = []
 
 # valid targets and valid_ops
-valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU", "kHuaweiAscendNPU", "kImaginationNNA","KIntelFPGA"]
-valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
+valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU", "kHuaweiAscendNPU", "kImaginationNNA","kIntelFPGA"]
+valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
 class TargetType:
     kUnk = 0
     kHost = 1

From e39781d91e7210dcfd254aa2abeea3383a46681d Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Mon, 29 Mar 2021 09:53:59 +0000
Subject: [PATCH 16/19] test=develop

---
 lite/kernels/intel_fpga/conv_compute.cc  | 4 ++--
 lite/kernels/intel_fpga/conv_gemmlike.cc | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/lite/kernels/intel_fpga/conv_compute.cc b/lite/kernels/intel_fpga/conv_compute.cc
index 763ca83c7a2..4d2d55feca3 100644
--- a/lite/kernels/intel_fpga/conv_compute.cc
+++ b/lite/kernels/intel_fpga/conv_compute.cc
@@ -59,10 +59,10 @@ void ConvCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
   /// select conv impl
   if (param.groups == ic && ic == oc && ks_equal && no_dilation && flag_dw) {
     impl_ = new DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>;
-    // VLOG(3) << "invoking dw conv";
+    VLOG(3) << "[IntelFPGA] invoking depthwise conv";
   } else {
     impl_ = new GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>;
-    // VLOG(3) << "invoking gemm like conv";
+    VLOG(3) << "[IntelFPGA] invoking common conv";
   }
   if (!arm_cxt_) {
     arm_cxt_ = ContextScheduler::Global().NewContext(TargetType::kARM);
diff --git a/lite/kernels/intel_fpga/conv_gemmlike.cc b/lite/kernels/intel_fpga/conv_gemmlike.cc
index 849dabc3dcf..8dfcde783b9 100644
--- a/lite/kernels/intel_fpga/conv_gemmlike.cc
+++ b/lite/kernels/intel_fpga/conv_gemmlike.cc
@@ -16,6 +16,7 @@
 #include <vector>
 #include "lite/backends/arm/math/gemm_prepacked_int8.h"
 #include "lite/backends/arm/math/packed_sgemm.h"
+#include "lite/utils/logging.h"
 
 namespace paddle {
 namespace lite {
@@ -109,7 +110,7 @@ void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
     conv.op.oh = o_dims[2];
     conv.op.ow = o_dims[3];
     if (intelfpga_conv2d(&conv)) {
-      std::cout << "intel_fpga_conv error" << std::endl;
+      LOG(WARNING) << "[IntelFPGA] Conv_Compute failed";
     }
   } else {
     if (flag_1x1gemm_) {

From f5a0046884a53e55bb88d64a8831ee7f89375d2e Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Mon, 29 Mar 2021 11:23:07 +0000
Subject: [PATCH 17/19] test=develop

---
 docs/demo_guides/intel_fpga.md              |  13 +-
 docs/introduction/support_hardware.md       |   5 +
 docs/introduction/support_operation_list.md | 412 ++++++++++----------
 3 files changed, 218 insertions(+), 212 deletions(-)

diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md
index b9f59000874..187f82f563b 100644
--- a/docs/demo_guides/intel_fpga.md
+++ b/docs/demo_guides/intel_fpga.md
@@ -28,15 +28,16 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
 
 ### 已支持（或部分支持）的Paddle算子
 
-- relu/relu6/leakyrelu
 - conv2d
 - depthwise_conv2d
-- pool2d
-- fc
 
 ## 准备工作
 
 开发板C5MB可以通过串口线进行连接，也可以通过ssh进行连接，初次使用请参考[文档](https://paddlelite-demo.bj.bcebos.com/devices/intel/AIGO_C5MB_UG.pdf)
+可以通过串口完成C5MB开发板的IP修改：
+  ```
+  $ vi /etc/network/interfaces # 设备网络配置文件，将对应的address，netmask，和gateway设置为指定的地址即可。
+  ```
 
 ## 参考示例演示
 
@@ -76,7 +77,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
           - pascalvoc_label_list # 检测label文件
         - models
 		  - ssd_mobilenet_v1_fp32_300_for_intel_fpga
-		    - model.nb # 已通过opt转好的、适合英特尔FPGA的mobilenetv1量化模型
+		    - ssd_mobilenet_v1_fp32_300_for_intel_fpga.nb # 已通过opt转好的、适合英特尔FPGA的mobilenetv1量化模型
       - shell
         - CMakeLists.txt # 示例程序CMake脚本
         - build
@@ -85,7 +86,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
         - convert_to_raw_image.py # 将测试图片保存为raw数据的python脚本
         - build.sh # 示例程序编译脚本
         - run.sh # 示例程序运行脚本
-		- intelfpgadrv.ko # 英特尔FPGA内核驱动程序
+        - intelfpgadrv.ko # 英特尔FPGA内核驱动程序
     - libs
       - PaddleLite
         - armhf
@@ -135,7 +136,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
       --valid_targets=intel_fpga,arm
   
   替换自带的英特尔FPGA模型
-  $ cp opt_model.nb ssd_mobilenet_v1_fp32_300_for_intel_fpga/model.nb
+  $ cp opt_model.nb ssd_mobilenet_v1_fp32_300_for_intel_fpga/ssd_mobilenet_v1_fp32_300_for_intel_fpga.nb
   ```
 
 - 注意：opt生成的模型只是标记了英特尔FPGA支持的Paddle算子，并没有真正生成英特尔FPGA模型，只有在执行时才会将标记的Paddle算子转成英特尔FPGA的APIs，最终生成并执行模型。
diff --git a/docs/introduction/support_hardware.md b/docs/introduction/support_hardware.md
index 3b72212eb87..59e103db752 100644
--- a/docs/introduction/support_hardware.md
+++ b/docs/introduction/support_hardware.md
@@ -56,6 +56,11 @@ Paddle Lite支持 瑞芯微 (Rockchip) NPU，支持列表如下：
 - 支持芯片：RK1808, RK1806，暂不支持RK3399Pro
 - 支持设备：RK1808/1806 EVB，TB-RK1808S0
 
+## 英特尔 (Intel) FPGA
+Paddle Lite支持 英特尔 (Intel) FPGA，支持列表如下：
+- 支持芯片：Cyclone V
+- 支持设备：C5MB，C5TB和C5CB
+
 ## 联发科 (MediaTek) APU
 Paddle Lite支持 联发科 (MediaTek) APU，支持列表如下：
 - 支持芯片：MT8168/MT8175，及其他智能芯片
diff --git a/docs/introduction/support_operation_list.md b/docs/introduction/support_operation_list.md
index 14d49dc7c9c..4605e67e699 100644
--- a/docs/introduction/support_operation_list.md
+++ b/docs/introduction/support_operation_list.md
@@ -10,217 +10,217 @@ Host端Kernel是算子在任意CPU上纯C/C++的具体实现，具有可移植
 
 举例PaddleLite在ARM上部署模型，如果模型中某个算子没有ARM端Kernel，但是有Host端Kerenel，那么模型优化阶段该算子会选择Host端Kerenel，该模型还是可以顺利部署。
 
-| OP Name | Host | X86 | CUDA | ARM | OpenCL | FPGA | 华为NPU | 百度XPU | 瑞芯微NPU | 联发科APU | 颖脉NNA |
-|-:|-|-|-|-|-|-|-|-|-|-|-|
-| affine_channel | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| affine_grid | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| arg_max | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| assign_value | 　 | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| batch_norm | 　 | Y | 　 | Y | 　 | 　 | Y | Y | Y | 　 |　 |
-| bilinear_interp | 　 | 　 | Y | Y | Y | 　 | Y | 　 | 　 | 　 |　 |
-| box_coder | 　 | 　 | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 |　 |
-| calib | 　 | 　 | Y | Y | 　 | Y | 　 | 　 | 　 | 　 |　 |
-| cast | 　 | Y | 　 | Y | 　 | 　 | 　 | Y | 　 | 　 |　 |
-| concat | 　 | Y | Y | Y | Y | 　 | Y | 　 | Y | Y |　|
-| conv2d | 　 | Y | Y | Y | Y | Y | Y | Y | Y | Y |　Y |
-| conv2d_transpose | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | Y |　 |
-| density_prior_box | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| depthwise_conv2d | 　 | Y | Y | Y | Y | Y | Y | Y | Y | Y |　Y |
-| depthwise_conv2d_transpose | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| dropout | 　 | Y | Y | Y | Y | Y | Y | Y | 　 | 　 |　 |
-| elementwise_add | 　 | Y | Y | Y | Y | Y | Y | Y | Y | Y |　 |
-| elementwise_div | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | Y | 　 |　 |
-| elementwise_max | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| elementwise_mod | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| elementwise_mul | 　 | Y | Y | Y | Y | Y | Y | 　 | Y | Y |　 |
-| elementwise_pow | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| elementwise_sub | 　 | Y | Y | Y | Y | 　 | Y | 　 | Y | 　 |　 |
-| elu | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| expand | Y | 　 | 　 | 　 | Y | 　 | Y | 　 | 　 | 　 |　 |
-| expand_as | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| fc | 　 | Y | Y | Y | Y | Y | Y | 　 | Y | Y |　Y |
-| feed | Y | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 | 　 |　 |
-| fetch | Y | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 |　 |
-| fill_constant | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| fill_constant_batch_size_like | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| flatten | Y | 　 | 　 | 　 | Y | 　 | 　 | 　 | Y | 　 |　 |
-| flatten2 | Y | 　 | 　 | 　 | Y | 　 | 　 | 　 | Y | 　 |　 |
-| fusion_elementwise_add_activation | 　 | 　 | Y | Y | Y | Y | Y | 　 | 　 | Y  |　 |
-| fusion_elementwise_div_activation | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 |　 |
-| fusion_elementwise_max_activation | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| fusion_elementwise_mul_activation | 　 | 　 | Y | Y | 　 | 　 | Y | 　 | 　 | 　 |　 |
-| fusion_elementwise_sub_activation | 　 | 　 | Y | Y | Y | 　 | Y | 　 | 　 | 　 |　 |
-| grid_sampler | 　 | 　 | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 |　 |
-| instance_norm | 　 | 　 | 　 | Y | Y | 　 | Y | 　 | 　 | 　 |　 |
-| io_copy | 　 | 　 | Y | 　 | Y | Y | 　 | 　 | 　 | 　 |　 |
-| io_copy_once | 　 | 　 | Y | 　 | Y | Y | 　 | 　 | 　 | 　 |　 |
-| layout | 　 | 　 | Y | Y | Y | Y | 　 | 　 | 　 | 　 |　 |
-| leaky_relu | 　 | Y | Y | Y | Y | 　 | Y | 　 | 　 | 　 |　 |
-| matmul | 　 | Y | Y | Y | 　 | 　 | Y | Y | 　 | 　 |　 |
-| mul | 　 | Y | Y | Y | 　 | 　 | Y | Y | 　 | 　 |　 |
-| multiclass_nms | Y | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 |　 |
-| multiclass_nms2 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| nearest_interp | 　 | 　 | Y | Y | Y | 　 | Y | 　 | 　 | 　 |　 |
-| pad2d | 　 | 　 | 　 | Y | Y | 　 | Y | 　 | Y | 　 |　 |
-| pool2d | 　 | Y | Y | Y | Y | Y | Y | Y | Y | Y |　Y |
-| prelu | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| prior_box | 　 | 　 | 　 | Y | 　 | Y | 　 | 　 | 　 | 　 |　 |
-| range | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| reduce_mean | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 |　 |
-| relu | 　 | Y | Y | Y | Y | 　 | Y | 　 | Y | Y |　Y |
-| relu6 | 　 | 　 | 　 | Y | Y | 　 | Y | 　 | Y | 　 |　 |
-| reshape | Y | Y | 　 | 　 | Y | 　 | Y | Y | 　 | 　 |　 |
-| reshape2 | Y | Y | 　 | 　 | Y | 　 | Y | Y | Y | 　 |　 |
-| scale | 　 | Y | Y | Y | Y | Y | Y | Y | Y | 　 |　 |
-| search_fc | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| sequence_topk_avg_pooling | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| shuffle_channel | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 |　 |
-| sigmoid | 　 | Y | Y | Y | Y | 　 | Y | 　 | Y | 　 |　 |
-| slice | 　 | Y | 　 | Y | Y | 　 | 　 | Y | 　 | 　 |　 |
-| softmax | 　 | Y | Y | Y | 　 | 　 | Y | Y | Y | Y |　 |
-| split | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 |　 |
-| squeeze | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| squeeze2 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |
-| stack | 　 | Y | 　 | Y | 　 | 　 | 　 | Y | 　 | 　 |　 |
-| subgraph | 　 | 　 | 　 | 　 | 　 | 　 | Y | Y | Y | Y |　 |
-| tanh | 　 | Y | Y | Y | Y | 　 | Y | Y | 　 | 　 |　 |
-| thresholded_relu | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 |　 |
-| transpose | 　 | Y | Y | Y | Y | 　 | Y | Y | 　 | 　 |　 |
-| transpose2 | 　 | Y | Y | Y | Y | 　 | Y | Y | Y | 　 |　 |
-| unsqueeze | Y | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |　 |
-| unsqueeze2 | Y | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |　 |
-| yolo_box | 　 | 　 | Y | Y | 　 | 　 | 　 | Y | 　 | 　 |　 |
+| OP Name | Host | X86 | CUDA | ARM | OpenCL | FPGA | 华为NPU | 百度XPU | 瑞芯微NPU | 联发科APU | 颖脉NNA | 英特尔FPGA |
+|-:|-|-|-|-|-|-|-|-|-|-|-|-|
+| affine_channel | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| affine_grid | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| arg_max | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| assign_value | 　 | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| batch_norm | 　 | Y | 　 | Y | 　 | 　 | Y | Y | Y | 　 |　 |　 |
+| bilinear_interp | 　 | 　 | Y | Y | Y | 　 | Y | 　 | 　 | 　 |　 |　 |
+| box_coder | 　 | 　 | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| calib | 　 | 　 | Y | Y | 　 | Y | 　 | 　 | 　 | 　 |　 |　 |
+| cast | 　 | Y | 　 | Y | 　 | 　 | 　 | Y | 　 | 　 |　 |　 |
+| concat | 　 | Y | Y | Y | Y | 　 | Y | 　 | Y | Y |　|　 |
+| conv2d | 　 | Y | Y | Y | Y | Y | Y | Y | Y | Y |　Y | Y |
+| conv2d_transpose | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | Y |　 |　 |
+| density_prior_box | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| depthwise_conv2d | 　 | Y | Y | Y | Y | Y | Y | Y | Y | Y |　Y | Y |
+| depthwise_conv2d_transpose | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| dropout | 　 | Y | Y | Y | Y | Y | Y | Y | 　 | 　 |　 |　 |
+| elementwise_add | 　 | Y | Y | Y | Y | Y | Y | Y | Y | Y |　 |　 |
+| elementwise_div | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | Y | 　 |　 |　 |
+| elementwise_max | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| elementwise_mod | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| elementwise_mul | 　 | Y | Y | Y | Y | Y | Y | 　 | Y | Y |　 |　 |
+| elementwise_pow | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| elementwise_sub | 　 | Y | Y | Y | Y | 　 | Y | 　 | Y | 　 |　 |　 |
+| elu | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| expand | Y | 　 | 　 | 　 | Y | 　 | Y | 　 | 　 | 　 |　 |　 |
+| expand_as | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| fc | 　 | Y | Y | Y | Y | Y | Y | 　 | Y | Y |　Y |　 |
+| feed | Y | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 | 　 |　 |　 |
+| fetch | Y | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 |　 |　 |
+| fill_constant | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| fill_constant_batch_size_like | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| flatten | Y | 　 | 　 | 　 | Y | 　 | 　 | 　 | Y | 　 |　 |　 |
+| flatten2 | Y | 　 | 　 | 　 | Y | 　 | 　 | 　 | Y | 　 |　 |　 |
+| fusion_elementwise_add_activation | 　 | 　 | Y | Y | Y | Y | Y | 　 | 　 | Y  |　 |　 |
+| fusion_elementwise_div_activation | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 |　 |　 |
+| fusion_elementwise_max_activation | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| fusion_elementwise_mul_activation | 　 | 　 | Y | Y | 　 | 　 | Y | 　 | 　 | 　 |　 |　 |
+| fusion_elementwise_sub_activation | 　 | 　 | Y | Y | Y | 　 | Y | 　 | 　 | 　 |　 |　 |
+| grid_sampler | 　 | 　 | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| instance_norm | 　 | 　 | 　 | Y | Y | 　 | Y | 　 | 　 | 　 |　 |　 |
+| io_copy | 　 | 　 | Y | 　 | Y | Y | 　 | 　 | 　 | 　 |　 |　 |
+| io_copy_once | 　 | 　 | Y | 　 | Y | Y | 　 | 　 | 　 | 　 |　 |　 |
+| layout | 　 | 　 | Y | Y | Y | Y | 　 | 　 | 　 | 　 |　 |　 |
+| leaky_relu | 　 | Y | Y | Y | Y | 　 | Y | 　 | 　 | 　 |　 |　 |
+| matmul | 　 | Y | Y | Y | 　 | 　 | Y | Y | 　 | 　 |　 |　 |
+| mul | 　 | Y | Y | Y | 　 | 　 | Y | Y | 　 | 　 |　 |　 |
+| multiclass_nms | Y | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 |　 |　 |
+| multiclass_nms2 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| nearest_interp | 　 | 　 | Y | Y | Y | 　 | Y | 　 | 　 | 　 |　 |　 |
+| pad2d | 　 | 　 | 　 | Y | Y | 　 | Y | 　 | Y | 　 |　 |　 |
+| pool2d | 　 | Y | Y | Y | Y | Y | Y | Y | Y | Y |　Y |　 |
+| prelu | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| prior_box | 　 | 　 | 　 | Y | 　 | Y | 　 | 　 | 　 | 　 |　 |　 |
+| range | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| reduce_mean | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 |　 |　 |
+| relu | 　 | Y | Y | Y | Y | 　 | Y | 　 | Y | Y |　Y |　 |
+| relu6 | 　 | 　 | 　 | Y | Y | 　 | Y | 　 | Y | 　 |　 |　 |
+| reshape | Y | Y | 　 | 　 | Y | 　 | Y | Y | 　 | 　 |　 |　 |
+| reshape2 | Y | Y | 　 | 　 | Y | 　 | Y | Y | Y | 　 |　 |　 |
+| scale | 　 | Y | Y | Y | Y | Y | Y | Y | Y | 　 |　 |　 |
+| search_fc | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| sequence_topk_avg_pooling | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| shuffle_channel | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 |　 |　 |
+| sigmoid | 　 | Y | Y | Y | Y | 　 | Y | 　 | Y | 　 |　 |　 |
+| slice | 　 | Y | 　 | Y | Y | 　 | 　 | Y | 　 | 　 |　 |　 |
+| softmax | 　 | Y | Y | Y | 　 | 　 | Y | Y | Y | Y |　 |　 |
+| split | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 |　 |　 |
+| squeeze | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| squeeze2 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |　 |　 |
+| stack | 　 | Y | 　 | Y | 　 | 　 | 　 | Y | 　 | 　 |　 |　 |
+| subgraph | 　 | 　 | 　 | 　 | 　 | 　 | Y | Y | Y | Y |　 |　 |
+| tanh | 　 | Y | Y | Y | Y | 　 | Y | Y | 　 | 　 |　 |　 |
+| thresholded_relu | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 |　 |　 |
+| transpose | 　 | Y | Y | Y | Y | 　 | Y | Y | 　 | 　 |　 |　 |
+| transpose2 | 　 | Y | Y | Y | Y | 　 | Y | Y | Y | 　 |　 |　 |
+| unsqueeze | Y | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |　 |　 |
+| unsqueeze2 | Y | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |　 |　 |
+| yolo_box | 　 | 　 | Y | Y | 　 | 　 | 　 | Y | 　 | 　 |　 |　 |
 
 
 ### 附加算子
 
 附加算子共计127个，需要在编译时打开`--build_extra=ON`开关才会编译，具体请参考[参数详情](../source_compile/library)。
 
-| OP Name | Host | X86 | CUDA | ARM | OpenCL | FPGA | 华为NPU | 百度XPU | 瑞芯微NPU | 联发科APU |
-|-:|-|-|-|-|-|-|-|-|-|-|
-| abs | 　 | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| anchor_generator | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
+| OP Name | Host | X86 | CUDA | ARM | OpenCL | FPGA | 华为NPU | 百度XPU | 瑞芯微NPU | 联发科APU | 英特尔FPGA |
+|-:|-|-|-|-|-|-|-|-|-|-|-|
+| abs | 　 | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| anchor_generator | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
 | assign | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| attention_padding_mask | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| axpy | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| beam_search_decode | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| beam_search_decode | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| box_clip | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| calib_once | 　 | 　 | Y | Y | 　 | Y | 　 | 　 | 　 | 　 |
-| clip | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| collect_fpn_proposals | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| conditional_block | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| crf_decoding | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| crop | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| ctc_align | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| decode_bboxes | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| deformable_conv | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| distribute_fpn_proposals | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| equal | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| exp | 　 | 　 | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 |
-| fake_channel_wise_dequantize_max_abs | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| fake_dequantize_max_abs | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| fake_quantize_abs_max | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| fake_quantize_dequantize_abs_max | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| fake_quantize_dequantize_moving_average_abs_max | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| fake_quantize_moving_average_abs_max | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| fake_quantize_range_abs_max | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| floor | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| gather | 　 | Y | 　 | Y | 　 | 　 | 　 | Y | 　 | 　 |
-| gelu | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| generate_proposals | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| greater_equal | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| greater_than | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| group_norm | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| gru | 　 | Y | Y | Y | 　 | Y | 　 | 　 | 　 | 　 |
-| gru_unit | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| hard_sigmoid | 　 | 　 | 　 | Y | Y | 　 | Y | 　 | 　 | 　 |
-| hard_swish | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| im2sequence | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| increment | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 |
-| is_empty | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| layer_norm | 　 | Y | 　 | Y | 　 | 　 | Y | Y | 　 | 　 |
-| layout_once | 　 | 　 | Y | Y | 　 | Y | 　 | 　 | 　 | 　 |
-| less_equal | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| less_than | Y | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |
-| lod_reset | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| log | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 |
-| logical_and | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| logical_not | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| logical_or | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| logical_xor | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| lookup_table | 　 | Y | Y | Y | 　 | 　 | 　 | Y | 　 | 　 |
-| lookup_table_dequant | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| lookup_table_v2 | 　 | Y | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| lrn | 　 | 　 | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 |
-| lstm | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| match_matrix_tensor | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| max_pool2d_with_index | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| mean | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| merge_lod_tensor | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| negative | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| norm | 　 | 　 | 　 | Y | 　 | Y | 　 | 　 | 　 | 　 |
-| not_equal | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| one_hot | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| pixel_shuffle | Y | 　 | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 |
-| pow | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| print | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| read_from_array | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| reciprocal | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| reduce_max | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| reduce_prod | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| reduce_sum | 　 | Y | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 |
-| relu_clipped | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 |
-| retinanet_detection_output | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| roi_align | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| rsqrt | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| search_aligned_mat_mul | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| search_attention_padding_mask | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| search_grnn | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| search_group_padding | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| search_seq_arithmetic | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| search_seq_depadding | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| search_seq_fc | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| search_seq_softmax | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| sequence_arithmetic | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| sequence_concat | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| sequence_conv | 　 | Y | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| sequence_expand | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| sequence_expand_as | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| sequence_mask | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| sequence_pad | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| sequence_pool | 　 | Y | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| sequence_pool_concat | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| sequence_reshape | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| sequence_reverse | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| sequence_reverse_embedding | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| sequence_softmax | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| sequence_unpad |  Y　|  | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| shape | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| sign | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| softsign | 　 | Y | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |
-| split_lod_tensor | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| sqrt | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |
-| square | 　 | Y | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 |
-| swish | 　 | 　 | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 |
-| top_k | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 |
-| topk_pooling | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| uniform_random | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| var_conv_2d | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| where_index | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| while | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| write_to_array | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
-| __xpu__conv2d | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 |
-| __xpu__embedding_with_eltwise_add | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 |
-| __xpu__fc | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 |
-| __xpu__mmdnn_bid_emb_att | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 |
-| __xpu__mmdnn_bid_emb_grnn_att | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 |
-| __xpu__mmdnn_bid_emb_grnn_att2 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 |
-| __xpu__mmdnn_match_conv_topk | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 |
+| attention_padding_mask | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| axpy | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| beam_search_decode | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| beam_search_decode | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| box_clip | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| calib_once | 　 | 　 | Y | Y | 　 | Y | 　 | 　 | 　 | 　 | 　 |
+| clip | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| collect_fpn_proposals | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| conditional_block | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| crf_decoding | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| crop | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| ctc_align | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| decode_bboxes | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| deformable_conv | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| distribute_fpn_proposals | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| equal | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| exp | 　 | 　 | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 |
+| fake_channel_wise_dequantize_max_abs | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| fake_dequantize_max_abs | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| fake_quantize_abs_max | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| fake_quantize_dequantize_abs_max | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| fake_quantize_dequantize_moving_average_abs_max | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| fake_quantize_moving_average_abs_max | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| fake_quantize_range_abs_max | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| floor | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| gather | 　 | Y | 　 | Y | 　 | 　 | 　 | Y | 　 | 　 | 　 |
+| gelu | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| generate_proposals | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| greater_equal | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| greater_than | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| group_norm | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| gru | 　 | Y | Y | Y | 　 | Y | 　 | 　 | 　 | 　 | 　 |
+| gru_unit | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| hard_sigmoid | 　 | 　 | 　 | Y | Y | 　 | Y | 　 | 　 | 　 | 　 |
+| hard_swish | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| im2sequence | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| increment | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 | 　 |
+| is_empty | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| layer_norm | 　 | Y | 　 | Y | 　 | 　 | Y | Y | 　 | 　 | 　 |
+| layout_once | 　 | 　 | Y | Y | 　 | Y | 　 | 　 | 　 | 　 | 　 |
+| less_equal | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| less_than | Y | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 |
+| lod_reset | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| log | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 | 　 |
+| logical_and | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| logical_not | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| logical_or | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| logical_xor | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| lookup_table | 　 | Y | Y | Y | 　 | 　 | 　 | Y | 　 | 　 | 　 |
+| lookup_table_dequant | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| lookup_table_v2 | 　 | Y | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| lrn | 　 | 　 | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 |
+| lstm | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| match_matrix_tensor | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| max_pool2d_with_index | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| mean | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| merge_lod_tensor | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| negative | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| norm | 　 | 　 | 　 | Y | 　 | Y | 　 | 　 | 　 | 　 | 　 |
+| not_equal | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| one_hot | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| pixel_shuffle | Y | 　 | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 |
+| pow | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| print | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| read_from_array | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| reciprocal | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| reduce_max | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| reduce_prod | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| reduce_sum | 　 | Y | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |
+| relu_clipped | 　 | 　 | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 | 　 |
+| retinanet_detection_output | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| roi_align | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| rsqrt | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| search_aligned_mat_mul | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| search_attention_padding_mask | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| search_grnn | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| search_group_padding | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| search_seq_arithmetic | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| search_seq_depadding | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| search_seq_fc | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| search_seq_softmax | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| sequence_arithmetic | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| sequence_concat | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| sequence_conv | 　 | Y | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| sequence_expand | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| sequence_expand_as | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| sequence_mask | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| sequence_pad | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| sequence_pool | 　 | Y | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| sequence_pool_concat | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| sequence_reshape | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| sequence_reverse | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| sequence_reverse_embedding | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| sequence_softmax | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| sequence_unpad | Y | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| shape | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| sign | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| softsign | 　 | Y | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 |
+| split_lod_tensor | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| sqrt | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 |
+| square | 　 | Y | 　 | Y | 　 | 　 | Y | 　 | 　 | 　 | 　 |
+| swish | 　 | 　 | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 |
+| top_k | 　 | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| topk_pooling | 　 | 　 | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| uniform_random | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| var_conv_2d | 　 | Y | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| where_index | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| while | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| write_to_array | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| __xpu__conv2d | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |
+| __xpu__embedding_with_eltwise_add | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |
+| __xpu__fc | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |
+| __xpu__mmdnn_bid_emb_att | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |
+| __xpu__mmdnn_bid_emb_grnn_att | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |
+| __xpu__mmdnn_bid_emb_grnn_att2 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |
+| __xpu__mmdnn_match_conv_topk | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |
 | __xpu__mmdnn_merge_all | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 |
-| __xpu__mmdnn_search_attention | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 |
-| __xpu__multi_encoder | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 |
-| __xpu__resnet_cbam | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 |
-| __xpu__resnet50 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 |
-| __xpu__sfa_head | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 |
-| matrix_nms | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |
+| __xpu__mmdnn_search_attention | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |
+| __xpu__multi_encoder | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |
+| __xpu__resnet_cbam | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |
+| __xpu__resnet50 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |
+| __xpu__sfa_head | 　 | 　 | 　 | 　 | 　 | 　 | 　 | Y | 　 | 　 | 　 |
+| matrix_nms | Y | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 | 　 |

From c22d58b23f63c4549aee6f3838156661b149fbc5 Mon Sep 17 00:00:00 2001
From: xbeu <youyq@awcloud.com>
Date: Tue, 30 Mar 2021 09:17:34 +0000
Subject: [PATCH 18/19] test=develop

---
 docs/demo_guides/intel_fpga.md | 4 ++--
 lite/tools/build_linux.sh      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md
index 187f82f563b..0c388c1c453 100644
--- a/docs/demo_guides/intel_fpga.md
+++ b/docs/demo_guides/intel_fpga.md
@@ -24,7 +24,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
 
 ### 已支持的Paddle模型
 
-- [ssd_mobilenet_v1_pascalvoc](https://https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz)
+- [ssd_mobilenet_v1_pascalvoc](https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz)
 
 ### 已支持（或部分支持）的Paddle算子
 
@@ -149,7 +149,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
   $ git clone https://github.com/PaddlePaddle/Paddle-Lite.git
   $ cd Paddle-Lite
   $ git checkout <release-version-tag>
-  $ curl -L https://paddlelite-demo.bj.bcebos.com/devices/intel/intel_fpga_sdk.tar.gz -o - | tar -zx
+  $ curl -L https://paddlelite-demo.bj.bcebos.com/devices/intel/intel_fpga_sdk_1.0.0.tar.gz -o - | tar -zx
   ```
 
 - 编译并生成PaddleLite+IntelFPGA的部署库
diff --git a/lite/tools/build_linux.sh b/lite/tools/build_linux.sh
index 7e23cf08baf..77c628d96e0 100755
--- a/lite/tools/build_linux.sh
+++ b/lite/tools/build_linux.sh
@@ -9,7 +9,7 @@ ARCH=armv8
 # gcc or clang, default gcc.
 TOOLCHAIN=gcc
 # ON or OFF, default OFF.
-WITH_EXTRA=ON
+WITH_EXTRA=OFF
 # controls whether to compile python lib, default is OFF.
 WITH_PYTHON=OFF
 PY_VERSION=""

From 709796428b54d9f920038450003f3022d6835c7f Mon Sep 17 00:00:00 2001
From: YIQUAN YOU <79895409+xbeu@users.noreply.github.com>
Date: Tue, 30 Mar 2021 19:06:59 +0800
Subject: [PATCH 19/19] Update intel_fpga.md

---
 docs/demo_guides/intel_fpga.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md
index 0c388c1c453..2813893af17 100644
--- a/docs/demo_guides/intel_fpga.md
+++ b/docs/demo_guides/intel_fpga.md
@@ -76,8 +76,8 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
         - labels
           - pascalvoc_label_list # 检测label文件
         - models
-		  - ssd_mobilenet_v1_fp32_300_for_intel_fpga
-		    - ssd_mobilenet_v1_fp32_300_for_intel_fpga.nb # 已通过opt转好的、适合英特尔FPGA的mobilenetv1量化模型
+          - ssd_mobilenet_v1_fp32_300_for_intel_fpga
+            - ssd_mobilenet_v1_fp32_300_for_intel_fpga.nb # 已通过opt转好的、适合英特尔FPGA的mobilenetv1量化模型
       - shell
         - CMakeLists.txt # 示例程序CMake脚本
         - build
@@ -93,7 +93,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理，其主要特
           - include # PaddleLite头文件
           - lib
             - libvnna.so # 英特尔FPGA推理运行时库
-			- libpaddle_light_api_shared.so # 用于最终移动端部署的预编译PaddleLite库（tiny publish模式下编译生成的库）
+            - libpaddle_light_api_shared.so # 用于最终移动端部署的预编译PaddleLite库（tiny publish模式下编译生成的库）
             - libpaddle_full_api_shared.so # 用于直接加载Paddle模型进行测试和Debug的预编译PaddleLite库（full publish模式下编译生成的库）
   ```