From c5b3c1dd1c6d5ead00e8b5a077f5a149f87f6e01 Mon Sep 17 00:00:00 2001 From: xbeu Date: Wed, 17 Mar 2021 07:14:24 +0000 Subject: [PATCH 01/19] test=develop --- CMakeLists.txt | 1 + cmake/configure.cmake | 4 + cmake/lite.cmake | 33 ++- docs/demo_guides/intel_fpga.md | 107 ++++++++++ lite/api/CMakeLists.txt | 22 +- lite/backends/CMakeLists.txt | 1 + lite/backends/intelfpga/CMakeLists.txt | 23 +++ .../backends/intelfpga/lldrv/intelfpgadrv.cpp | 192 ++++++++++++++++++ lite/backends/intelfpga/lldrv/intelfpgadrv.h | 186 +++++++++++++++++ lite/backends/intelfpga/lldrv/utils.cpp | 72 +++++++ lite/backends/intelfpga/lldrv/utils.h | 33 +++ lite/backends/intelfpga/target_wrapper.cpp | 38 ++++ lite/backends/intelfpga/target_wrapper.h | 60 ++++++ lite/core/CMakeLists.txt | 1 + lite/core/context.h | 26 +++ lite/kernels/CMakeLists.txt | 1 + lite/kernels/intelfpga/CMakeLists.txt | 9 + lite/kernels/intelfpga/conv_compute.cc | 99 +++++++++ lite/kernels/intelfpga/conv_compute.h | 55 +++++ lite/kernels/intelfpga/conv_depthwise.cc | 128 ++++++++++++ lite/kernels/intelfpga/conv_depthwise.h | 67 ++++++ lite/kernels/intelfpga/conv_gemmlike.cc | 185 +++++++++++++++++ lite/kernels/intelfpga/conv_gemmlike.h | 112 ++++++++++ 23 files changed, 1447 insertions(+), 8 deletions(-) create mode 100644 docs/demo_guides/intel_fpga.md create mode 100644 lite/backends/intelfpga/CMakeLists.txt create mode 100644 lite/backends/intelfpga/lldrv/intelfpgadrv.cpp create mode 100644 lite/backends/intelfpga/lldrv/intelfpgadrv.h create mode 100644 lite/backends/intelfpga/lldrv/utils.cpp create mode 100644 lite/backends/intelfpga/lldrv/utils.h create mode 100644 lite/backends/intelfpga/target_wrapper.cpp create mode 100644 lite/backends/intelfpga/target_wrapper.h create mode 100755 lite/kernels/intelfpga/CMakeLists.txt create mode 100644 lite/kernels/intelfpga/conv_compute.cc create mode 100644 lite/kernels/intelfpga/conv_compute.h create mode 100644 lite/kernels/intelfpga/conv_depthwise.cc create mode 100644 
lite/kernels/intelfpga/conv_depthwise.h create mode 100644 lite/kernels/intelfpga/conv_gemmlike.cc create mode 100644 lite/kernels/intelfpga/conv_gemmlike.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c4e12f0f25..9dd5a87d7ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,7 @@ lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF) +lite_option(LITE_WITH_INTELFPGA "Enable IntelFPGA support in lite" OFF) lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF) lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF) lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index a8065c5b0dc..d1467704ac9 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -173,6 +173,10 @@ if (LITE_WITH_FPGA) add_definitions("-DLITE_WITH_FPGA") endif() +if (LITE_WITH_INTELFPGA) +add_definitions("-DLITE_WITH_INTELFPGA") +endif() + if (LITE_WITH_BM) add_definitions("-DLITE_WITH_BM") endif() diff --git a/cmake/lite.cmake b/cmake/lite.cmake index 2be01753cce..b5f115ca973 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -22,7 +22,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS IMAGINATION_NNA_DEPS APU_DEPS CV_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS IMAGINATION_NNA_DEPS APU_DEPS CV_DEPS ARGS) cmake_parse_arguments(lite_deps 
"${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -81,6 +81,12 @@ function (lite_deps TARGET) set(deps ${deps} ${var}) endforeach(var) endif() + + if (LITE_WITH_INTELFPGA) + foreach(var ${lite_deps_INTELFPGA_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() if (LITE_WITH_NPU) foreach(var ${lite_deps_NPU_DEPS}) @@ -155,7 +161,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -171,6 +177,7 @@ function(lite_cc_library TARGET) ARM_DEPS ${args_ARM_DEPS} CV_DEPS ${args_CV_DEPS} FPGA_DEPS ${args_FPGA_DEPS} + INTELFPGA_DEPS ${args_INTELFPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} @@ -207,7 +214,7 @@ function(lite_cc_binary TARGET) set(options " -g ") endif() set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -219,6 +226,7 @@ 
function(lite_cc_binary TARGET) CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} + INTELFPGA_DEPS ${args_INTELFPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} @@ -262,7 +270,7 @@ function(lite_cc_test TARGET) endif() set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS COMPILE_LEVEL # (basic|extra) @@ -282,6 +290,7 @@ function(lite_cc_test TARGET) CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} + INTELFPGA_DEPS ${args_INTELFPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} @@ -318,6 +327,7 @@ set(arm_kernels CACHE INTERNAL "arm kernels") set(x86_kernels CACHE INTERNAL "x86 kernels") set(cuda_kernels CACHE INTERNAL "cuda kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels") +set(intelfpga_kernels CACHE INTERNAL "intelfpga kernels") set(npu_kernels CACHE INTERNAL "npu kernels") set(apu_kernels CACHE INTERNAL "apu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") @@ -346,7 +356,7 @@ endif() function(add_kernel TARGET device level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) 
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -431,6 +441,15 @@ function(add_kernel TARGET device level) endif() set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "INTELFPGA") + if (NOT LITE_WITH_INTELFPGA) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() + set(intelfpga_kernels "${intelfpga_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "BM") if (NOT LITE_WITH_BM) foreach(src ${args_SRCS}) @@ -514,6 +533,7 @@ function(add_kernel TARGET device level) CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} + INTELFPGA_DEPS ${args_INTELFPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} @@ -540,7 +560,7 @@ endif() function(add_operator TARGET level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -572,6 +592,7 @@ function(add_operator TARGET level) CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} + INTELFPGA_DEPS ${args_INTELFPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md new file mode 100644 index 00000000000..b875ff80e5b --- /dev/null +++ b/docs/demo_guides/intel_fpga.md @@ -0,0 +1,107 @@ +# PaddleLite使用IntelFPGA预测部署 + +Paddle Lite支持基于arm的IntelFPGA 
c5的模型预测,提供armv7hf的交叉编译 + +PaddleLite通过调用底层驱动实现对FPGA硬件的调度,以及对应的API接口。 + +## Lite实现IntelFPGA简介 + +Lite支持IntelFPGA作为后端硬件进行模型推理,其主要特性如下: + +- Lite中IntelFPGA的kernel均以FP32、NCHW的格式作为输入输出格式 + +- 对于IntelFPGA暂不支持的kernel,均会切回ARM端运行,实现ARM+FPGA混合部署运行 + +## 支持芯片 +- [Cyclone V](https://www.intel.cn/content/dam/altera-www/global/en_US/pdfs/literature/hb/cyclone-v/cv_51002.pdf) + +### 已支持(或部分支持)的Paddle算子 + +- relu/relu6/leakyrelu +- conv2d +- depthwise_conv2d + +### 已支持的Paddle模型 + +- [SSD_MobileNet_V1](https://paddlemodels.bj.bcebos.com/object_detection/ssd_mobilenet_v1_coco_pretrained.tar) + +## 编译 + +需要提前准备带有IntelFPGAdrv.ko的IntelFPGA开发板C5MB/C5TB和Lite代码 + +CMAKE编译选项: + +- 设置`LITE_WITH_INTELFPGA=ON`和`LITE_WITH_ARM=ON` + +其他编译选项与ARM编译相同,可以参考[“Paddle Lite在Docker下的ARM编译”](../source_compile/compile_linux)。 + +示例如下: +```shell + cmake .. \ + -DWITH_GPU=OFF \ + -DWITH_MKL=OFF \ + -DWITH_LITE=ON \ + -DLITE_WITH_CUDA=OFF \ + -DLITE_WITH_X86=OFF \ + -DLITE_WITH_ARM=ON \ + -DLITE_WITH_OPENMP=ON \ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ + -DWITH_TESTING=OFF \ + -DLITE_WITH_INTELFPGA=ON \ + -DARM_TARGET_OS=armlinux + make publish_inference -j2 +``` +Lite提供FPGA编译脚本,位于lite/tools/build_intel_fpga.sh full_publish,在Lite根目录执行该脚本即可编译 + +## 运行示例 + +- **运行文件准备** + +下面以SSD模型为例,介绍如何使用C5MB/C5TB开发板实现模型运行 + +```bash +#打开串口调试工具,如Putty或SecureCRT,选择对应的调试串口,并设置串口属性, +#波特率:115200,数据位:8,停止位:1,奇偶校验:无[主机上执行] +#上电C5MB开发板,并在串口调试工具中登录 +awcloud login: root +Password: #密码:Awcloud@123 +#进入/opt目录[开发板执行] +cd /opt +#在运行模型前需要加载FPGA驱动[开发板执行] +insmod driver/IntelFPGAdrv.ko +``` + +- **使用IntelFPGA进行模型预测** + +```bash +#以下命令均在开发板上运行,在开发板上已经部署了对应的输入图片,模型,驱动程序,执行程序等 +#运行SSD测试程序,输入图片为/opt/images/dog.jpg,输出图片为/opt/dog_result.jpg +./run_ssd.sh +``` + +## 如何在Code中使用 + +在Lite中使用IntelFPGA与ARM相似,具体的区别如下: + +- 由于IntelFPGA运行模式为FP32精度、NCHW布局,所以需要修改相应的`valid_place` + +代码示例: +```cpp +lite::Predictor predictor; +std::vector<Place> valid_places( + {Place{TARGET(kIntelFPGA), PRECISION(kFloat), DATALAYOUT(kNCHW)},Place{TARGET(kARM)}}); +
+predictor.Build(model_dir, "", "", valid_places); + +auto* input_tensor = predictor.GetInput(0); +input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224}))); +auto* data = input_tensor->mutable_data<float>(); +auto item_size = input_tensor->dims().production(); +//假设设置输入数据全为1 +for (int i = 0; i < item_size; i++) { + data[i] = 1; +} + +predictor.Run(); +auto* out = predictor.GetOutput(0); +``` diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index ab71bd44b41..73a921b4b20 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -177,7 +177,10 @@ if(LITE_WITH_FPGA) set(light_api_deps ${light_api_deps} ${fpga_deps}) set(cxx_api_deps ${cxx_api_deps} ${fpga_deps}) endif() - +if(LITE_WITH_INTELFPGA) + set(light_api_deps ${light_api_deps} ${intelfpga_deps}) + set(cxx_api_deps ${cxx_api_deps} ${intelfpga_deps}) +endif() if(LITE_WITH_BM) set(light_api_deps ${light_api_deps} ${bm_deps}) set(cxx_api_deps ${cxx_api_deps} ${bm_deps}) @@ -209,6 +212,7 @@ list(LENGTH apu_kernels num_apu_kernels) list(LENGTH xpu_kernels num_xpu_kernels) list(LENGTH rknpu_kernels num_rknpu_kernels) list(LENGTH fpga_kernels num_fpga_kernels) +list(LENGTH intelfpga_kernels num_intelfpga_kernels) list(LENGTH bm_kernels num_bm_kernels) list(LENGTH mlu_kernels num_mlu_kernels) list(LENGTH huawei_ascend_npu_kernels num_huawei_ascend_npu_kernels) @@ -225,6 +229,7 @@ message(STATUS "Collected ${num_apu_kernels} APU kernels") message(STATUS "Collected ${num_xpu_kernels} XPU kernels") message(STATUS "Collected ${num_rknpu_kernels} RKNPU kernels") message(STATUS "Collected ${num_fpga_kernels} FPGA kernels") +message(STATUS "Collected ${num_intelfpga_kernels} INTELFPGA kernels") message(STATUS "Collected ${num_bm_kernels} BM kernels") message(STATUS "Collected ${num_mlu_kernels} MLU kernels") message(STATUS "Collected ${num_huawei_ascend_npu_kernels} HUAWEI_ASCEND_NPU kernels") @@ -249,6 +254,7 @@ if (NOT LITE_ON_TINY_PUBLISH) IMAGINATION_NNA_DEPS ${imagination_nna_kernels} CL_DEPS 
${opencl_kernels} FPGA_DEPS ${fpga_kernels} + INTELFPGA_DEPS ${intelfpga_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) endif() @@ -272,6 +278,7 @@ lite_cc_library(light_api SRCS light_api.cc RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} + INTELFPGA_DEPS ${intelfpga_kernels} BM_DEPS ${bm_kernels} IMAGINATION_NNA_DEPS ${imagination_nna_kernels} MLU_DEPS ${mlu_kernels} @@ -296,6 +303,7 @@ if(WITH_TESTING) RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} + INTELFPGA_DEPS ${intelfpga_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} IMAGINATION_NNA_DEPS ${imagination_nna_kernels} @@ -352,7 +360,7 @@ if(WITH_TESTING) endif() if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) - set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${apu_kernels} ${fpga_kernels}) + set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${apu_kernels} ${fpga_kernels} ${intelfpga_kernels}) lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc DEPS ${lite_model_test_DEPS} @@ -451,6 +459,7 @@ if (NOT LITE_ON_TINY_PUBLISH) APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} + INTELFPGA_DEPS ${intelfpga_kernels} IMAGINATION_NNA_DEPS ${imagination_nna_kernels} BM_DEPS ${bm_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) @@ -470,6 +479,7 @@ if(NOT WITH_COVERAGE) DEPS light_api program mir_passes paddle_api_light CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} + INTELFPGA_DEPS ${intelfpga_kernels} RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -480,6 +490,7 @@ if(NOT WITH_COVERAGE) X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} FPGA_DEPS ${fpga_kernels} + INTELFPGA_DEPS ${intelfpga_kernels} RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} @@ -524,6 +535,7 @@ if(NOT 
WITH_COVERAGE) CL_DEPS ${opencl_kernels} X86_DEPS ${x86_kernels} FPGA_DEPS ${fpga_kernels} + INTELFPGA_DEPS ${intelfpga_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} IMAGINATION_NNA_DEPS ${imagination_nna_kernels} @@ -549,6 +561,7 @@ if(NOT IOS) RKNPU_DEPS ${rknpu_kernels} IMAGINATION_NNA_DEPS ${imagination_nna_kernels} FPGA_DEPS ${fpga_kernels} + INTELFPGA_DEPS ${intelfpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) @@ -566,6 +579,7 @@ if(NOT IOS) RKNPU_DEPS ${rknpu_kernels} IMAGINATION_NNA_DEPS ${imagination_nna_kernels} FPGA_DEPS ${fpga_kernels} + INTELFPGA_DEPS ${intelfpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) @@ -583,6 +597,7 @@ if(NOT IOS) RKNPU_DEPS ${rknpu_kernels} IMAGINATION_NNA_DEPS ${imagination_nna_kernels} FPGA_DEPS ${fpga_kernels} + INTELFPGA_DEPS ${intelfpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) @@ -599,6 +614,7 @@ if(NOT IOS) APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} + INTELFPGA_DEPS ${intelfpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) @@ -617,6 +633,7 @@ if(NOT IOS) CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} + INTELFPGA_DEPS ${intelfpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -631,6 +648,7 @@ if(NOT IOS) APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} + INTELFPGA_DEPS ${intelfpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt index 0ebf133f1c5..848bf47fc3f 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -12,3 +12,4 @@ add_subdirectory(apu) add_subdirectory(rknpu) 
add_subdirectory(huawei_ascend_npu) add_subdirectory(imagination_nna) +add_subdirectory(intelfpga) diff --git a/lite/backends/intelfpga/CMakeLists.txt b/lite/backends/intelfpga/CMakeLists.txt new file mode 100644 index 00000000000..1ee8eccae05 --- /dev/null +++ b/lite/backends/intelfpga/CMakeLists.txt @@ -0,0 +1,23 @@ +if (NOT LITE_WITH_INTELFPGA) + return() +endif() + +# Build rules for the IntelFPGA backend: the low-level driver wrappers in +# lldrv/ plus the target wrapper in this directory. +set(LITE_INTELFPGA_PATH "${PADDLE_SOURCE_DIR}/lite/backends/intelfpga") +set(LITE_INTELFPGA_LLDRV_PATH "${LITE_INTELFPGA_PATH}/lldrv") + +# Glob with RELATIVE so each result is already a path relative to this +# directory (lldrv/foo.cpp, bar.cpp); no regex post-processing of the +# absolute paths is needed before handing them to cc_library(). +file(GLOB LLDRV_CPP RELATIVE "${LITE_INTELFPGA_PATH}" + "${LITE_INTELFPGA_LLDRV_PATH}/*.cpp") +file(GLOB INTELFPGA_CPP RELATIVE "${LITE_INTELFPGA_PATH}" + "${LITE_INTELFPGA_PATH}/*.cpp") +set(INTELFPGA_ALL_CPP ${LLDRV_CPP} ${INTELFPGA_CPP}) +message(STATUS "intelfpga src: ${INTELFPGA_ALL_CPP}") + +# NOTE(review): target_wrapper.cpp also matches the *.cpp glob above, so it +# is compiled into both libraries below -- confirm that is intended. +cc_library(kernel_intelfpga SRCS ${INTELFPGA_ALL_CPP}) +cc_library(intelfpga_target_wrapper SRCS target_wrapper.cpp DEPS kernel_intelfpga) diff --git a/lite/backends/intelfpga/lldrv/intelfpgadrv.cpp b/lite/backends/intelfpga/lldrv/intelfpgadrv.cpp new file mode 100644 index 00000000000..55e4bf92f0d --- /dev/null +++ b/lite/backends/intelfpga/lldrv/intelfpgadrv.cpp @@ -0,0 +1,192 @@ +/* Copyright (c) 2020 AWCloud. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lite/backends/intelfpga/lldrv/intelfpgadrv.h" + +namespace paddle { +namespace lite { +namespace intelfpga { + +/// FD of intelfpga +static int intelfpga_fd = -1; + +/// Memory blocks +static struct intelfpga_memblk_s mb, ms, mi, mk, mo; + +int intelfpga_open() { + if (intelfpga_fd < 0) { + intelfpga_fd = open("/dev/intelfpgadrv0", O_RDWR); + if (intelfpga_fd < 0) { + return -1; + } + memset(&mb, 0, sizeof(mb)); + memset(&ms, 0, sizeof(ms)); + memset(&mi, 0, sizeof(mi)); + memset(&mk, 0, sizeof(mk)); + memset(&mo, 0, sizeof(mo)); + } + + return 0; +} + +void intelfpga_close() { + if (intelfpga_fd < 0) return; + + // Release every cached memory block; free(NULL) is a no-op, so the + // per-pointer guards are unnecessary. + free(mb.addr); + free(ms.addr); + free(mi.addr); + free(mk.addr); + free(mo.addr); + // Zero the descriptors: intelfpga_mbias() and friends do not reopen the + // device, so a stale addr/size would otherwise be handed out again + // (use-after-free) or freed a second time (double free). + memset(&mb, 0, sizeof(mb)); + memset(&ms, 0, sizeof(ms)); + memset(&mi, 0, sizeof(mi)); + memset(&mk, 0, sizeof(mk)); + memset(&mo, 0, sizeof(mo)); + close(intelfpga_fd); + intelfpga_fd = -1; +} + +/// memory management; +void* intelfpga_malloc(size_t size) { return malloc(size); } + +void intelfpga_free(void* ptr) { free(ptr); } + +void* intelfpga_mbias(size_t size) { + if (mb.addr) { + if (mb.size >= size) { + return mb.addr; + } + free(mb.addr); + } + mb.addr = malloc(size); + if (mb.addr) { + mb.size = size; + } + return mb.addr; +} + +void* intelfpga_mscale(size_t size) { + if (ms.addr) { + if (ms.size >= size) { + return ms.addr; + } + free(ms.addr); + } + ms.addr = malloc(size); + if (ms.addr) { + ms.size = size; + } + + return ms.addr; +} + +void* intelfpga_minput(size_t size) { + if 
(mi.addr) { + if (mi.size >= size) { + return mi.addr; + } + free(mi.addr); + } + mi.addr = malloc(size); + if (mi.addr) { + mi.size = size; + } + + return mi.addr; +} + +void* intelfpga_mkernel(size_t size) { + if (mk.addr) { + if (mk.size >= size) { + return mk.addr; + } + free(mk.addr); + } + mk.addr = malloc(size); + if (mk.addr) { + mk.size = size; + } + + return mk.addr; +} + +void* intelfpga_moutput(size_t size) { + if (mo.addr) { + if (mo.size >= size) { + return mo.addr; + } + free(mo.addr); + } + mo.addr = malloc(size); + if (mo.addr) { + mo.size = size; + } + + return mo.addr; +} + +void intelfpga_copy(void* dst, void* src, int size) { memcpy(dst, src, size); } + +int intelfpga_info(struct intelfpga_info_s* args) { + int cmd = INTELFPGA_IOCTL_MAKE(INTELFPGA_CMD_INFO); + + if (intelfpga_open()) return -1; + + return ioctl(intelfpga_fd, cmd, args); +} + +int intelfpga_conv(struct intelfpga_conv_s* args) { + int cmd = INTELFPGA_IOCTL_MAKE(INTELFPGA_CMD_CONV); + + if (intelfpga_open()) return -1; + + return ioctl(intelfpga_fd, cmd, args); +} + +int intelfpga_pooling(struct intelfpga_pool_s* args) { + int cmd = INTELFPGA_IOCTL_MAKE(INTELFPGA_CMD_POOL); + + if (intelfpga_open()) return -1; + + return ioctl(intelfpga_fd, cmd, args); +} + +int intelfpga_fullconnect(struct intelfpga_fcon_s* args) { + int cmd = INTELFPGA_IOCTL_MAKE(INTELFPGA_CMD_FCON); + + if (intelfpga_open()) return -1; + + return ioctl(intelfpga_fd, cmd, args); +} + +} // namespace intelfpga +} // namespace lite +} // namespace paddle diff --git a/lite/backends/intelfpga/lldrv/intelfpgadrv.h b/lite/backends/intelfpga/lldrv/intelfpgadrv.h new file mode 100644 index 00000000000..f35c343e030 --- /dev/null +++ b/lite/backends/intelfpga/lldrv/intelfpgadrv.h @@ -0,0 +1,186 @@ +/* Copyright (c) 2020 AWCloud. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _LLDRV_INTELFPGA_H_ +#define _LLDRV_INTELFPGA_H_ + +#pragma once + +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace intelfpga { + +// Activation type +enum intelfpga_act_e { + ACT_NONE = 0, + ACT_RELU = 1, +}; + +// Device information +struct intelfpga_info_s { + uint32_t ver; // Version, 00.00.0000 +}; + +struct intelfpga_reset_s { + uint32_t val; // reset command, N/A +}; + +// Memory copy +struct intelfpga_mcopy_s { + void* src; // source address + void* dst; // destination address + size_t size; // size in bytes +}; + +// Memory block +struct intelfpga_memblk_s { + void* addr; // base address + size_t size; // size in bytes +}; + +// Kernel +struct intelfpga_kernel_s { + uint32_t kw; // width + uint32_t kh; // height + uint32_t ws; // width stride(s) + uint32_t hs; // height stride(s) +}; + +// Input parameters, nchw +struct intelfpga_input_s { + uint32_t in; // nbr of batch {1} + uint32_t ic; // nbr of channels {1} + uint32_t iw; // width + uint32_t ih; // height + uint32_t pl; // padding x in bytes {0} + uint32_t pr; // padding x in bytes {0} + uint32_t pt; // padding y in bytes {0} + uint32_t pb; // padding y in bytes {0} + uint32_t dx; // dilation for x {1} + uint32_t dy; // dilation for y {1} +}; + +// Output parameters, nchw +struct intelfpga_output_s { + uint32_t on; // nbr of batch {1} + uint32_t oc; // nbr of channels {1} + uint32_t ow; // width + uint32_t oh; // height +}; + +// Basic convolution +struct intelfpga_conv_s { + uint32_t at; // activation type {0}, None=0, RELU=1 + uint32_t ng; // nbr of 
groups {1} + int8_t* ia; // input address, INT8[N,Ci,Hi,Wi] + int8_t* ka; // kernel address, INT8[Co,Ci,Hk,Wk] + int32_t* ba; // bias address, INT32[Co,1] + int32_t* oa; // output address, INT32[N,Co,Ho,Wo] + struct intelfpga_input_s i; // input + struct intelfpga_kernel_s k; // kernel + struct intelfpga_output_s o; // output +}; + +// Pooling convolution +struct intelfpga_pool_s { + uint32_t gp : 1; // global pooling {0} + uint32_t pm : 1; // pooling mode {0}, Max=0, AVG=1 + uint32_t cm : 1; // ceil mode {0}, ceil=0, floor=1 + uint32_t ex : 1; // exclusive {1}, if ignore padding in avg pooling + uint32_t reserved : 28; // reserved {0} + int32_t* ia; // input address, INT32[N,Ci,Hi,Wi] + int32_t* oa; // output address, INT32[N,Ci,Ho,Wo] + struct intelfpga_input_s i; // input + struct intelfpga_kernel_s k; // kernel + struct intelfpga_output_s o; // output +}; + +// Full connection +struct intelfpga_fcon_s { + uint32_t at; // activation type {0}, None=0, RELU=1 + int8_t* ia; // input address, INT8[M,K] + int8_t* ka; // kernel address, INT8[K,N] + int32_t* ba; // bias address, INT32[M,N] + int32_t* oa; // output address, INT32[M,N] = ia[M,K] * ka[K,N] + ba[M,N] + int m, n, k; // dims +}; + +// Register access +struct intelfpga_creg_s { + uint32_t addr; + uint32_t data; +}; + +#define INTELFPGA_MAGIC_ID (('A' + 'L' + 'T' + 'R') / 4) + +/* Ioctls */ +#define INTELFPGA_IOCTL_MAKE(cmd) (_IO(INTELFPGA_MAGIC_ID, cmd)) +#define INTELFPGA_IOCTL_GET(cmd) (_IOC_NR(cmd)) +#define INTELFPGA_IOCTL_VALID(cmd) \ + ((_IOC_TYPE(cmd) == INTELFPGA_MAGIC_ID) ? 
1 : 0) + +#define INTELFPGA_CMD_INFO 0x00 // struct intelfpga_info_s +#define INTELFPGA_CMD_RESET 0x01 // struct intelfpga_reset_s + +#define INTELFPGA_CMD_MCOPY 0x10 // struct intelfpga_mcopy_s +#define INTELFPGA_CMD_INVAL 0x11 // struct intelfpga_cache_s +#define INTELFPGA_CMD_FLUSH 0x12 // struct intelfpga_cache_s + +#define INTELFPGA_CMD_CONV 0x20 // struct intelfpga_conv_s +#define INTELFPGA_CMD_POOL 0x21 // struct intelfpga_pool_s +#define INTELFPGA_CMD_FCON 0x22 // struct intelfpga_fcon_s + +#define INTELFPGA_CMD_REGRD 0xC0 // struct intelfpga_register_s +#define INTELFPGA_CMD_REGWR 0xC1 // struct intelfpga_register_s + +//--------------------------------------------------------------------------- + +// device open/close +int intelfpga_open(); +void intelfpga_close(); + +void intelfpga_reset(struct intelfpga_reset_s* args); + +// memory management +void* intelfpga_malloc(size_t size); +void intelfpga_free(void* ptr); + +void* intelfpga_mbias(size_t size); +void* intelfpga_mscale(size_t size); +void* intelfpga_minput(size_t size); +void* intelfpga_mkernel(size_t size); +void* intelfpga_moutput(size_t size); + +void intelfpga_copy(void* dst, void* src, int size); +int intelfpga_flush(void* addr, size_t size); +int intelfpga_invalidate(void* addr, size_t size); + +// device information +int intelfpga_info(struct intelfpga_info_s* args); + +// convolution process +int intelfpga_conv(struct intelfpga_conv_s* args); +int intelfpga_pooling(struct intelfpga_pool_s* args); +int intelfpga_fullconnect(struct intelfpga_fcon_s* args); + +} // namespace intelfpga +} // namespace lite +} // namespace paddle + +#endif // _LLDRV_INTELFPGA_H_ diff --git a/lite/backends/intelfpga/lldrv/utils.cpp b/lite/backends/intelfpga/lldrv/utils.cpp new file mode 100644 index 00000000000..0ad6fb9836d --- /dev/null +++ b/lite/backends/intelfpga/lldrv/utils.cpp @@ -0,0 +1,72 @@ +/* Copyright (c) 2020 AWCloud. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include + +#include "lite/backends/intelfpga/lldrv/utils.h" + +namespace paddle { +namespace lite { +namespace intelfpga { + +float find_max(const float* data, int size) { + float max = 0.0; + + for (size_t i = 0; i < size; ++i) { + float value = data[i]; + float abs = value > 0.0 ? value : -value; + + max = std::max(max, abs); + } + + return max; +} + +void quantize_s8(const float* src, int8_t* dst, int size, float factor) { + float fdata; + + for (size_t i = 0; i < size; i++) { + fdata = src[i] * factor; + + if (fdata < 0.0) { + fdata -= 0.5; + } else { + fdata += 0.5; + } + + dst[i] = (int8_t)fdata; + } +} + +void quantize_s32(const float* src, int32_t* dst, int size, float factor) { + float fdata; + + for (size_t i = 0; i < size; i++) { + fdata = src[i] * factor; + + if (fdata < 0.0) { + fdata -= 0.5; + } else { + fdata += 0.5; + } + + dst[i] = (int32_t)fdata; + } +} +} // namespace intelfpga +} // namespace lite +} // namespace paddle diff --git a/lite/backends/intelfpga/lldrv/utils.h b/lite/backends/intelfpga/lldrv/utils.h new file mode 100644 index 00000000000..d3883cc3e07 --- /dev/null +++ b/lite/backends/intelfpga/lldrv/utils.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2020 AWCloud. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace intelfpga { + +float find_max(const float* data, int size); + +void quantize_s8(const float* src, int8_t* dst, int size, float factor); +void quantize_s32(const float* src, int32_t* dst, int size, float factor); + +} // namespace intelfpga +} // namespace lite +} // namespace paddle diff --git a/lite/backends/intelfpga/target_wrapper.cpp b/lite/backends/intelfpga/target_wrapper.cpp new file mode 100644 index 00000000000..c2de3ff6bfb --- /dev/null +++ b/lite/backends/intelfpga/target_wrapper.cpp @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "lite/backends/intelfpga/target_wrapper.h"
+#include "lite/backends/intelfpga/lldrv/intelfpgadrv.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+
+// Allocates `size` bytes by forwarding to the low-level driver allocator.
+void* TargetWrapper::Malloc(size_t size) {
+  return intelfpga::intelfpga_malloc(size);
+}
+
+// Releases memory through the matching driver call.
+void TargetWrapper::Free(void* ptr) {
+  intelfpga::intelfpga_free(ptr);
+}
+
+// Copies `size` bytes from `src` to `dst` with a plain memcpy; `dir` is
+// accepted for interface compatibility and is not consulted.
+void TargetWrapper::MemcpySync(void* dst,
+                               const void* src,
+                               size_t size,
+                               IoDirection dir) {
+  memcpy(dst, src, size);
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/intelfpga/target_wrapper.h b/lite/backends/intelfpga/target_wrapper.h
new file mode 100644
index 00000000000..ee60348f10f
--- /dev/null
+++ b/lite/backends/intelfpga/target_wrapper.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include "lite/core/target_wrapper.h"
+
+namespace paddle {
+namespace lite {
+
+template <>
+class TargetWrapper {
+ public:
+  using stream_t = int;
+  using event_t = int;
+
+  static size_t num_devices() { return 0; }
+  static size_t maximum_stream() { return 0; }
+
+  static void CreateStream(stream_t* stream) {}
+  static void DestroyStream(const stream_t& stream) {}
+
+  static void CreateEvent(event_t* event) {}
+  static void DestroyEvent(const event_t& event) {}
+
+  static void RecordEvent(const event_t& event) {}
+  static void SyncEvent(const event_t& event) {}
+
+  static void StreamSync(const stream_t& stream) {}
+
+  static void* Malloc(size_t size);
+  static void Free(void* ptr);
+
+  static void MemcpySync(void* dst,
+                         const void* src,
+                         size_t size,
+                         IoDirection dir);
+  static void MemcpyAsync(void* dst,
+                          const void* src,
+                          size_t size,
+                          IoDirection dir,
+                          const stream_t& stream) {
+    MemcpySync(dst, src, size, dir);
+  }
+};
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt
index 9e03ca693b7..18ed6d7f9a8 100644
--- a/lite/core/CMakeLists.txt
+++ b/lite/core/CMakeLists.txt
@@ -8,6 +8,7 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
     XPU_DEPS target_wrapper_xpu
     CL_DEPS cl_target_wrapper
     FPGA_DEPS fpga_target_wrapper
+    INTELFPGA_DEPS intelfpga_target_wrapper
     BM_DEPS target_wrapper_bm
     MLU_DEPS target_wrapper_mlu)
 
diff --git a/lite/core/context.h b/lite/core/context.h
index dca559f06ae..e8789d16ea7 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -65,6 +65,7 @@ using MLUContext = Context;
 using RKNPUContext = Context;
 using HuaweiAscendNPUContext = Context;
 using ImaginationNNAContext = Context;
+using IntelFPGAContext = Context;
 
 template <>
 class Context {
@@ -327,6 +328,25 @@ class Context {
 };
 #endif
 
+#ifdef LITE_WITH_INTELFPGA
+// TODO(xbeu): add needed implementation to context
+// Placeholder context for the Intel FPGA target: it holds no state, so
+// initialisation, assignment and sharing are all no-ops.
+template <>
+class Context {
+ public:
+  void InitOnce() {}
+
+  // Must return *this: flowing off the end of a value-returning function is
+  // undefined behaviour, and callers may use the returned reference.
+  IntelFPGAContext& operator=(const IntelFPGAContext& ctx) { return *this; }
+
+  void CopySharedTo(IntelFPGAContext* ctx) {}
+
+  std::string name() const { return "IntelFPGAContext"; }
+};
+#endif
+
 #ifdef LITE_WITH_MLU
 template <>
 class Context {
@@ -547,6 +567,13 @@ class ContextScheduler {
             &ctx->As());
         break;
 #endif
+#ifdef LITE_WITH_INTELFPGA
+      case TARGET(kIntelFPGA):
+        kernel_contexts_[TargetType::kIntelFPGA]
+            .As()
+            .CopySharedTo(&ctx->As());
+        break;
+#endif
 #ifdef LITE_WITH_BM
       case TARGET(kBM):
         kernel_contexts_[TargetType::kBM].As().CopySharedTo(
@@ -602,6 +629,9 @@ class ContextScheduler {
 #ifdef LITE_WITH_FPGA
     InitContext();
 #endif
+#ifdef LITE_WITH_INTELFPGA
+    InitContext();
+#endif
 #ifdef LITE_WITH_NPU
     InitContext();
 #endif
diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt
index 79cce9a0243..52649cdc520 100644
--- a/lite/kernels/CMakeLists.txt
+++ b/lite/kernels/CMakeLists.txt
@@ -16,3 +16,4 @@ add_subdirectory(bm)
 add_subdirectory(rknpu)
 add_subdirectory(huawei_ascend_npu)
 add_subdirectory(imagination_nna)
+add_subdirectory(intelfpga)
diff --git a/lite/kernels/intelfpga/CMakeLists.txt b/lite/kernels/intelfpga/CMakeLists.txt
new file mode 100755
index 00000000000..4f2fbe6d5d2
--- /dev/null
+++ b/lite/kernels/intelfpga/CMakeLists.txt
@@ -0,0 +1,9 @@
+if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_INTELFPGA))
+  return()
+endif()
+
+set(intelfpga_deps intelfpga_target_wrapper kernel_intelfpga)
+
+add_kernel(conv_depthwise_intelfpga INTELFPGA basic SRCS conv_depthwise.cc DEPS ${intelfpga_deps})
+add_kernel(conv_gemmlike_intelfpga INTELFPGA basic SRCS conv_gemmlike.cc DEPS ${intelfpga_deps})
+add_kernel(conv_compute_intelfpga INTELFPGA basic SRCS conv_compute.cc DEPS ${intelfpga_deps} conv_depthwise_intelfpga conv_gemmlike_intelfpga)
diff --git a/lite/kernels/intelfpga/conv_compute.cc b/lite/kernels/intelfpga/conv_compute.cc
new file mode 100644
index 00000000000..e0c75367bd2
--- /dev/null
+++
b/lite/kernels/intelfpga/conv_compute.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/intelfpga/conv_compute.h"
+#include
+#include "lite/core/op_registry.h"
+#include "lite/core/type_system.h"
+#include "lite/kernels/intelfpga/conv_depthwise.h"
+#include "lite/kernels/intelfpga/conv_gemmlike.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace intelfpga {
+// PARAM_INIT declares, in the enclosing scope, `param` plus a set of locals
+// derived from it: filter/input/output dimensions, paddings, strides and the
+// boolean flags used to pick an implementation. Paddings are read as
+// [0]/[1] for the height axis and [2]/[3] for the width axis. Only some of the
+// locals are consumed by PrepareForRun below; the rest are unused there.
+#define PARAM_INIT                                                           \
+  auto& param = this->Param();                                               \
+  auto w_dims = param.filter->dims();                                        \
+  auto paddings = *param.paddings;                                           \
+  auto dilations = *param.dilations;                                         \
+  int ic = w_dims[1] * param.groups;                                         \
+  int oc = w_dims[0];                                                        \
+  int kh = w_dims[2];                                                        \
+  int kw = w_dims[3];                                                        \
+  int pad_h = paddings[0];                                                   \
+  int pad_w = paddings[2];                                                   \
+  int stride = param.strides[0];                                             \
+  int sh = param.strides[1];                                                 \
+  int sw = param.strides[0];                                                 \
+  int chin = param.x->dims()[1];                                             \
+  int hin = param.x->dims()[2];                                              \
+  int win = param.x->dims()[3];                                              \
+  int chout = param.output->dims()[1];                                       \
+  int hout = param.output->dims()[2];                                        \
+  int wout = param.output->dims()[3];                                        \
+  bool pads_equal =                                                          \
+      ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));        \
+  bool pads_all_equal = (pads_equal && pad_h == pad_w);                      \
+  bool ks_equal = (sw == sh) && (kw == kh);                                  \
+  bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1);             \
+  bool kps_equal = (pad_h == pad_w) && ks_equal;                             \
+  bool flag_dw_3x3 = (kw == 3) && (kh == 3) && (stride == 1 || stride == 2); \
+  bool flag_dw_5x5 = (kw == 5) && (kh == 5) && (stride == 1 || stride == 2); \
+  bool flag_dw = flag_dw_3x3 || flag_dw_5x5;
+
+// Chooses the concrete convolution implementation and prepares it.
+// DepthwiseConv is used when groups == input channels == output channels, the
+// kernel and strides are square, there is no dilation, and the kernel is 3x3
+// or 5x5 with stride 1 or 2; every other case uses GemmLikeConv. The chosen
+// object receives an ARM context (created here if none exists yet; ownership
+// is moved into the implementation) and a copy of the parameters.
+template <>
+void ConvCompute::PrepareForRun() {
+  PARAM_INIT
+  /// select conv impl
+  if (param.groups == ic && ic == oc && ks_equal && no_dilation && flag_dw) {
+    impl_ = new DepthwiseConv;
+    // VLOG(3) << "invoking dw conv";
+  } else {
+    impl_ = new GemmLikeConv;
+    // VLOG(3) << "invoking gemm like conv";
+  }
+  if (!arm_cxt_) {
+    arm_cxt_ = ContextScheduler::Global().NewContext(TargetType::kARM);
+  }
+  // arm_cxt_ is empty after this call; the implementation now owns the context.
+  impl_->SetContext(std::move(arm_cxt_));
+  impl_->SetParam(param);
+  impl_->PrepareForRun();
+  is_first_epoch_ = false;
+}
+
+}  // namespace intelfpga
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+typedef paddle::lite::kernels::intelfpga::ConvCompute
+    ConvFp32;
+
+// Both conv2d and depthwise_conv2d are registered for the kIntelFPGA target
+// with float precision and NCHW layout; every tensor is bound as an ARM-side
+// tensor.
+REGISTER_LITE_KERNEL(conv2d, kIntelFPGA, kFloat, kNCHW, ConvFp32, def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindPaddleOpVersion("conv2d", 1)
+    .Finalize();
+
+REGISTER_LITE_KERNEL(depthwise_conv2d, kIntelFPGA, kFloat, kNCHW, ConvFp32, def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindPaddleOpVersion("depthwise_conv2d", 1)
+    .Finalize();
diff --git a/lite/kernels/intelfpga/conv_compute.h b/lite/kernels/intelfpga/conv_compute.h
new file mode 100644
index 00000000000..a9fd135e431
--- /dev/null
+++ b/lite/kernels/intelfpga/conv_compute.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include
+#include "lite/backends/arm/math/funcs.h"
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace intelfpga {
+
+// Convolution kernel front end. It owns an implementation object (`impl_`)
+// and forwards ReInitWhenNeeded() and Run() to it; PrepareForRun() is declared
+// here and defined out of line.
+template
+class ConvCompute : public KernelLite {
+ public:
+  virtual void PrepareForRun();
+
+  // Forwards to the implementation; aborts via CHECK if none has been set.
+  virtual void ReInitWhenNeeded() {
+    CHECK(impl_);
+    impl_->ReInitWhenNeeded();
+  }
+
+  // Forwards to the implementation; aborts via CHECK if none has been set.
+  virtual void Run() {
+    CHECK(impl_);
+    impl_->Run();
+  }
+
+  // impl_ is a raw owning pointer, so it is released here.
+  ~ConvCompute() {
+    if (impl_ != nullptr) {
+      delete impl_;
+    }
+  }
+
+ private:
+  using param_t = operators::ConvParam;
+  std::unique_ptr arm_cxt_{nullptr};
+  KernelLite* impl_{nullptr};
+};
+
+}  // namespace intelfpga
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/intelfpga/conv_depthwise.cc b/lite/kernels/intelfpga/conv_depthwise.cc
new file mode 100644
index 00000000000..80cab07e848
--- /dev/null
+++ b/lite/kernels/intelfpga/conv_depthwise.cc
@@ -0,0 +1,128 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/intelfpga/conv_depthwise.h"
+#include "lite/backends/arm/math/conv_block_utils.h"
+#include "lite/backends/arm/math/conv_impl.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace intelfpga {
+
+// Nothing is re-initialised when shapes change.
+template <>
+void DepthwiseConv::ReInitWhenNeeded() {}
+
+// Picks the ARM depthwise routine from the kernel width (3 or 5) and, where
+// required, repacks the filter into `weights_`. Any other kernel width, and a
+// 5x5 kernel whose strides are not both 1 or both 2, ends in LOG(FATAL).
+// Note: ctx, channel, hin and win are computed but not used in this function.
+template <>
+void DepthwiseConv::PrepareForRun() {
+  auto& param = this->Param();
+  CHECK(this->ctx_);
+  auto& ctx = this->ctx_->template As();
+  auto w_dims = param.filter->dims();
+  auto kw = w_dims[3];
+  auto channel = w_dims[0];
+  auto hin = param.x->dims()[2];
+  auto win = param.x->dims()[3];
+  auto paddings = *param.paddings;
+  // select dw conv kernel
+  if (kw == 3) {
+    // With paddings[1] and paddings[3] below 2 and paddings[0] == paddings[2]
+    // being 0 or 1, the original filter layout is used as is; otherwise the
+    // filter is repacked.
+    bool pads_less = ((paddings[1] < 2) && (paddings[3] < 2));
+    if (pads_less && paddings[0] == paddings[2] &&
+        (paddings[0] == 0 || paddings[0] == 1)) {
+      flag_trans_weights_ = false;
+    } else {
+      // trans weights
+      constexpr int cblock = 4;
+      auto oc = w_dims[0];
+      auto kh = w_dims[2];
+      // Output channels are rounded up to a multiple of cblock.
+      auto cround = ROUNDUP(oc, cblock);
+      weights_.Resize({cround, 1, kh, kw});
+      auto w_data = weights_.mutable_data();
+      auto w_data_in = param.filter->data();
+      lite::arm::math::conv_trans_weights_numc(
+          w_data_in, w_data, oc, 1, cblock, kh * kw);
+      flag_trans_weights_ = true;
+    }
+    impl_ = lite::arm::math::conv_depthwise_3x3_fp32;
+  } else if (kw == 5) {
+    auto strides = param.strides;
+    if ((strides[0] == 1 && strides[1] == 1) ||
+        (strides[0] == 2 && strides[1] == 2)) {
+      // trans weights
+      constexpr int cblock = 4;
+      auto oc = w_dims[0];
+      auto kh = w_dims[2];
+      auto cround = ROUNDUP(oc, cblock);
+      weights_.Resize({cround, 1, kh, kw});
+      auto w_data = weights_.mutable_data();
+      auto w_data_in = param.filter->data();
+      lite::arm::math::conv_trans_weights_numc(
+          w_data_in, w_data, oc, 1, cblock, kh * kw);
+      flag_trans_weights_ = true;
+      impl_ = lite::arm::math::conv_depthwise_5x5_fp32;
+    } else {
+      LOG(FATAL)
+          << "5x5 depthwise conv only support stride == 1 or stride == 2";
+    }
+  } else {
+    LOG(FATAL) << "this type dw conv not impl";
+  }
+}
+
+// Runs the routine selected in PrepareForRun on the NCHW input, using the
+// repacked weights when flag_trans_weights_ is set and `bias_` instead of the
+// parameter bias when flag_trans_bias_ is set.
+template <>
+void DepthwiseConv::Run() {
+  auto& param = this->Param();
+  CHECK(this->ctx_);
+  auto& ctx = this->ctx_->template As();
+  const auto* i_data = param.x->data();
+  const auto* w_data = flag_trans_weights_ ? weights_.data()
+                                           : param.filter->data();
+  const auto* b_data = param.bias ? param.bias->data() : nullptr;
+  if (flag_trans_bias_) {
+    b_data = bias_.data();
+  }
+  auto* o_data = param.output->mutable_data();
+
+  auto x_dims = param.x->dims();
+  auto w_dims = param.filter->dims();
+  auto o_dims = param.output->dims();
+
+  int iw = x_dims[3];  // nchw
+  int ih = x_dims[2];
+  int ic = x_dims[1];
+  int bs = x_dims[0];
+  int oh = o_dims[2];
+  int ow = o_dims[3];
+  int oc = o_dims[1];
+
+  // NOTE(review): w_scale_ is never filled in the code visible here, so its
+  // data() is presumably ignored by the fp32 routines - confirm.
+  impl_(i_data,
+        o_data,
+        bs,
+        oc,
+        oh,
+        ow,
+        ic,
+        ih,
+        iw,
+        w_data,
+        b_data,
+        param,
+        &ctx,
+        w_scale_.data());
+}
+
+}  // namespace intelfpga
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/intelfpga/conv_depthwise.h b/lite/kernels/intelfpga/conv_depthwise.h
new file mode 100644
index 00000000000..3f9bf657e02
--- /dev/null
+++ b/lite/kernels/intelfpga/conv_depthwise.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+#include
+#include "lite/backends/arm/math/conv_impl.h"
+#include "lite/core/context.h"
+#include "lite/core/kernel.h"
+#include "lite/core/target_wrapper.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace intelfpga {
+
+// Depthwise convolution kernel. The three virtual methods are declared here
+// and defined out of line; the class stores a pointer to the routine that
+// performs the computation plus the buffers that routine is called with.
+template
+class DepthwiseConv : public KernelLite {
+ public:
+  // Signature of the depthwise routine stored in impl_: input and output
+  // buffers, batch count, output then input channel/height/width, weights,
+  // bias, the convolution parameters, the ARM context and a scale array.
+  typedef void (*conv_dw_impl)(const void* din,
+                               void* dout,
+                               int num,
+                               int ch_out,
+                               int h_out,
+                               int w_out,
+                               int ch_in,
+                               int h_in,
+                               int w_in,
+                               const void* weights,
+                               const float* bias,
+                               const operators::ConvParam& param,
+                               ARMContext* ctx,
+                               const float* scale);
+  DepthwiseConv() = default;
+  ~DepthwiseConv() {}
+  virtual void PrepareForRun();
+  virtual void ReInitWhenNeeded();
+  virtual void Run();
+
+ private:
+  using param_t = operators::ConvParam;
+  Tensor weights_;  // repacked filter storage
+  Tensor bias_;
+  DDim last_shape_;
+  bool flag_trans_weights_{false};  // true when weights_ holds repacked data
+  bool flag_trans_bias_{false};
+  conv_dw_impl impl_{nullptr};  // routine to call; null until one is chosen
+  std::vector w_scale_;
+};
+
+}  // namespace intelfpga
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/intelfpga/conv_gemmlike.cc b/lite/kernels/intelfpga/conv_gemmlike.cc
new file mode 100644
index 00000000000..2131d2c032f
--- /dev/null
+++ b/lite/kernels/intelfpga/conv_gemmlike.cc
@@ -0,0 +1,185 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/intelfpga/conv_gemmlike.h"
+#include
+#include "lite/backends/arm/math/gemm_prepacked_int8.h"
+#include "lite/backends/arm/math/packed_sgemm.h"
+#include "lite/backends/intelfpga/lldrv/intelfpgadrv.h"
+#include "lite/backends/intelfpga/lldrv/utils.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace intelfpga {
+
+// All preparation is shape dependent, so it is delegated to
+// ReInitWhenNeeded().
+template <>
+void GemmLikeConv::PrepareForRun() {
+  ReInitWhenNeeded();
+}
+
+// Computes the convolution.
+//  * Kernels taller and wider than 1 go to the FPGA: input and filter are
+//    quantized to int8 with per-tensor scales 127 / max|x|, the driver runs
+//    the convolution, and the int32 result is scaled back to float before the
+//    bias and activation are applied on the CPU.
+//  * All other kernels use the ARM 1x1 GEMM or im2col GEMM routines.
+template <>
+void GemmLikeConv::Run() {
+  auto& param = this->Param();
+  auto& ctx = this->ctx_->template As();
+  ctx.ExtendWorkspace(workspace_size_);
+  auto weights = param.filter->data();
+  if (flag_trans_weights_) {
+    weights = weights_.data();
+  }
+  const float* b_data = param.bias ? param.bias->data() : nullptr;
+  if (flag_trans_bias_) {
+    b_data = bias_.data();
+  }
+  auto i_data = param.x->data();
+  // The FPGA path quantizes the original filter, not the repacked one.
+  auto w_data = param.filter->data();
+  auto o_data = param.output->mutable_data();
+  auto i_dims = param.x->dims();
+  auto w_dims = param.filter->dims();
+  auto o_dims = param.output->dims();
+  auto paddings = *param.paddings;
+  auto dilations = *param.dilations;
+
+  int iw, ih, ic, bs, ow, oh, oc;
+  float alpha = 0.0f;  // only meaningful for leaky relu; never left unset
+
+  iw = i_dims[3];  // nchw
+  ih = i_dims[2];
+  ic = i_dims[1];
+  bs = i_dims[0];
+  oh = o_dims[2];
+  ow = o_dims[3];
+  oc = o_dims[1];
+
+  int kh = w_dims[2];
+  int kw = w_dims[3];
+
+  if (kh > 1 && kw > 1) {
+    int i, j, il, kl, ol, l, m, n, k;
+    lite::intelfpga::intelfpga_conv_s conv;
+
+    conv.at = static_cast(param.activation_param.active_type);
+    if (conv.at == 4) {
+      alpha = param.activation_param.Leaky_relu_alpha;
+    }
+    conv.ng = param.groups;
+
+    conv.i.in = i_dims[0];
+    conv.i.ic = i_dims[1];
+    conv.i.ih = i_dims[2];
+    conv.i.iw = i_dims[3];
+    conv.i.pl = paddings[2];  // left
+    conv.i.pr = paddings[3];  // right
+    conv.i.pt = paddings[0];  // top
+    conv.i.pb = paddings[1];  // bottom
+    conv.i.dy = dilations[0];
+    conv.i.dx = dilations[1];
+
+    conv.k.kh = w_dims[2];
+    conv.k.kw = w_dims[3];
+    conv.k.hs = param.strides[0];
+    conv.k.ws = param.strides[1];
+
+    conv.o.on = o_dims[0];
+    conv.o.oc = o_dims[1];
+    conv.o.oh = o_dims[2];
+    conv.o.ow = o_dims[3];
+
+    il = conv.i.in * conv.i.ic * conv.i.ih * conv.i.iw;
+    // Element count of the filter tensor itself. Its second dimension is
+    // already divided by the group count, so multiplying by the full input
+    // channel count would read past the end of the filter when groups > 1.
+    kl = w_dims[0] * w_dims[1] * w_dims[2] * w_dims[3];
+    ol = conv.o.on * conv.o.oc * conv.o.oh * conv.o.ow;
+    conv.ia = static_cast(
+        lite::intelfpga::intelfpga_minput(il * sizeof(int8_t)));
+    conv.ka = static_cast(
+        lite::intelfpga::intelfpga_mkernel(kl * sizeof(int8_t)));
+    conv.oa = static_cast(
+        lite::intelfpga::intelfpga_moutput(ol * sizeof(int32_t)));
+    if (!(conv.ia && conv.ka && conv.oa)) {
+      // Previously this case was skipped without any diagnostic.
+      LOG(ERROR) << "intelfpga_conv: failed to obtain device buffers";
+      return;
+    }
+
+    float fd = lite::intelfpga::find_max(i_data, il);
+    float fw = lite::intelfpga::find_max(w_data, kl);
+
+    // y = 127.0 / fmax
+    // An all-zero tensor has max 0; any finite scale quantizes it to zeros,
+    // whereas 127 / 0 would yield inf and then NaN products.
+    fd = fd > 0.0f ? 127.0f / fd : 1.0f;
+    fw = fw > 0.0f ? 127.0f / fw : 1.0f;
+
+    // y = x * scale;
+    lite::intelfpga::quantize_s8(i_data, conv.ia, il, fd);
+    lite::intelfpga::quantize_s8(w_data, conv.ka, kl, fw);
+
+    // perform conv2d
+    if (lite::intelfpga::intelfpga_conv(&conv)) {
+      // The output buffer holds no valid result; do not dequantize it.
+      LOG(ERROR) << "intelfpga_conv error";
+      return;
+    }
+    // Convert int32 back to fp32, [n,c,h,w]
+    // 1. y = x / scale
+    // 2. y = x + b
+    // 3. y = f(x)
+    int hw = conv.o.oh * conv.o.ow;
+    for (i = 0; i < conv.o.on; i++) {
+      for (j = 0; j < conv.o.oc; j++) {
+        m = i * conv.o.oc + j;
+        n = m * hw;
+        for (l = 0; l < hw; l++) {
+          k = n + l;
+          o_data[k] = static_cast(conv.oa[k] / fd / fw);
+          if (b_data) o_data[k] += b_data[j];
+          if (conv.at == 1) {  // relu
+            o_data[k] = o_data[k] > 0.0 ? o_data[k] : 0.0;
+          } else if (conv.at == 2) {  // relu6
+            o_data[k] = o_data[k] > 0.0 ? o_data[k] : 0.0;
+            o_data[k] = o_data[k] > 6.0 ? 6.0 : o_data[k];
+          } else if (conv.at == 4) {  // leakyRelu
+            if (o_data[k] < 0.0) o_data[k] = o_data[k] * alpha;
+          }
+        }
+      }
+    }
+  } else {
+    if (flag_1x1gemm_) {
+      lite::arm::math::conv1x1s1_gemm(i_data,
+                                      o_data,
+                                      bs,
+                                      oc,
+                                      oh,
+                                      ow,
+                                      ic,
+                                      ih,
+                                      iw,
+                                      weights,
+                                      b_data,
+                                      param,
+                                      &ctx);
+    } else {
+      lite::arm::math::conv_im2col_gemm(i_data,
+                                        o_data,
+                                        bs,
+                                        oc,
+                                        oh,
+                                        ow,
+                                        ic,
+                                        ih,
+                                        iw,
+                                        weights,
+                                        b_data,
+                                        param,
+                                        &ctx);
+    }
+  }
+}
+
+}  // namespace intelfpga
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/intelfpga/conv_gemmlike.h b/lite/kernels/intelfpga/conv_gemmlike.h
new file mode 100644
index 00000000000..812271010c7
--- /dev/null
+++ b/lite/kernels/intelfpga/conv_gemmlike.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+#include
+#include "lite/backends/arm/math/conv_impl.h"
+#include "lite/backends/arm/math/funcs.h"
+#include "lite/core/context.h"
+#include "lite/core/kernel.h"
+#include "lite/core/target_wrapper.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace intelfpga {
+
+// GEMM-style convolution kernel. ReInitWhenNeeded() caches shape-dependent
+// decisions (1x1 GEMM versus im2col, workspace size, repacked weights);
+// PrepareForRun() and Run() are defined out of line.
+template
+class GemmLikeConv : public KernelLite {
+ public:
+  GemmLikeConv() = default;
+  ~GemmLikeConv() {}
+
+  // Recomputes the cached decisions when the input shape differs from the one
+  // seen on the previous call; returns immediately otherwise.
+  // Note: ih, iw, dw, dh and ks_equal are computed but not used below.
+  virtual void ReInitWhenNeeded() {
+    auto& param = this->template Param();
+    CHECK(this->ctx_);
+    auto& ctx = this->ctx_->template As();
+    auto x_dims = param.x->dims();
+    auto w_dims = param.filter->dims();
+    auto o_dims = param.output->dims();
+    if (last_shape_ == x_dims) {
+      return;
+    }
+
+    int iw = x_dims[3];  // nchw
+    int ih = x_dims[2];
+    int ic = x_dims[1];
+    int ow = o_dims[3];
+    int oh = o_dims[2];
+    int oc = o_dims[1];
+    int kw = w_dims[3];
+    int kh = w_dims[2];
+
+    auto paddings = *param.paddings;
+    auto dilations = *param.dilations;
+
+    int sw = param.strides[1];
+    int sh = param.strides[0];
+    int pw = paddings[2];
+    int ph = paddings[0];
+    int dw = dilations[1];
+    int dh = dilations[0];
+
+    bool pads_equal =
+        ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
+
+    // Per-group GEMM dimensions: m rows, k inner dimension, n columns.
+    int m = oc / param.groups;
+    int k = ic * kh * kw / param.groups;
+    int n = oh * ow;
+
+    bool kps_equal = (pw == ph) && (sw == sh) && (kw == kh);
+    bool ks_equal = (sw == sh) && (kw == kh);
+    //! select conv gemmlike kernel
+    if (kw == 1 && sw == 1 && pw == 0 && kps_equal && pads_equal) {
+      //! 1x1s1p0 gemmlike conv
+      flag_1x1gemm_ = true;
+    } else {
+      //! im2col gemmlike conv
+      flag_1x1gemm_ = false;
+      // im2col needs a k x n float scratch buffer.
+      workspace_size_ = k * n * sizeof(float);
+    }
+    // Weights are repacked once, and only when both m and n exceed 1.
+    if (!flag_trans_weights_ && n > 1 && m > 1) {
+      lite::arm::math::trans_gemm_weights(
+          *(param.filter), weights_, param.groups, &ctx);
+      flag_trans_weights_ = true;
+    } else if (n == 1 || m == 1) {
+      flag_trans_weights_ = false;
+    }
+    last_shape_ = x_dims;
+  }
+  virtual void PrepareForRun();
+  virtual void Run();
+
+ protected:
+  using param_t = operators::ConvParam;
+  DDim last_shape_;  // input shape seen by the last ReInitWhenNeeded()
+  std::vector w_scale_;
+  bool flag_1x1gemm_{true};
+  bool flag_trans_weights_{false};
+  bool flag_trans_bias_{false};
+  Tensor weights_;  // repacked filter, valid when flag_trans_weights_ is set
+  Tensor bias_;
+  int workspace_size_{0};  // bytes requested from the context in Run()
+};
+
+}  // namespace intelfpga
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
From 99756c67660e5dbfefe3ac8f44999e6ea83a4e99 Mon Sep 17 00:00:00 2001
From: xbeu
Date: Wed, 17 Mar 2021 08:04:49 +0000
Subject: [PATCH 02/19] test=develop

---
 lite/tools/build_intel_fpga.sh | 324 +++++++++++++++++++++++++++++++++
 1 file changed, 324 insertions(+)
 create mode 100755 lite/tools/build_intel_fpga.sh

diff --git a/lite/tools/build_intel_fpga.sh b/lite/tools/build_intel_fpga.sh
new file mode 100755
index 00000000000..ef647df315c
--- /dev/null
+++ b/lite/tools/build_intel_fpga.sh
@@ -0,0 +1,324 @@
+#!/bin/bash
+set -e
+
+#####################################################################################################
+# 1. global variables, you can change them according to your requirements
+#####################################################################################################
+# target architecture; this script defaults to armv7hf.
+ARCH=armv7hf
+# gcc or clang, default gcc.
+TOOLCHAIN=gcc
+# ON or OFF; this script defaults to ON.
+WITH_EXTRA=ON
+# controls whether to compile python lib, default is OFF.
+WITH_PYTHON=OFF
+PY_VERSION=""
+# controls whether to compile cv functions into lib, default is OFF.
+WITH_CV=OFF
+# controls whether to print log information; this script defaults to OFF.
+WITH_LOG=OFF
+# controls whether to throw the exception when error occurs, default is OFF
+WITH_EXCEPTION=OFF
+# options of stripping lib according to input model.
+WITH_STRIP=OFF
+OPTMODEL_DIR=""
+# options of compiling OPENCL lib.
+WITH_OPENCL=OFF
+# options of compiling intel fpga.
+WITH_INTELFPGA=ON
+# options of adding training ops
+WITH_TRAIN=OFF
+# number of threads used during compiling (override with LITE_BUILD_THREADS).
+readonly NUM_PROC=${LITE_BUILD_THREADS:-4}
+#####################################################################################################
+
+#####################################################################################################
+# 2. local variables, these variables should not be changed.
+#####################################################################################################
+# url that stores third-party zip file to accelerate third-party lib installation
+readonly THIRDPARTY_TAR=https://paddlelite-data.bj.bcebos.com/third_party_libs/third-party-ea5576.tar.gz
+# absolute path of Paddle-Lite.
+# Resolved from the script's own location: prefixing $PWD, as before, produced
+# a non-existent path whenever the script was invoked through an absolute path.
+readonly workspace=$(cd "$(dirname "$0")/../.." && pwd)
+# basic options for linux compiling.
+readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \
+                               -DLITE_WITH_ARM=ON \
+                               -DLITE_WITH_X86=OFF \
+                               -DARM_TARGET_OS=armlinux \
+                               -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
+                               -DWITH_TESTING=OFF"
+# mutable options for linux compiling.
+# Builds the cmake option string from the current values of the global
+# switches; call it after the command line has been parsed.
+function init_cmake_mutable_options {
+    cmake_mutable_options="-DARM_TARGET_ARCH_ABI=$ARCH \
+                        -DARM_TARGET_LANG=$TOOLCHAIN \
+                        -DLITE_BUILD_EXTRA=$WITH_EXTRA \
+                        -DLITE_WITH_PYTHON=$WITH_PYTHON \
+                        -DPY_VERSION=$PY_VERSION \
+                        -DLITE_WITH_CV=$WITH_CV \
+                        -DLITE_WITH_LOG=$WITH_LOG \
+                        -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \
+                        -DLITE_BUILD_TAILOR=$WITH_STRIP \
+                        -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \
+                        -DLITE_WITH_OPENCL=$WITH_OPENCL \
+                        -DLITE_WITH_INTELFPGA=$WITH_INTELFPGA \
+                        -DLITE_WITH_TRAIN=$WITH_TRAIN"
+}
+#####################################################################################################
+
+####################################################################################################
+# 3. functions of prepare workspace before compiling
+####################################################################################################
+
+# 3.1 generate `__generated_code__.cc`, which some cmake targets depend on.
+# here we fake an empty file to make cmake work.
+#   $1: source root   $2: build directory
+function prepare_workspace {
+    local root_dir=$1
+    local build_dir=$2
+    # in build directory
+    # 1. Prepare gen_code file
+    GEN_CODE_PATH_PREFIX=$build_dir/lite/gen_code
+    mkdir -p "${GEN_CODE_PATH_PREFIX}"
+    touch "${GEN_CODE_PATH_PREFIX}/__generated_code__.cc"
+    # 2.Prepare debug tool
+    DEBUG_TOOL_PATH_PREFIX=$build_dir/lite/tools/debug
+    mkdir -p "${DEBUG_TOOL_PATH_PREFIX}"
+    cp "$root_dir/lite/tools/debug/analysis_tool.py" "${DEBUG_TOOL_PATH_PREFIX}/"
+}
+
+# 3.2 prepare source code of opencl lib
+# here we bundle all cl files into a cc file to bundle all opencl kernels into a single lib
+#   $1: source root   $2: build directory (unused)
+function prepare_opencl_source_code {
+    local root_dir=$1
+    local build_dir=$2
+    # Prepare opencl_kernels_source.cc file
+    GEN_CODE_PATH_OPENCL=$root_dir/lite/backends/opencl
+    # The '$' was missing here, so the stale generated file was never removed.
+    rm -f "$GEN_CODE_PATH_OPENCL/opencl_kernels_source.cc"
+    OPENCL_KERNELS_PATH=$root_dir/lite/backends/opencl/cl_kernel
+    mkdir -p "${GEN_CODE_PATH_OPENCL}"
+    touch "$GEN_CODE_PATH_OPENCL/opencl_kernels_source.cc"
+    python "$root_dir/lite/tools/cmake_tools/gen_opencl_code.py" "$OPENCL_KERNELS_PATH" "$GEN_CODE_PATH_OPENCL/opencl_kernels_source.cc"
+}
+
+# 3.3 prepare third_party libraries for compiling
+# here we store third_party libraries into Paddle-Lite/third-party
+# Everything operates on $workspace explicitly: the tarball is tested for
+# there, so it must also be downloaded and unpacked there regardless of the
+# directory the script was started from.
+function prepare_thirdparty {
+    local tarball=$workspace/third-party-ea5576.tar.gz
+    if [ ! -d "$workspace/third-party" -o -f "$tarball" ]; then
+        rm -rf "$workspace/third-party"
+        if [ ! -f "$tarball" ]; then
+            wget -O "$tarball" "$THIRDPARTY_TAR"
+        fi
+        tar xzf "$tarball" -C "$workspace"
+    else
+        (cd "$workspace" && git submodule update --init --recursive)
+    fi
+}
+####################################################################################################
+
+####################################################################################################
+# 4.
compiling functions
+####################################################################################################
+
+# 4.1 function of tiny_publish compiling
+# here we only compile light_api lib
+#   $1: ON (default) for tiny_publish, OFF for full_publish
+function make_tiny_publish_so {
+    is_tiny=${1:-ON}
+    if [ "$WITH_PYTHON" = "ON" -a "$is_tiny" = "ON" ]; then
+        echo "Warning: build full_publish to use python."
+        is_tiny=OFF
+    fi
+    if [ "$WITH_TRAIN" = "ON" -a "$is_tiny" = "ON" ]; then
+        echo "Warning: build full_publish to add training ops."
+        is_tiny=OFF
+    fi
+    # main stores --with_strip in BUILD_TAILOR while the cmake options are built
+    # from WITH_STRIP; accept either so that the flag actually reaches cmake.
+    if [ "$BUILD_TAILOR" = "ON" ]; then
+        WITH_STRIP=ON
+    fi
+    if [ "$WITH_STRIP" = "ON" -a -z "$OPTMODEL_DIR" ]; then
+        echo "Error: set OPTMODEL_DIR if BUILD_TAILOR is ON."
+        # Stop here instead of configuring a build that cannot be tailored.
+        exit 1
+    fi
+
+    if [ "$is_tiny" = "OFF" ]; then
+        prepare_thirdparty
+    fi
+
+    build_dir=$workspace/build.lite.linux.$ARCH.$TOOLCHAIN
+    if [ "${WITH_OPENCL}" = "ON" ]; then
+        build_dir=${build_dir}.opencl
+    fi
+
+    # always start from an empty build directory
+    if [ -d "$build_dir" ]; then
+        rm -rf "$build_dir"
+    fi
+    mkdir -p "$build_dir"
+    cd "$build_dir"
+
+    prepare_workspace "$workspace" "$build_dir"
+
+    if [ "${WITH_OPENCL}" = "ON" ]; then
+        prepare_opencl_source_code "$workspace" "$build_dir"
+    fi
+    # a stripped library is built with the extra operators enabled
+    if [ "${WITH_STRIP}" == "ON" ]; then
+        WITH_EXTRA=ON
+    fi
+
+    init_cmake_mutable_options
+    # the option strings are expanded unquoted on purpose: each holds several
+    # -D flags that must be split into separate arguments
+    cmake "$workspace" \
+        ${CMAKE_COMMON_OPTIONS} \
+        ${cmake_mutable_options} \
+        -DLITE_ON_TINY_PUBLISH=$is_tiny
+
+    if [ "${WITH_OPENCL}" = "ON" ]; then
+        make opencl_clhpp -j$NUM_PROC
+    fi
+
+    make publish_inference -j$NUM_PROC
+    cd - > /dev/null
+}
+####################################################################################################
+
+# 4.2 function of full_publish compiling
+# here we compile both light_api lib and full_api lib
+function make_full_publish_so {
+    make_tiny_publish_so OFF
+}
+####################################################################################################
+
+# Prints the options this script actually parses, with this script's defaults.
+function print_usage {
+    cat <<EOF
+--------------------------------------------------------------------------------
+Methods of compiling the Paddle-Lite Intel FPGA (armlinux) library:
+--------------------------------------------------------------------------------
+compile the light_api library with the defaults (armv7hf, gcc):
+    ./lite/tools/build_intel_fpga.sh
+compile both the light_api and cxx_api libraries:
+    ./lite/tools/build_intel_fpga.sh full_publish
+print help information:
+    ./lite/tools/build_intel_fpga.sh help
+
+optional arguments (options are read left to right, so put them before
+full_publish):
+  --arch:            target architecture, default is armv7hf
+  --toolchain:       (gcc|clang), default is gcc
+  --with_extra:      (OFF|ON); controls whether to publish extra operators and
+                     kernels (sequence-related models such as OCR or NLP),
+                     default is ON
+  --with_python:     (OFF|ON); controls whether to build python lib or whl,
+                     default is OFF
+  --python_version:  (2.7|3.5|3.7); python version used to compile the whl,
+                     default is None
+  --with_cv:         (OFF|ON); controls whether to compile cv functions into
+                     lib, default is OFF
+  --with_log:        (OFF|ON); controls whether to print log information,
+                     default is OFF
+  --with_exception:  (OFF|ON); controls whether to throw an exception when an
+                     error occurs, default is OFF
+  --with_intelfpga:  (OFF|ON); controls whether to compile lib for Intel FPGA,
+                     default is ON
+  --with_opencl:     (OFF|ON); controls whether to compile lib for opencl,
+                     default is OFF
+  --with_train:      (OFF|ON); controls whether to add training operators,
+                     default is OFF
+
+arguments of stripping lib according to input model:
+    ./lite/tools/build_intel_fpga.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir
+  --with_strip:      (OFF|ON); controls whether to strip lib according to the
+                     input model, default is OFF
+  --opt_model_dir:   absolute path to the optimized model dir, required when
+                     compiling a stripped library
+  details: https://paddle-lite.readthedocs.io/zh/latest/user_guides/library_tailoring.html
+--------------------------------------------------------------------------------
+
+EOF
+}
+
+function main {
+    if [ -z "$1" ]; then
+        # compiling result contains light_api lib only, recommanded.
+        make_tiny_publish_so
+        exit 0
+    fi
+
+    # Parse command line.
+ for i in "$@"; do + case $i in + # armv8 or armv7hf or armv7, default armv8 + --arch=*) + ARCH="${i#*=}" + shift + ;; + # gcc or clang, default gcc + --toolchain=*) + TOOLCHAIN="${i#*=}" + shift + ;; + # ON or OFF, default OFF + --with_extra=*) + WITH_EXTRA="${i#*=}" + shift + ;; + # ON or OFF, default OFF + --with_python=*) + WITH_PYTHON="${i#*=}" + shift + ;; + # 2.7 or 3.5 or 3.7, default is None + --python_version=*) + PY_VERSION="${i#*=}" + shift + ;; + # ON or OFF, default OFF + --with_cv=*) + WITH_CV="${i#*=}" + shift + ;; + # ON or OFF, default ON + --with_log=*) + WITH_LOG="${i#*=}" + shift + ;; + # ON or OFF, default OFF + --with_exception=*) + WITH_EXCEPTION="${i#*=}" + shift + ;; + # ON or OFF, default OFF + --with_strip=*) + BUILD_TAILOR="${i#*=}" + shift + ;; + # string, absolute path to optimized model dir + --opt_model_dir=*) + OPTMODEL_DIR="${i#*=}" + shift + ;; + # compiling lib which can operate on opencl and cpu. + --with_opencl=*) + WITH_OPENCL="${i#*=}" + shift + ;; + # compiling lib which can operate on intel fpga. + --with_intelfpga=*) + WITH_INTELFPGA="${i#*=}" + shift + ;; + # ON or OFF, default OFF + --with_train=*) + WITH_TRAIN="${i#*=}" + shift + ;; + # compiling result contains both light_api and cxx_api lib. + full_publish) + make_full_publish_so + exit 0 + ;; + # print help info + help) + print_usage + exit 0 + ;; + # unknown option + *) + echo "Error: unsupported argument \"${i#*=}\"" + print_usage + exit 1 + ;; + esac + done + # compiling result contains light_api lib only, recommanded. 
+ make_tiny_publish_so +} + +main $@ From 680a34891c16429061cac6e24fdc3336023d1d39 Mon Sep 17 00:00:00 2001 From: xbeu Date: Wed, 17 Mar 2021 09:42:50 +0000 Subject: [PATCH 03/19] test=develop --- lite/api/paddle_place.cc | 9 ++++++--- lite/api/paddle_place.h | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index a04d632bdc4..d47f2a92a6f 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -82,7 +82,8 @@ const std::string& TargetToStr(TargetType target) { "rknpu", "apu", "huawei_ascend_npu", - "imagination_nna"}; + "imagination_nna", + "intelfpga"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -129,7 +130,8 @@ const std::string& TargetRepr(TargetType target) { "kRKNPU", "kAPU", "kHuaweiAscendNPU", - "kImaginationNNA"}; + "kImaginationNNA", + "kIntelFPGA"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -190,7 +192,8 @@ std::set ExpandValidTargets(TargetType target) { TARGET(kRKNPU), TARGET(kFPGA), TARGET(kHuaweiAscendNPU), - TARGET(kImaginationNNA)}); + TARGET(kImaginationNNA), + TARGET(kIntelFPGA)}); if (target == TARGET(kAny)) { return valid_set; } diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index f0563f63006..62d82398744 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -59,7 +59,8 @@ enum class TargetType : int { kAPU = 13, kHuaweiAscendNPU = 14, kImaginationNNA = 15, - NUM = 16, // number of fields. + kIntelFPGA = 16, + NUM = 17, // number of fields. 
}; enum class PrecisionType : int { kUnk = 0, From a14cbfe803c195aa26e68b24cc74ba86c74a027e Mon Sep 17 00:00:00 2001 From: xbeu Date: Thu, 18 Mar 2021 01:58:32 +0000 Subject: [PATCH 04/19] test=develop --- CMakeLists.txt | 2 +- cmake/configure.cmake | 4 +- cmake/lite.cmake | 34 ++--- docs/demo_guides/intel_fpga.md | 12 +- lite/api/CMakeLists.txt | 38 +++--- lite/api/paddle_place.cc | 2 +- lite/backends/CMakeLists.txt | 2 +- lite/backends/intel_fpga/CMakeLists.txt | 23 ++++ .../lldrv/intelfpgadrv.cpp | 76 +++++------ .../lldrv/intelfpgadrv.h | 122 +++++++++--------- .../{intelfpga => intel_fpga}/lldrv/utils.cpp | 6 +- .../{intelfpga => intel_fpga}/lldrv/utils.h | 4 +- .../target_wrapper.cpp | 8 +- .../target_wrapper.h | 0 lite/backends/intelfpga/CMakeLists.txt | 23 ---- lite/core/CMakeLists.txt | 2 +- lite/core/context.h | 6 +- lite/kernels/CMakeLists.txt | 2 +- lite/kernels/intel_fpga/CMakeLists.txt | 9 ++ .../{intelfpga => intel_fpga}/conv_compute.cc | 14 +- .../{intelfpga => intel_fpga}/conv_compute.h | 4 +- .../conv_depthwise.cc | 6 +- .../conv_depthwise.h | 4 +- .../conv_gemmlike.cc | 30 ++--- .../{intelfpga => intel_fpga}/conv_gemmlike.h | 4 +- lite/kernels/intelfpga/CMakeLists.txt | 9 -- lite/tools/build_intel_fpga.sh | 8 +- 27 files changed, 227 insertions(+), 227 deletions(-) create mode 100644 lite/backends/intel_fpga/CMakeLists.txt rename lite/backends/{intelfpga => intel_fpga}/lldrv/intelfpgadrv.cpp (58%) rename lite/backends/{intelfpga => intel_fpga}/lldrv/intelfpgadrv.h (50%) rename lite/backends/{intelfpga => intel_fpga}/lldrv/utils.cpp (93%) rename lite/backends/{intelfpga => intel_fpga}/lldrv/utils.h (94%) rename lite/backends/{intelfpga => intel_fpga}/target_wrapper.cpp (85%) rename lite/backends/{intelfpga => intel_fpga}/target_wrapper.h (100%) delete mode 100644 lite/backends/intelfpga/CMakeLists.txt create mode 100755 lite/kernels/intel_fpga/CMakeLists.txt rename lite/kernels/{intelfpga => intel_fpga}/conv_compute.cc (92%) rename 
lite/kernels/{intelfpga => intel_fpga}/conv_compute.h (96%) rename lite/kernels/{intelfpga => intel_fpga}/conv_depthwise.cc (97%) rename lite/kernels/{intelfpga => intel_fpga}/conv_depthwise.h (97%) rename lite/kernels/{intelfpga => intel_fpga}/conv_gemmlike.cc (86%) rename lite/kernels/{intelfpga => intel_fpga}/conv_gemmlike.h (98%) delete mode 100755 lite/kernels/intelfpga/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 9dd5a87d7ee..12deaf69752 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,7 +99,7 @@ lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF) -lite_option(LITE_WITH_INTELFPGA "Enable IntelFPGA support in lite" OFF) +lite_option(LITE_WITH_INTEL_FPGA "Enable Intel FPGA support in lite" OFF) lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF) lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF) lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index d1467704ac9..3e25a41a3ed 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -173,8 +173,8 @@ if (LITE_WITH_FPGA) add_definitions("-DLITE_WITH_FPGA") endif() -if (LITE_WITH_INTELFPGA) -add_definitions("-DLITE_WITH_INTELFPGA") +if (LITE_WITH_INTEL_FPGA) +add_definitions("-DLITE_WITH_INTEL_FPGA") endif() if (LITE_WITH_BM) diff --git a/cmake/lite.cmake b/cmake/lite.cmake index b5f115ca973..40d75eb530b 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -22,7 +22,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS 
MLU_DEPS HUAWEI_ASCEND_NPU_DEPS IMAGINATION_NNA_DEPS APU_DEPS CV_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS INTEL_FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS IMAGINATION_NNA_DEPS APU_DEPS CV_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -82,8 +82,8 @@ function (lite_deps TARGET) endforeach(var) endif() - if (LITE_WITH_INTELFPGA) - foreach(var ${lite_deps_INTELFPGA_DEPS}) + if (LITE_WITH_INTEL_FPGA) + foreach(var ${lite_deps_INTEL_FPGA_DEPS}) set(deps ${deps} ${var}) endforeach(var) endif() @@ -161,7 +161,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTEL_FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -177,7 +177,7 @@ function(lite_cc_library TARGET) ARM_DEPS ${args_ARM_DEPS} CV_DEPS ${args_CV_DEPS} FPGA_DEPS ${args_FPGA_DEPS} - INTELFPGA_DEPS ${args_INTELFPGA_DEPS} + INTEL_FPGA_DEPS ${args_INTEL_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} @@ -214,7 +214,7 @@ function(lite_cc_binary TARGET) set(options " -g ") endif() set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS 
HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTEL_FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -226,7 +226,7 @@ function(lite_cc_binary TARGET) CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} - INTELFPGA_DEPS ${args_INTELFPGA_DEPS} + INTEL_FPGA_DEPS ${args_INTEL_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} @@ -270,7 +270,7 @@ function(lite_cc_test TARGET) endif() set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTEL_FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS COMPILE_LEVEL # (basic|extra) @@ -290,7 +290,7 @@ function(lite_cc_test TARGET) CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} - INTELFPGA_DEPS ${args_INTELFPGA_DEPS} + INTEL_FPGA_DEPS ${args_INTEL_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} @@ -327,7 +327,7 @@ set(arm_kernels CACHE INTERNAL "arm kernels") set(x86_kernels CACHE INTERNAL "x86 kernels") set(cuda_kernels CACHE INTERNAL "cuda kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels") -set(intelfpga_kernels CACHE INTERNAL "intelfpga kernels") +set(intel_fpga_kernels CACHE INTERNAL "intel_fpga kernels") set(npu_kernels CACHE INTERNAL "npu kernels") set(apu_kernels CACHE INTERNAL "apu kernels") set(xpu_kernels CACHE 
INTERNAL "xpu kernels") @@ -356,7 +356,7 @@ endif() function(add_kernel TARGET device level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTEL_FPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -441,14 +441,14 @@ function(add_kernel TARGET device level) endif() set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "") endif() - if ("${device}" STREQUAL "INTELFPGA") - if (NOT LITE_WITH_INTELFPGA) + if ("${device}" STREQUAL "INTEL_FPGA") + if (NOT LITE_WITH_INTEL_FPGA) foreach(src ${args_SRCS}) file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") endforeach() return() endif() - set(intelfpga_kernels "${intelfpga_kernels};${TARGET}" CACHE INTERNAL "") + set(intel_fpga_kernels "${intel_fpga_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "BM") if (NOT LITE_WITH_BM) @@ -533,7 +533,7 @@ function(add_kernel TARGET device level) CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} - INTELFPGA_DEPS ${args_INTELFPGA_DEPS} + INTEL_FPGA_DEPS ${args_INTEL_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} @@ -560,7 +560,7 @@ endif() function(add_operator TARGET level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTELFPGA_DEPS BM_DEPS IMAGINATION_NNA_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS INTEL_FPGA_DEPS 
BM_DEPS IMAGINATION_NNA_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -592,7 +592,7 @@ function(add_operator TARGET level) CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} - INTELFPGA_DEPS ${args_INTELFPGA_DEPS} + INTEL_FPGA_DEPS ${args_INTEL_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md index b875ff80e5b..b76920bd134 100644 --- a/docs/demo_guides/intel_fpga.md +++ b/docs/demo_guides/intel_fpga.md @@ -1,6 +1,6 @@ # PaddleLite使用IntelFPGA预测部署 -Paddle Lite支持基于arm的IntelFPGA c5的模型预测,提供armv7hf的交叉编译 +Paddle Lite支持基于arm的IntelFPGA C5的模型预测,提供armv7hf的交叉编译 PaddleLite通过调用底层驱动实现对FPGA硬件的调度,以及对应的API接口。 @@ -27,11 +27,11 @@ Lite支持IntelFPGA作为后端硬件进行模型推理,其主要特性如下 ## 编译 -需要提前准备带有IntelFPGAdrv.ko的IntelFPGA开发板C5MB/C5TB和Lite代码 +需要提前准备带有intelfpgadrv.ko的IntelFPGA开发板C5MB/C5TB和Lite代码 CMAKE编译选项: -- 设置`LITE_WITH_INTELFPGA=ON`和`LITE_WITH_ARM=ON` +- 设置`LITE_WITH_INTEL_FPGA=ON`和`LITE_WITH_ARM=ON` 其他编译选项与ARM编译相同,可以参考[“Paddle Lite在Docker下的ARM编译”](../source_compile/compile_linux)。 @@ -47,11 +47,11 @@ CMAKE编译选项: -DLITE_WITH_OPENMP=ON \ -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ -DWITH_TESTING=OFF \ - -DLITE_WITH_INTELFPGA=ON \ + -DLITE_WITH_INTEL_FPGA=ON \ -DARM_TARGET_OS=armlinux make publish_inference -j2 ``` -Lite提供FPGA编译脚本,位于lite/tools/build_intel_fpga.sh full_publish,在Lite根目录执行该脚本即可编译 +Lite提供IntelFPGA编译脚本,位于lite/tools/build_intel_fpga.sh full_publish,在Lite根目录执行该脚本即可编译 ## 运行示例 @@ -68,7 +68,7 @@ Password: #密码:Awcloud@123 #进入/opt目录[开发板执行] cd /opt #在运行模型前需要加载FPGA驱动[开发板执行] -insmod driver/IntelFPGAdrv.ko +insmod driver/intelfpgadrv.ko ``` - **使用IntelFPGA进行模型预测** diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 73a921b4b20..64b68cc0c02 100644 --- a/lite/api/CMakeLists.txt +++ 
b/lite/api/CMakeLists.txt @@ -177,9 +177,9 @@ if(LITE_WITH_FPGA) set(light_api_deps ${light_api_deps} ${fpga_deps}) set(cxx_api_deps ${cxx_api_deps} ${fpga_deps}) endif() -if(LITE_WITH_INTELFPGA) - set(light_api_deps ${light_api_deps} ${intelfpga_deps}) - set(cxx_api_deps ${cxx_api_deps} ${intelfpga_deps}) +if(LITE_WITH_INTEL_FPGA) + set(light_api_deps ${light_api_deps} ${intel_fpga_deps}) + set(cxx_api_deps ${cxx_api_deps} ${intel_fpga_deps}) endif() if(LITE_WITH_BM) set(light_api_deps ${light_api_deps} ${bm_deps}) @@ -212,7 +212,7 @@ list(LENGTH apu_kernels num_apu_kernels) list(LENGTH xpu_kernels num_xpu_kernels) list(LENGTH rknpu_kernels num_rknpu_kernels) list(LENGTH fpga_kernels num_fpga_kernels) -list(LENGTH intelfpga_kernels num_intelfpga_kernels) +list(LENGTH intel_fpga_kernels num_intel_fpga_kernels) list(LENGTH bm_kernels num_bm_kernels) list(LENGTH mlu_kernels num_mlu_kernels) list(LENGTH huawei_ascend_npu_kernels num_huawei_ascend_npu_kernels) @@ -229,7 +229,7 @@ message(STATUS "Collected ${num_apu_kernels} APU kernels") message(STATUS "Collected ${num_xpu_kernels} XPU kernels") message(STATUS "Collected ${num_rknpu_kernels} RKNPU kernels") message(STATUS "Collected ${num_fpga_kernels} FPGA kernels") -message(STATUS "Collected ${num_intelfpga_kernels} INTELFPGA kernels") +message(STATUS "Collected ${num_intel_fpga_kernels} INTEL_FPGA kernels") message(STATUS "Collected ${num_bm_kernels} BM kernels") message(STATUS "Collected ${num_mlu_kernels} MLU kernels") message(STATUS "Collected ${num_huawei_ascend_npu_kernels} HUAWEI_ASCEND_NPU kernels") @@ -254,7 +254,7 @@ if (NOT LITE_ON_TINY_PUBLISH) IMAGINATION_NNA_DEPS ${imagination_nna_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - INTELFPGA_DEPS ${intelfpga_kernels} + INTEL_FPGA_DEPS ${intel_fpga_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) endif() @@ -278,7 +278,7 @@ lite_cc_library(light_api SRCS light_api.cc RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS 
${fpga_kernels} - INTELFPGA_DEPS ${intelfpga_kernels} + INTEL_FPGA_DEPS ${intel_fpga_kernels} BM_DEPS ${bm_kernels} IMAGINATION_NNA_DEPS ${imagination_nna_kernels} MLU_DEPS ${mlu_kernels} @@ -303,7 +303,7 @@ if(WITH_TESTING) RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - INTELFPGA_DEPS ${intelfpga_kernels} + INTEL_FPGA_DEPS ${intel_fpga_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} IMAGINATION_NNA_DEPS ${imagination_nna_kernels} @@ -360,7 +360,7 @@ if(WITH_TESTING) endif() if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) - set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${apu_kernels} ${fpga_kernels} ${intelfpga_kernels}) + set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${apu_kernels} ${fpga_kernels} ${intel_fpga_kernels}) lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc DEPS ${lite_model_test_DEPS} @@ -459,7 +459,7 @@ if (NOT LITE_ON_TINY_PUBLISH) APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - INTELFPGA_DEPS ${intelfpga_kernels} + INTEL_FPGA_DEPS ${intel_fpga_kernels} IMAGINATION_NNA_DEPS ${imagination_nna_kernels} BM_DEPS ${bm_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) @@ -479,7 +479,7 @@ if(NOT WITH_COVERAGE) DEPS light_api program mir_passes paddle_api_light CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - INTELFPGA_DEPS ${intelfpga_kernels} + INTEL_FPGA_DEPS ${intel_fpga_kernels} RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -490,7 +490,7 @@ if(NOT WITH_COVERAGE) X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} FPGA_DEPS ${fpga_kernels} - INTELFPGA_DEPS ${intelfpga_kernels} + INTEL_FPGA_DEPS ${intel_fpga_kernels} RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} @@ -535,7 +535,7 @@ if(NOT WITH_COVERAGE) CL_DEPS ${opencl_kernels} X86_DEPS 
${x86_kernels} FPGA_DEPS ${fpga_kernels} - INTELFPGA_DEPS ${intelfpga_kernels} + INTEL_FPGA_DEPS ${intel_fpga_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} IMAGINATION_NNA_DEPS ${imagination_nna_kernels} @@ -561,7 +561,7 @@ if(NOT IOS) RKNPU_DEPS ${rknpu_kernels} IMAGINATION_NNA_DEPS ${imagination_nna_kernels} FPGA_DEPS ${fpga_kernels} - INTELFPGA_DEPS ${intelfpga_kernels} + INTEL_FPGA_DEPS ${intel_fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) @@ -579,7 +579,7 @@ if(NOT IOS) RKNPU_DEPS ${rknpu_kernels} IMAGINATION_NNA_DEPS ${imagination_nna_kernels} FPGA_DEPS ${fpga_kernels} - INTELFPGA_DEPS ${intelfpga_kernels} + INTEL_FPGA_DEPS ${intel_fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) @@ -597,7 +597,7 @@ if(NOT IOS) RKNPU_DEPS ${rknpu_kernels} IMAGINATION_NNA_DEPS ${imagination_nna_kernels} FPGA_DEPS ${fpga_kernels} - INTELFPGA_DEPS ${intelfpga_kernels} + INTEL_FPGA_DEPS ${intel_fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) @@ -614,7 +614,7 @@ if(NOT IOS) APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - INTELFPGA_DEPS ${intelfpga_kernels} + INTEL_FPGA_DEPS ${intel_fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) @@ -633,7 +633,7 @@ if(NOT IOS) CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} - INTELFPGA_DEPS ${intelfpga_kernels} + INTEL_FPGA_DEPS ${intel_fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -648,7 +648,7 @@ if(NOT IOS) APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - INTELFPGA_DEPS ${intelfpga_kernels} + INTEL_FPGA_DEPS ${intel_fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) diff --git 
a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index d47f2a92a6f..8853baae836 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -83,7 +83,7 @@ const std::string& TargetToStr(TargetType target) { "apu", "huawei_ascend_npu", "imagination_nna", - "intelfpga"}; + "intel_fpga"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt index 848bf47fc3f..7c05e6138f1 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -12,4 +12,4 @@ add_subdirectory(apu) add_subdirectory(rknpu) add_subdirectory(huawei_ascend_npu) add_subdirectory(imagination_nna) -add_subdirectory(intelfpga) +add_subdirectory(intel_fpga) diff --git a/lite/backends/intel_fpga/CMakeLists.txt b/lite/backends/intel_fpga/CMakeLists.txt new file mode 100644 index 00000000000..c47a33be007 --- /dev/null +++ b/lite/backends/intel_fpga/CMakeLists.txt @@ -0,0 +1,23 @@ +if (NOT LITE_WITH_INTEL_FPGA) + return() +endif() + +set(LITE_INTEL_FPGA_PATH "${PADDLE_SOURCE_DIR}/lite/backends/intel_fpga") +set(LITE_INTEL_FPGA_LLDRV_PATH "${PADDLE_SOURCE_DIR}/lite/backends/intel_fpga/lldrv") + +message("intel_fpga_path ${LITE_INTEL_FPGA_PATH}") +file(GLOB INTEL_FPGA_CPP "${LITE_INTEL_FPGA_PATH}/*.cpp") +file(GLOB LLDRV_CPP "${LITE_INTEL_FPGA_LLDRV_PATH}/*.cpp") +message("intel_fpga cpp: ${INTEL_FPGA_CPP}") +set(INTEL_FPGA_ALL_CPP "") +FOREACH(FILE_PATH ${LLDRV_CPP}) + STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH}) + list(APPEND INTEL_FPGA_ALL_CPP lldrv/${FILE_NAME}) +ENDFOREACH(FILE_PATH) +FOREACH(FILE_PATH ${INTEL_FPGA_CPP}) + STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH}) + list(APPEND INTEL_FPGA_ALL_CPP ${FILE_NAME}) +ENDFOREACH(FILE_PATH) +message("intel_fpga src: ${INTEL_FPGA_ALL_CPP}") +cc_library(kernel_intel_fpga SRCS ${INTEL_FPGA_ALL_CPP}) +cc_library(intel_fpga_target_wrapper SRCS target_wrapper.cpp DEPS
kernel_intel_fpga) diff --git a/lite/backends/intelfpga/lldrv/intelfpgadrv.cpp b/lite/backends/intel_fpga/lldrv/intelfpgadrv.cpp similarity index 58% rename from lite/backends/intelfpga/lldrv/intelfpgadrv.cpp rename to lite/backends/intel_fpga/lldrv/intelfpgadrv.cpp index 55e4bf92f0d..cc188e483cf 100644 --- a/lite/backends/intelfpga/lldrv/intelfpgadrv.cpp +++ b/lite/backends/intel_fpga/lldrv/intelfpgadrv.cpp @@ -24,22 +24,22 @@ limitations under the License. */ #include #include -#include "lite/backends/intelfpga/lldrv/intelfpgadrv.h" +#include "lite/backends/intel_fpga/lldrv/intelfpgadrv.h" namespace paddle { namespace lite { -namespace intelfpga { +namespace intel_fpga { -/// FD of intelfpga -static int intelfpga_fd = -1; +/// FD of intel_fpga +static int intel_fpga_fd = -1; /// Memory blocks -static struct intelfpga_memblk_s mb, ms, mi, mk, mo; +static struct intel_fpga_memblk_s mb, ms, mi, mk, mo; -int intelfpga_open() { - if (intelfpga_fd < 0) { - intelfpga_fd = open("/dev/intelfpgadrv0", O_RDWR); - if (intelfpga_fd < 0) { +int intel_fpga_open() { + if (intel_fpga_fd < 0) { + intel_fpga_fd = open("/dev/intelfpgadrv0", O_RDWR); + if (intel_fpga_fd < 0) { return -1; } memset(&mb, 0, sizeof(mb)); @@ -52,8 +52,8 @@ int intelfpga_open() { return 0; } -void intelfpga_close() { - if (intelfpga_fd < 0) return; +void intel_fpga_close() { + if (intel_fpga_fd < 0) return; if (mb.addr) { free(mb.addr); @@ -70,16 +70,16 @@ void intelfpga_close() { if (mo.addr) { free(mo.addr); } - close(intelfpga_fd); - intelfpga_fd = -1; + close(intel_fpga_fd); + intel_fpga_fd = -1; } /// memory management; -void* intelfpga_malloc(size_t size) { return malloc(size); } +void* intel_fpga_malloc(size_t size) { return malloc(size); } -void intelfpga_free(void* ptr) { free(ptr); } +void intel_fpga_free(void* ptr) { free(ptr); } -void* intelfpga_mbias(size_t size) { +void* intel_fpga_mbias(size_t size) { if (mb.addr) { if (mb.size >= size) { return mb.addr; @@ -93,7 +93,7 @@ void* 
intelfpga_mbias(size_t size) { return mb.addr; } -void* intelfpga_mscale(size_t size) { +void* intel_fpga_mscale(size_t size) { if (ms.addr) { if (ms.size >= size) { return ms.addr; @@ -108,7 +108,7 @@ void* intelfpga_mscale(size_t size) { return ms.addr; } -void* intelfpga_minput(size_t size) { +void* intel_fpga_minput(size_t size) { if (mi.addr) { if (mi.size >= size) { return mi.addr; @@ -123,7 +123,7 @@ void* intelfpga_minput(size_t size) { return mi.addr; } -void* intelfpga_mkernel(size_t size) { +void* intel_fpga_mkernel(size_t size) { if (mk.addr) { if (mk.size >= size) { return mk.addr; @@ -138,7 +138,7 @@ void* intelfpga_mkernel(size_t size) { return mk.addr; } -void* intelfpga_moutput(size_t size) { +void* intel_fpga_moutput(size_t size) { if (mo.addr) { if (mo.size >= size) { return mo.addr; @@ -153,40 +153,40 @@ void* intelfpga_moutput(size_t size) { return mo.addr; } -void intelfpga_copy(void* dst, void* src, int size) { memcpy(dst, src, size); } +void intel_fpga_copy(void* dst, void* src, int size) { memcpy(dst, src, size); } -int intelfpga_info(struct intelfpga_info_s* args) { - int cmd = INTELFPGA_IOCTL_MAKE(INTELFPGA_CMD_INFO); +int intel_fpga_info(struct intel_fpga_info_s* args) { + int cmd = INTEL_FPGA_IOCTL_MAKE(INTEL_FPGA_CMD_INFO); - if (intelfpga_open()) return -1; + if (intel_fpga_open()) return -1; - return ioctl(intelfpga_fd, cmd, args); + return ioctl(intel_fpga_fd, cmd, args); } -int intelfpga_conv(struct intelfpga_conv_s* args) { - int cmd = INTELFPGA_IOCTL_MAKE(INTELFPGA_CMD_CONV); +int intel_fpga_conv(struct intel_fpga_conv_s* args) { + int cmd = INTEL_FPGA_IOCTL_MAKE(INTEL_FPGA_CMD_CONV); - if (intelfpga_open()) return -1; + if (intel_fpga_open()) return -1; - return ioctl(intelfpga_fd, cmd, args); + return ioctl(intel_fpga_fd, cmd, args); } -int intelfpga_pooling(struct intelfpga_pool_s* args) { - int cmd = INTELFPGA_IOCTL_MAKE(INTELFPGA_CMD_POOL); +int intel_fpga_pooling(struct intel_fpga_pool_s* args) { + int cmd = 
INTEL_FPGA_IOCTL_MAKE(INTEL_FPGA_CMD_POOL); - if (intelfpga_open()) return -1; + if (intel_fpga_open()) return -1; - return ioctl(intelfpga_fd, cmd, args); + return ioctl(intel_fpga_fd, cmd, args); } -int intelfpga_fullconnect(struct intelfpga_fcon_s* args) { - int cmd = INTELFPGA_IOCTL_MAKE(INTELFPGA_CMD_FCON); +int intel_fpga_fullconnect(struct intel_fpga_fcon_s* args) { + int cmd = INTEL_FPGA_IOCTL_MAKE(INTEL_FPGA_CMD_FCON); - if (intelfpga_open()) return -1; + if (intel_fpga_open()) return -1; - return ioctl(intelfpga_fd, cmd, args); + return ioctl(intel_fpga_fd, cmd, args); } -} // namespace intelfpga +} // namespace intel_fpga } // namespace lite } // namespace paddle diff --git a/lite/backends/intelfpga/lldrv/intelfpgadrv.h b/lite/backends/intel_fpga/lldrv/intelfpgadrv.h similarity index 50% rename from lite/backends/intelfpga/lldrv/intelfpgadrv.h rename to lite/backends/intel_fpga/lldrv/intelfpgadrv.h index f35c343e030..0a162e7af9d 100644 --- a/lite/backends/intelfpga/lldrv/intelfpgadrv.h +++ b/lite/backends/intel_fpga/lldrv/intelfpgadrv.h @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef _LLDRV_INTELFPGA_H_ -#define _LLDRV_INTELFPGA_H_ +#ifndef _LLDRV_INTEL_FPGA_H_ +#define _LLDRV_INTEL_FPGA_H_ #pragma once @@ -24,38 +24,38 @@ limitations under the License. 
*/ namespace paddle { namespace lite { -namespace intelfpga { +namespace intel_fpga { // Activation type -enum intelfpga_act_e { +enum intel_fpga_act_e { ACT_NONE = 0, ACT_RELU = 1, }; // Device information -struct intelfpga_info_s { +struct intel_fpga_info_s { uint32_t ver; // Version, 00.00.0000 }; -struct intelfpga_reset_s { +struct intel_fpga_reset_s { uint32_t val; // reset command, N/A }; // Memory copy -struct intelfpga_mcopy_s { +struct intel_fpga_mcopy_s { void* src; // source address void* dst; // destination adddress size_t size; // size in bytes }; // Memory block -struct intelfpga_memblk_s { +struct intel_fpga_memblk_s { void* addr; // base address size_t size; // size in bytes }; // Kernel -struct intelfpga_kernel_s { +struct intel_fpga_kernel_s { uint32_t kw; // width uint32_t kh; // height uint32_t ws; // width stride(s) @@ -63,7 +63,7 @@ struct intelfpga_kernel_s { }; // Input parameters, nchw -struct intelfpga_input_s { +struct intel_fpga_input_s { uint32_t in; // nbr of batch {1} uint32_t ic; // nbr of channels {1} uint32_t iw; // width @@ -77,7 +77,7 @@ struct intelfpga_input_s { }; // Output parameters, nchw -struct intelfpga_output_s { +struct intel_fpga_output_s { uint32_t on; // nbr of batch {1} uint32_t oc; // nbr of channels {1} uint32_t ow; // width @@ -85,20 +85,20 @@ struct intelfpga_output_s { }; // Basic convolution -struct intelfpga_conv_s { - uint32_t at; // activation type {0}, None=0, RELU=1 - uint32_t ng; // nbr of groups {1} - int8_t* ia; // input address, INT8[N,Ci,Hi,Wi] - int8_t* ka; // kernel address, INT32[Co,Ci,Hk,Wk] - int32_t* ba; // bias address, INT32[Co,1] - int32_t* oa; // output address, INT32[N,Co,Ho,Wo] - struct intelfpga_input_s i; // input - struct intelfpga_kernel_s k; // kernel - struct intelfpga_output_s o; // output +struct intel_fpga_conv_s { + uint32_t at; // activation type {0}, None=0, RELU=1 + uint32_t ng; // nbr of groups {1} + int8_t* ia; // input address, INT8[N,Ci,Hi,Wi] + int8_t* ka; // kernel 
address, INT32[Co,Ci,Hk,Wk] + int32_t* ba; // bias address, INT32[Co,1] + int32_t* oa; // output address, INT32[N,Co,Ho,Wo] + struct intel_fpga_input_s i; // input + struct intel_fpga_kernel_s k; // kernel + struct intel_fpga_output_s o; // output }; // Pooling convolution -struct intelfpga_pool_s { +struct intel_fpga_pool_s { uint32_t gp : 1; // global pooling {0} uint32_t pm : 1; // pooling mode {0}, Max=0, AVG=1 uint32_t cm : 1; // ceil mode {0}, ceil=0, floor=1 @@ -106,13 +106,13 @@ struct intelfpga_pool_s { uint32_t reserved : 28; // reserved {0} int32_t* ia; // input address, INT32[N,Ci,Hi,Wi] int32_t* oa; // output address, INT32[N,Ci,Ho,Wo] - struct intelfpga_input_s i; // input - struct intelfpga_kernel_s k; // kernel - struct intelfpga_output_s o; // output + struct intel_fpga_input_s i; // input + struct intel_fpga_kernel_s k; // kernel + struct intel_fpga_output_s o; // output }; // Full connection -struct intelfpga_fcon_s { +struct intel_fpga_fcon_s { uint32_t at; // activation type {0}, None=0, RELU=1 int8_t* ia; // input address, INT8[M,K] int8_t* ka; // kernel address, INT8[K,N] @@ -122,65 +122,65 @@ struct intelfpga_fcon_s { }; // Regisger access -struct intelfpga_creg_s { +struct intel_fpga_creg_s { uint32_t addr; uint32_t data; }; -#define INTELFPGA_MAGIC_ID (('A' + 'L' + 'T' + 'R') / 4) +#define INTEL_FPGA_MAGIC_ID (('A' + 'L' + 'T' + 'R') / 4) /* Ioctls */ -#define INTELFPGA_IOCTL_MAKE(cmd) (_IO(INTELFPGA_MAGIC_ID, cmd)) -#define INTELFPGA_IOCTL_GET(cmd) (_IOC_NR(cmd)) -#define INTELFPGA_IOCTL_VALID(cmd) \ - ((_IOC_TYPE(cmd) == INTELFPGA_MAGIC_ID) ? 1 : 0) +#define INTEL_FPGA_IOCTL_MAKE(cmd) (_IO(INTEL_FPGA_MAGIC_ID, cmd)) +#define INTEL_FPGA_IOCTL_GET(cmd) (_IOC_NR(cmd)) +#define INTEL_FPGA_IOCTL_VALID(cmd) \ + ((_IOC_TYPE(cmd) == INTEL_FPGA_MAGIC_ID) ? 
1 : 0) -#define INTELFPGA_CMD_INFO 0x00 // struct intelfpga_info_s -#define INTELFPGA_CMD_RESET 0x01 // struct intelfpga_reset_s +#define INTEL_FPGA_CMD_INFO 0x00 // struct intel_fpga_info_s +#define INTEL_FPGA_CMD_RESET 0x01 // struct intel_fpga_reset_s -#define INTELFPGA_CMD_MCOPY 0x10 // struct intelfpga_mcopy_s -#define INTELFPGA_CMD_INVAL 0x11 // struct intelfpga_cache_s -#define INTELFPGA_CMD_FLUSH 0x12 // struct intelfpga_cache_s +#define INTEL_FPGA_CMD_MCOPY 0x10 // struct intel_fpga_mcopy_s +#define INTEL_FPGA_CMD_INVAL 0x11 // struct intel_fpga_cache_s +#define INTEL_FPGA_CMD_FLUSH 0x12 // struct intel_fpga_cache_s -#define INTELFPGA_CMD_CONV 0x20 // struct intelfpga_conv_s -#define INTELFPGA_CMD_POOL 0x21 // struct intelfpga_pool_s -#define INTELFPGA_CMD_FCON 0x22 // struct intelfpga_fcon_s +#define INTEL_FPGA_CMD_CONV 0x20 // struct intel_fpga_conv_s +#define INTEL_FPGA_CMD_POOL 0x21 // struct intel_fpga_pool_s +#define INTEL_FPGA_CMD_FCON 0x22 // struct intel_fpga_fcon_s -#define INTELFPGA_CMD_REGRD 0xC0 // struct intelfpga_register_s -#define INTELFPGA_CMD_REGWR 0xC1 // struct intelfpga_register_s +#define INTEL_FPGA_CMD_REGRD 0xC0 // struct intel_fpga_register_s +#define INTEL_FPGA_CMD_REGWR 0xC1 // struct intel_fpga_register_s //--------------------------------------------------------------------------- // device open/close -int intelfpga_open(); -void intelfpga_close(); +int intel_fpga_open(); +void intel_fpga_close(); -void intelfpga_reset(struct intelfpga_reset_s* args); +void intel_fpga_reset(struct intel_fpga_reset_s* args); // memory management -void* intelfpga_malloc(size_t size); -void intelfpga_free(void* ptr); +void* intel_fpga_malloc(size_t size); +void intel_fpga_free(void* ptr); -void* intelfpga_mbias(size_t size); -void* intelfpga_mscale(size_t size); -void* intelfpga_minput(size_t size); -void* intelfpga_mkernel(size_t size); -void* intelfpga_moutput(size_t size); +void* intel_fpga_mbias(size_t size); +void* intel_fpga_mscale(size_t 
size); +void* intel_fpga_minput(size_t size); +void* intel_fpga_mkernel(size_t size); +void* intel_fpga_moutput(size_t size); -void intelfpga_copy(void* dst, void* src, int size); -int intelfpga_flush(void* addr, size_t size); -int intelfpga_invalidate(void* addr, size_t size); +void intel_fpga_copy(void* dst, void* src, int size); +int intel_fpga_flush(void* addr, size_t size); +int intel_fpga_invalidate(void* addr, size_t size); // device information -int intelfpga_info(struct intelfpga_info_s* args); +int intel_fpga_info(struct intel_fpga_info_s* args); // convolution process -int intelfpga_conv(struct intelfpga_conv_s* args); -int intelfpga_pooling(struct intelfpga_pool_s* args); -int intelfpga_fullconnect(struct intelfpga_fcon_s* args); +int intel_fpga_conv(struct intel_fpga_conv_s* args); +int intel_fpga_pooling(struct intel_fpga_pool_s* args); +int intel_fpga_fullconnect(struct intel_fpga_fcon_s* args); -} // namespace intelfpga +} // namespace intel_fpga } // namespace lite } // namespace paddle -#endif // _LLDRV_INTELFPGA_H_ +#endif // _LLDRV_INTEL_FPGA_H_ diff --git a/lite/backends/intelfpga/lldrv/utils.cpp b/lite/backends/intel_fpga/lldrv/utils.cpp similarity index 93% rename from lite/backends/intelfpga/lldrv/utils.cpp rename to lite/backends/intel_fpga/lldrv/utils.cpp index 0ad6fb9836d..380e79e4d31 100644 --- a/lite/backends/intelfpga/lldrv/utils.cpp +++ b/lite/backends/intel_fpga/lldrv/utils.cpp @@ -17,11 +17,11 @@ limitations under the License. 
*/ #include #include -#include "lite/backends/intelfpga/lldrv/utils.h" +#include "lite/backends/intel_fpga/lldrv/utils.h" namespace paddle { namespace lite { -namespace intelfpga { +namespace intel_fpga { float find_max(const float* data, int size) { float max = 0.0; @@ -67,6 +67,6 @@ void quantize_s32(const float* src, int32_t* dst, int size, float factor) { dst[i] = (int32_t)fdata; } } -} // namespace intelfpga +} // namespace intel_fpga } // namespace lite } // namespace paddle diff --git a/lite/backends/intelfpga/lldrv/utils.h b/lite/backends/intel_fpga/lldrv/utils.h similarity index 94% rename from lite/backends/intelfpga/lldrv/utils.h rename to lite/backends/intel_fpga/lldrv/utils.h index d3883cc3e07..ad8e403afd8 100644 --- a/lite/backends/intelfpga/lldrv/utils.h +++ b/lite/backends/intel_fpga/lldrv/utils.h @@ -21,13 +21,13 @@ limitations under the License. */ namespace paddle { namespace lite { -namespace intelfpga { +namespace intel_fpga { float find_max(const float* data, int size); void quantize_s8(const float* src, int8_t* dst, int size, float factor); void quantize_s32(const float* src, int32_t* dst, int size, float factor); -} // namespace intelfpga +} // namespace intel_fpga } // namespace lite } // namespace paddle diff --git a/lite/backends/intelfpga/target_wrapper.cpp b/lite/backends/intel_fpga/target_wrapper.cpp similarity index 85% rename from lite/backends/intelfpga/target_wrapper.cpp rename to lite/backends/intel_fpga/target_wrapper.cpp index c2de3ff6bfb..0d567016c91 100644 --- a/lite/backends/intelfpga/target_wrapper.cpp +++ b/lite/backends/intel_fpga/target_wrapper.cpp @@ -12,19 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/backends/intelfpga/target_wrapper.h" -#include "lite/backends/intelfpga/lldrv/intelfpgadrv.h" +#include "lite/backends/intel_fpga/target_wrapper.h" +#include "lite/backends/intel_fpga/lldrv/intelfpgadrv.h" #include "lite/utils/all.h" namespace paddle { namespace lite { void* TargetWrapper::Malloc(size_t size) { - return intelfpga::intelfpga_malloc(size); + return intel_fpga::intel_fpga_malloc(size); } void TargetWrapper::Free(void* ptr) { - intelfpga::intelfpga_free(ptr); + intel_fpga::intel_fpga_free(ptr); } void TargetWrapper::MemcpySync(void* dst, diff --git a/lite/backends/intelfpga/target_wrapper.h b/lite/backends/intel_fpga/target_wrapper.h similarity index 100% rename from lite/backends/intelfpga/target_wrapper.h rename to lite/backends/intel_fpga/target_wrapper.h diff --git a/lite/backends/intelfpga/CMakeLists.txt b/lite/backends/intelfpga/CMakeLists.txt deleted file mode 100644 index 1ee8eccae05..00000000000 --- a/lite/backends/intelfpga/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -if (NOT LITE_WITH_INTELFPGA) - return() -endif() - -set(LITE_INTELFPGA_PATH "${PADDLE_SOURCE_DIR}/lite/backends/intelfpga") -set(LITE_INTELFPGA_LLDRV_PATH "${PADDLE_SOURCE_DIR}/lite/backends/intelfpga/lldrv") - -message("intelfpga_path ${LITE_INTELFPGA_PATH}") -file(GLOB INTELFPGA_CPP "${LITE_INTELFPGA_PATH}/*.cpp") -file(GLOB LLDRV_CPP "${LITE_INTELFPGA_LLDRV_PATH}/*.cpp") -message("intelfpga cpp: ${INTELFPGA_CPP}") -set(INTELFPGA_ALL_CPP "") -FOREACH(FILE_PATH ${LLDRV_CPP}) - STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH}) - list(APPEND INTELFPGA_ALL_CPP lldrv/${FILE_NAME}) -ENDFOREACH(FILE_PATH) -FOREACH(FILE_PATH ${INTELFPGA_CPP}) - STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH}) - list(APPEND INTELFPGA_ALL_CPP ${FILE_NAME}) -ENDFOREACH(FILE_PATH) -message("intelfpga src: ${INTELFPGA_ALL_CPP}") -cc_library(kernel_intelfpga SRCS ${INTELFPGA_ALL_CPP}) -cc_library(intelfpga_target_wrapper SRCS target_wrapper.cpp DEPS 
kernel_intelfpga) diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 18ed6d7f9a8..08881d6f523 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -8,7 +8,7 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc XPU_DEPS target_wrapper_xpu CL_DEPS cl_target_wrapper FPGA_DEPS fpga_target_wrapper - INTELFPGA_DEPS intelfpga_target_wrapper + INTEL_FPGA_DEPS intel_fpga_target_wrapper BM_DEPS target_wrapper_bm MLU_DEPS target_wrapper_mlu) diff --git a/lite/core/context.h b/lite/core/context.h index e8789d16ea7..6455a8b972a 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -328,7 +328,7 @@ class Context { }; #endif -#ifdef LITE_WITH_INTELFPGA +#ifdef LITE_WITH_INTEL_FPGA // TODO(xbeu): add needed implementation to context template <> class Context { @@ -563,7 +563,7 @@ class ContextScheduler { &ctx->As()); break; #endif -#ifdef LITE_WITH_INTELFPGA +#ifdef LITE_WITH_INTEL_FPGA case TARGET(kIntelFPGA): kernel_contexts_[TargetType::kIntelFPGA] .As() @@ -625,7 +625,7 @@ class ContextScheduler { #ifdef LITE_WITH_FPGA InitContext(); #endif -#ifdef LITE_WITH_INTELFPGA +#ifdef LITE_WITH_INTEL_FPGA InitContext(); #endif #ifdef LITE_WITH_NPU diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt index 52649cdc520..343da4968ae 100644 --- a/lite/kernels/CMakeLists.txt +++ b/lite/kernels/CMakeLists.txt @@ -16,4 +16,4 @@ add_subdirectory(bm) add_subdirectory(rknpu) add_subdirectory(huawei_ascend_npu) add_subdirectory(imagination_nna) -add_subdirectory(intelfpga) +add_subdirectory(intel_fpga) diff --git a/lite/kernels/intel_fpga/CMakeLists.txt b/lite/kernels/intel_fpga/CMakeLists.txt new file mode 100755 index 00000000000..276f4cb7e54 --- /dev/null +++ b/lite/kernels/intel_fpga/CMakeLists.txt @@ -0,0 +1,9 @@ +if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_INTEL_FPGA)) + return() +endif() + +set(intel_fpga_deps intel_fpga_target_wrapper kernel_intel_fpga) + 
+add_kernel(conv_depthwise_intel_fpga INTEL_FPGA basic SRCS conv_depthwise.cc DEPS ${intel_fpga_deps}) +add_kernel(conv_gemmlike_intel_fpga INTEL_FPGA basic SRCS conv_gemmlike.cc DEPS ${intel_fpga_deps}) +add_kernel(conv_compute_intel_fpga INTEL_FPGA basic SRCS conv_compute.cc DEPS ${intel_fpga_deps} conv_depthwise_intel_fpga conv_gemmlike_intel_fpga) diff --git a/lite/kernels/intelfpga/conv_compute.cc b/lite/kernels/intel_fpga/conv_compute.cc similarity index 92% rename from lite/kernels/intelfpga/conv_compute.cc rename to lite/kernels/intel_fpga/conv_compute.cc index e0c75367bd2..763ca83c7a2 100644 --- a/lite/kernels/intelfpga/conv_compute.cc +++ b/lite/kernels/intel_fpga/conv_compute.cc @@ -12,17 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/intelfpga/conv_compute.h" +#include "lite/kernels/intel_fpga/conv_compute.h" #include #include "lite/core/op_registry.h" #include "lite/core/type_system.h" -#include "lite/kernels/intelfpga/conv_depthwise.h" -#include "lite/kernels/intelfpga/conv_gemmlike.h" +#include "lite/kernels/intel_fpga/conv_depthwise.h" +#include "lite/kernels/intel_fpga/conv_gemmlike.h" namespace paddle { namespace lite { namespace kernels { -namespace intelfpga { +namespace intel_fpga { #define PARAM_INIT \ auto& param = this->Param(); \ auto w_dims = param.filter->dims(); \ @@ -73,13 +73,13 @@ void ConvCompute::PrepareForRun() { is_first_epoch_ = false; } -} // namespace intelfpga +} // namespace intel_fpga } // namespace kernels } // namespace lite } // namespace paddle -typedef paddle::lite::kernels::intelfpga::ConvCompute +typedef paddle::lite::kernels::intel_fpga::ConvCompute ConvFp32; REGISTER_LITE_KERNEL(conv2d, kIntelFPGA, kFloat, kNCHW, ConvFp32, def) diff --git a/lite/kernels/intelfpga/conv_compute.h b/lite/kernels/intel_fpga/conv_compute.h similarity index 96% rename from lite/kernels/intelfpga/conv_compute.h rename to 
lite/kernels/intel_fpga/conv_compute.h index a9fd135e431..604972c2914 100644 --- a/lite/kernels/intelfpga/conv_compute.h +++ b/lite/kernels/intel_fpga/conv_compute.h @@ -20,7 +20,7 @@ namespace paddle { namespace lite { namespace kernels { -namespace intelfpga { +namespace intel_fpga { template class ConvCompute : public KernelLite { @@ -49,7 +49,7 @@ class ConvCompute : public KernelLite { KernelLite* impl_{nullptr}; }; -} // namespace intelfpga +} // namespace intel_fpga } // namespace kernels } // namespace lite } // namespace paddle diff --git a/lite/kernels/intelfpga/conv_depthwise.cc b/lite/kernels/intel_fpga/conv_depthwise.cc similarity index 97% rename from lite/kernels/intelfpga/conv_depthwise.cc rename to lite/kernels/intel_fpga/conv_depthwise.cc index 80cab07e848..96f5f3512a0 100644 --- a/lite/kernels/intelfpga/conv_depthwise.cc +++ b/lite/kernels/intel_fpga/conv_depthwise.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/intelfpga/conv_depthwise.h" +#include "lite/kernels/intel_fpga/conv_depthwise.h" #include "lite/backends/arm/math/conv_block_utils.h" #include "lite/backends/arm/math/conv_impl.h" namespace paddle { namespace lite { namespace kernels { -namespace intelfpga { +namespace intel_fpga { template <> void DepthwiseConv::ReInitWhenNeeded() {} @@ -122,7 +122,7 @@ void DepthwiseConv::Run() { w_scale_.data()); } -} // namespace intelfpga +} // namespace intel_fpga } // namespace kernels } // namespace lite } // namespace paddle diff --git a/lite/kernels/intelfpga/conv_depthwise.h b/lite/kernels/intel_fpga/conv_depthwise.h similarity index 97% rename from lite/kernels/intelfpga/conv_depthwise.h rename to lite/kernels/intel_fpga/conv_depthwise.h index 3f9bf657e02..7f1e2f31b47 100644 --- a/lite/kernels/intelfpga/conv_depthwise.h +++ b/lite/kernels/intel_fpga/conv_depthwise.h @@ -25,7 +25,7 @@ namespace paddle { namespace lite { namespace kernels { -namespace intelfpga { +namespace intel_fpga { template class DepthwiseConv : public KernelLite { @@ -61,7 +61,7 @@ class DepthwiseConv : public KernelLite { std::vector w_scale_; }; -} // namespace intelfpga +} // namespace intel_fpga } // namespace kernels } // namespace lite } // namespace paddle diff --git a/lite/kernels/intelfpga/conv_gemmlike.cc b/lite/kernels/intel_fpga/conv_gemmlike.cc similarity index 86% rename from lite/kernels/intelfpga/conv_gemmlike.cc rename to lite/kernels/intel_fpga/conv_gemmlike.cc index 2131d2c032f..bc9b6f68014 100644 --- a/lite/kernels/intelfpga/conv_gemmlike.cc +++ b/lite/kernels/intel_fpga/conv_gemmlike.cc @@ -12,17 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/intelfpga/conv_gemmlike.h" +#include "lite/kernels/intel_fpga/conv_gemmlike.h" #include #include "lite/backends/arm/math/gemm_prepacked_int8.h" #include "lite/backends/arm/math/packed_sgemm.h" -#include "lite/backends/intelfpga/lldrv/intelfpgadrv.h" -#include "lite/backends/intelfpga/lldrv/utils.h" +#include "lite/backends/intel_fpga/lldrv/intelfpgadrv.h" +#include "lite/backends/intel_fpga/lldrv/utils.h" namespace paddle { namespace lite { namespace kernels { -namespace intelfpga { +namespace intel_fpga { template <> void GemmLikeConv::PrepareForRun() { @@ -67,7 +67,7 @@ void GemmLikeConv::Run() { if (kh > 1 && kw > 1) { int i, j, il, kl, ol, l, m, n, k; - lite::intelfpga::intelfpga_conv_s conv; + lite::intel_fpga::intel_fpga_conv_s conv; conv.at = static_cast(param.activation_param.active_type); if (conv.at == 4) { @@ -100,26 +100,26 @@ void GemmLikeConv::Run() { kl = conv.o.oc * conv.i.ic * conv.k.kh * conv.k.kw; ol = conv.o.on * conv.o.oc * conv.o.oh * conv.o.ow; conv.ia = static_cast( - lite::intelfpga::intelfpga_minput(il * sizeof(int8_t))); + lite::intel_fpga::intel_fpga_minput(il * sizeof(int8_t))); conv.ka = static_cast( - lite::intelfpga::intelfpga_mkernel(kl * sizeof(int8_t))); + lite::intel_fpga::intel_fpga_mkernel(kl * sizeof(int8_t))); conv.oa = static_cast( - lite::intelfpga::intelfpga_moutput(ol * sizeof(int32_t))); + lite::intel_fpga::intel_fpga_moutput(ol * sizeof(int32_t))); if (conv.ia && conv.ka && conv.oa) { - float fd = lite::intelfpga::find_max(i_data, il); - float fw = lite::intelfpga::find_max(w_data, kl); + float fd = lite::intel_fpga::find_max(i_data, il); + float fw = lite::intel_fpga::find_max(w_data, kl); fd = 127.0 / fd; fw = 127.0 / fw; // y = 127.0 / fmax // y = x * scale; - lite::intelfpga::quantize_s8(i_data, conv.ia, il, fd); - lite::intelfpga::quantize_s8(w_data, conv.ka, kl, fw); + lite::intel_fpga::quantize_s8(i_data, conv.ia, il, fd); + lite::intel_fpga::quantize_s8(w_data, conv.ka, kl, fw); // 
perform conv2d - if (lite::intelfpga::intelfpga_conv(&conv)) { - std::cout << "intelfpga_conv error" << std::endl; + if (lite::intel_fpga::intel_fpga_conv(&conv)) { + std::cout << "intel_fpga_conv error" << std::endl; } // Convert int32 back to fp32, [n,c,h,w] // 1. y = x / scale @@ -179,7 +179,7 @@ void GemmLikeConv::Run() { } } -} // namespace intelfpga +} // namespace intel_fpga } // namespace kernels } // namespace lite } // namespace paddle diff --git a/lite/kernels/intelfpga/conv_gemmlike.h b/lite/kernels/intel_fpga/conv_gemmlike.h similarity index 98% rename from lite/kernels/intelfpga/conv_gemmlike.h rename to lite/kernels/intel_fpga/conv_gemmlike.h index 812271010c7..338a711983c 100644 --- a/lite/kernels/intelfpga/conv_gemmlike.h +++ b/lite/kernels/intel_fpga/conv_gemmlike.h @@ -26,7 +26,7 @@ namespace paddle { namespace lite { namespace kernels { -namespace intelfpga { +namespace intel_fpga { template class GemmLikeConv : public KernelLite { @@ -106,7 +106,7 @@ class GemmLikeConv : public KernelLite { int workspace_size_{0}; }; -} // namespace intelfpga +} // namespace intel_fpga } // namespace kernels } // namespace lite } // namespace paddle diff --git a/lite/kernels/intelfpga/CMakeLists.txt b/lite/kernels/intelfpga/CMakeLists.txt deleted file mode 100755 index 4f2fbe6d5d2..00000000000 --- a/lite/kernels/intelfpga/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_INTELFPGA)) - return() -endif() - -set(intelfpga_deps intelfpga_target_wrapper kernel_intelfpga) - -add_kernel(conv_depthwise_intelfpga INTELFPGA basic SRCS conv_depthwise.cc DEPS ${intelfpga_deps}) -add_kernel(conv_gemmlike_intelfpga INTELFPGA basic SRCS conv_gemmlike.cc DEPS ${intelfpga_deps}) -add_kernel(conv_compute_intelfpga INTELFPGA basic SRCS conv_compute.cc DEPS ${intelfpga_deps} conv_depthwise_intelfpga conv_gemmlike_intelfpga) diff --git a/lite/tools/build_intel_fpga.sh b/lite/tools/build_intel_fpga.sh 
index ef647df315c..53b22b0b085 100755 --- a/lite/tools/build_intel_fpga.sh +++ b/lite/tools/build_intel_fpga.sh @@ -25,7 +25,7 @@ OPTMODEL_DIR="" # options of compiling OPENCL lib. WITH_OPENCL=OFF # options of compiling intel fpga. -WITH_INTELFPGA=ON +WITH_INTEL_FPGA=ON # options of adding training ops WITH_TRAIN=OFF # num of threads used during compiling.. @@ -59,7 +59,7 @@ function init_cmake_mutable_options { -DLITE_BUILD_TAILOR=$WITH_STRIP \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DLITE_WITH_OPENCL=$WITH_OPENCL \ - -DLITE_WITH_INTELFPGA=$WITH_INTELFPGA \ + -DLITE_WITH_INTEL_FPGA=$WITH_INTEL_FPGA \ -DLITE_WITH_TRAIN=$WITH_TRAIN" } ##################################################################################################### @@ -290,8 +290,8 @@ function main { shift ;; # compiling lib which can operate on intel fpga. - --with_intelfpga=*) - WITH_INTELFPGA="${i#*=}" + --with_intel_fpga=*) + WITH_INTEL_FPGA="${i#*=}" shift ;; # ON or OFF, default OFF From 9f0def80bdbadcb0892b592b5d447e88b5c7874f Mon Sep 17 00:00:00 2001 From: xbeu Date: Thu, 18 Mar 2021 03:18:27 +0000 Subject: [PATCH 05/19] test=develop --- lite/tools/build_intel_fpga.sh | 324 --------------------------------- lite/tools/build_linux.sh | 8 + 2 files changed, 8 insertions(+), 324 deletions(-) delete mode 100755 lite/tools/build_intel_fpga.sh diff --git a/lite/tools/build_intel_fpga.sh b/lite/tools/build_intel_fpga.sh deleted file mode 100755 index 53b22b0b085..00000000000 --- a/lite/tools/build_intel_fpga.sh +++ /dev/null @@ -1,324 +0,0 @@ -#!/bin/bash -set -e - -##################################################################################################### -# 1. global variables, you can change them according to your requirements -##################################################################################################### -# armv7hf. -ARCH=armv7hf -# gcc or clang, default gcc. -TOOLCHAIN=gcc -# ON or OFF, default OFF. 
-WITH_EXTRA=ON -# controls whether to compile python lib, default is OFF. -WITH_PYTHON=OFF -PY_VERSION="" -# controls whether to compile cv functions into lib, default is OFF. -WITH_CV=OFF -# controls whether to print log information, default is ON. -WITH_LOG=OFF -# controls whether to throw the exception when error occurs, default is OFF -WITH_EXCEPTION=OFF -# options of striping lib according to input model. -WITH_STRIP=OFF -OPTMODEL_DIR="" -# options of compiling OPENCL lib. -WITH_OPENCL=OFF -# options of compiling intel fpga. -WITH_INTEL_FPGA=ON -# options of adding training ops -WITH_TRAIN=OFF -# num of threads used during compiling.. -readonly NUM_PROC=${LITE_BUILD_THREADS:-4} -##################################################################################################### - -##################################################################################################### -# 2. local variables, these variables should not be changed. -##################################################################################################### -# url that stores third-party zip file to accelerate third-paty lib installation -readonly THIRDPARTY_TAR=https://paddlelite-data.bj.bcebos.com/third_party_libs/third-party-ea5576.tar.gz -# absolute path of Paddle-Lite. -readonly workspace=$PWD/$(dirname $0)/../../ -# basic options for linux compiling. -readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \ - -DLITE_WITH_ARM=ON \ - -DLITE_WITH_X86=OFF \ - -DARM_TARGET_OS=armlinux \ - -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ - -DWITH_TESTING=OFF" -# mutable options for linux compiling. 
-function init_cmake_mutable_options { - cmake_mutable_options="-DARM_TARGET_ARCH_ABI=$ARCH \ - -DARM_TARGET_LANG=$TOOLCHAIN \ - -DLITE_BUILD_EXTRA=$WITH_EXTRA \ - -DLITE_WITH_PYTHON=$WITH_PYTHON \ - -DPY_VERSION=$PY_VERSION \ - -DLITE_WITH_CV=$WITH_CV \ - -DLITE_WITH_LOG=$WITH_LOG \ - -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ - -DLITE_BUILD_TAILOR=$WITH_STRIP \ - -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ - -DLITE_WITH_OPENCL=$WITH_OPENCL \ - -DLITE_WITH_INTEL_FPGA=$WITH_INTEL_FPGA \ - -DLITE_WITH_TRAIN=$WITH_TRAIN" -} -##################################################################################################### - -#################################################################################################### -# 3. functions of prepare workspace before compiling -#################################################################################################### - -# 3.1 generate `__generated_code__.cc`, which is dependended by some targets in cmake. -# here we fake an empty file to make cmake works. -function prepare_workspace { - local root_dir=$1 - local build_dir=$2 - # in build directory - # 1. 
Prepare gen_code file - GEN_CODE_PATH_PREFIX=$build_dir/lite/gen_code - mkdir -p ${GEN_CODE_PATH_PREFIX} - touch ${GEN_CODE_PATH_PREFIX}/__generated_code__.cc - # 2.Prepare debug tool - DEBUG_TOOL_PATH_PREFIX=$build_dir/lite/tools/debug - mkdir -p ${DEBUG_TOOL_PATH_PREFIX} - cp $root_dir/lite/tools/debug/analysis_tool.py ${DEBUG_TOOL_PATH_PREFIX}/ -} - -# 3.2 prepare source code of opencl lib -# here we bundle all cl files into a cc file to bundle all opencl kernels into a single lib -function prepare_opencl_source_code { - local root_dir=$1 - local build_dir=$2 - # in build directory - # Prepare opencl_kernels_source.cc file - GEN_CODE_PATH_OPENCL=$root_dir/lite/backends/opencl - rm -f GEN_CODE_PATH_OPENCL/opencl_kernels_source.cc - OPENCL_KERNELS_PATH=$root_dir/lite/backends/opencl/cl_kernel - mkdir -p ${GEN_CODE_PATH_OPENCL} - touch $GEN_CODE_PATH_OPENCL/opencl_kernels_source.cc - python $root_dir/lite/tools/cmake_tools/gen_opencl_code.py $OPENCL_KERNELS_PATH $GEN_CODE_PATH_OPENCL/opencl_kernels_source.cc -} - -# 3.3 prepare third_party libraries for compiling -# here we store third_party libraries into Paddle-Lite/third-party -function prepare_thirdparty { - if [ ! -d $workspace/third-party -o -f $workspace/third-party-ea5576.tar.gz ]; then - rm -rf $workspace/third-party - if [ ! -f $workspace/third-party-ea5576.tar.gz ]; then - wget $THIRDPARTY_TAR - fi - tar xzf third-party-ea5576.tar.gz - else - git submodule update --init --recursive - fi -} -#################################################################################################### - -#################################################################################################### -# 4. 
compiling functions -#################################################################################################### - -# 4.1 function of tiny_publish compiling -# here we only compile light_api lib -function make_tiny_publish_so { - is_tiny=${1:-ON} - if [ "$WITH_PYTHON" = "ON" -a "$is_tiny" = "ON" ]; then - echo "Warning: build full_publish to use python." - is_tiny=OFF - fi - if [ "$WITH_TRAIN" = "ON" -a "$is_tiny" = "ON" ]; then - echo "Warning: build full_publish to add training ops." - is_tiny=OFF - fi - if [ "$BUILD_TAILOR" = "ON" -a "$OPTMODEL_DIR" = "" ]; then - echo "Error: set OPTMODEL_DIR if BUILD_TAILOR is ON." - fi - - if [ "$is_tiny" = "OFF" ]; then - prepare_thirdparty - fi - - build_dir=$workspace/build.lite.linux.$ARCH.$TOOLCHAIN - if [ "${WITH_OPENCL}" = "ON" ]; then - build_dir=${build_dir}.opencl - fi - - if [ -d $build_dir ]; then - rm -rf $build_dir - fi - mkdir -p $build_dir - cd $build_dir - - prepare_workspace $workspace $build_dir - - if [ "${WITH_OPENCL}" = "ON" ]; then - prepare_opencl_source_code $workspace $build_dir - fi - if [ "${WITH_STRIP}" == "ON" ]; then - WITH_EXTRA=ON - fi - - init_cmake_mutable_options - cmake $workspace \ - ${CMAKE_COMMON_OPTIONS} \ - ${cmake_mutable_options} \ - -DLITE_ON_TINY_PUBLISH=$is_tiny - - if [ "${WITH_OPENCL}" = "ON" ]; then - make opencl_clhpp -j$NUM_PROC - fi - - make publish_inference -j$NUM_PROC - cd - > /dev/null -} -#################################################################################################### - -# 4.2 function of full_publish compiling -# here we compile both light_api lib and full_api lib -function make_full_publish_so { - make_tiny_publish_so OFF -} -#################################################################################################### - -function print_usage { - echo "--------------------------------------------------------------------------------------------------------------------------------------------------------" - echo -e "| Methods of 
compiling Padddle-Lite Linux library: |" - echo "--------------------------------------------------------------------------------------------------------------------------------------------------------" - echo -e "| compile linux library: (armv8, gcc) |" - echo -e "| ./lite/tools/build_linux.sh |" - echo -e "| print help information: |" - echo -e "| ./lite/tools/build_linux.sh help |" - echo -e "| |" - echo -e "| optional argument: |" - echo -e "| --arch: (armv8|armv7hf|armv7), default is armv8 |" - echo -e "| --toolchain: (gcc|clang), defalut is gcc |" - echo -e "| --with_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP), default is OFF |" - echo -e "| --with_python: (OFF|ON); controls whether to build python lib or whl, default is OFF |" - echo -e "| --python_version: (2.7|3.5|3.7); controls python version to compile whl, default is None |" - echo -e "| --with_cv: (OFF|ON); controls whether to compile cv functions into lib, default is OFF |" - echo -e "| --with_log: (OFF|ON); controls whether to print log information, default is ON |" - echo -e "| --with_exception: (OFF|ON); controls whether to throw the exception when error occurs, default is OFF |" - echo -e "| |" - echo -e "| arguments of striping lib according to input model: |" - echo -e "| ./lite/tools/build_linux.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir |" - echo -e "| --with_strip: (OFF|ON); controls whether to strip lib accrding to input model, default is OFF |" - echo -e "| --opt_model_dir: (absolute path to optimized model dir) required when compiling striped library |" - echo -e "| detailed information about striping lib: https://paddle-lite.readthedocs.io/zh/latest/user_guides/library_tailoring.html |" - echo -e "| |" - echo -e "| arguments of opencl library compiling: |" - echo -e "| ./lite/tools/build_linux.sh --with_opencl=ON |" - echo -e "| --with_opencl: (OFF|ON); controls whether to compile lib for opencl, 
default is OFF |" - echo -e "| |" - echo -e "| arguments of rockchip npu library compiling: |" - echo -e "| ./lite/tools/build_linux.sh --with_rockchip_npu=ON --rockchip_npu_sdk_root=YourRockchipNpuSdkPath |" - echo -e "| --with_rockchip_npu: (OFF|ON); controls whether to compile lib for rockchip_npu, default is OFF |" - echo -e "| --rockchip_npu_sdk_root: (path to rockchip_npu DDK file) required when compiling rockchip_npu library |" - echo -e "| you can download rockchip NPU SDK from: https://github.com/airockchip/rknpu_ddk.git |" - echo -e "| detailed information about Paddle-Lite RKNPU: https://paddle-lite.readthedocs.io/zh/latest/demo_guides/rockchip_npu.html |" - echo -e "| |" - echo -e "| arguments of baidu xpu library compiling: |" - echo -e "| ./lite/tools/build_linux.sh --with_baidu_xpu=ON --baidu_xpu_sdk_root=YourBaiduXpuSdkPath |" - echo -e "| --with_baidu_xpu: (OFF|ON); controls whether to compile lib for baidu_xpu, default is OFF |" - echo -e "| --baidu_xpu_sdk_root: (path to baidu_xpu DDK file) required when compiling baidu_xpu library |" - echo "--------------------------------------------------------------------------------------------------------------------------------------------------------" - echo -} - -function main { - if [ -z "$1" ]; then - # compiling result contains light_api lib only, recommanded. - make_tiny_publish_so - exit 0 - fi - - # Parse command line. 
- for i in "$@"; do - case $i in - # armv8 or armv7hf or armv7, default armv8 - --arch=*) - ARCH="${i#*=}" - shift - ;; - # gcc or clang, default gcc - --toolchain=*) - TOOLCHAIN="${i#*=}" - shift - ;; - # ON or OFF, default OFF - --with_extra=*) - WITH_EXTRA="${i#*=}" - shift - ;; - # ON or OFF, default OFF - --with_python=*) - WITH_PYTHON="${i#*=}" - shift - ;; - # 2.7 or 3.5 or 3.7, default is None - --python_version=*) - PY_VERSION="${i#*=}" - shift - ;; - # ON or OFF, default OFF - --with_cv=*) - WITH_CV="${i#*=}" - shift - ;; - # ON or OFF, default ON - --with_log=*) - WITH_LOG="${i#*=}" - shift - ;; - # ON or OFF, default OFF - --with_exception=*) - WITH_EXCEPTION="${i#*=}" - shift - ;; - # ON or OFF, default OFF - --with_strip=*) - BUILD_TAILOR="${i#*=}" - shift - ;; - # string, absolute path to optimized model dir - --opt_model_dir=*) - OPTMODEL_DIR="${i#*=}" - shift - ;; - # compiling lib which can operate on opencl and cpu. - --with_opencl=*) - WITH_OPENCL="${i#*=}" - shift - ;; - # compiling lib which can operate on intel fpga. - --with_intel_fpga=*) - WITH_INTEL_FPGA="${i#*=}" - shift - ;; - # ON or OFF, default OFF - --with_train=*) - WITH_TRAIN="${i#*=}" - shift - ;; - # compiling result contains both light_api and cxx_api lib. - full_publish) - make_full_publish_so - exit 0 - ;; - # print help info - help) - print_usage - exit 0 - ;; - # unknown option - *) - echo "Error: unsupported argument \"${i#*=}\"" - print_usage - exit 1 - ;; - esac - done - # compiling result contains light_api lib only, recommanded. - make_tiny_publish_so -} - -main $@ diff --git a/lite/tools/build_linux.sh b/lite/tools/build_linux.sh index 68745c8ca34..0857df30bbd 100755 --- a/lite/tools/build_linux.sh +++ b/lite/tools/build_linux.sh @@ -33,6 +33,8 @@ IMAGINATION_NNA_SDK_ROOT="$(pwd)/imagination_nna_sdk" # options of compiling baidu XPU lib. WITH_BAIDU_XPU=OFF BAIDU_XPU_SDK_ROOT="" +# options of compiling intel fpga. 
+WITH_INTEL_FPGA=OFF # options of adding training ops WITH_TRAIN=OFF # num of threads used during compiling.. @@ -75,6 +77,7 @@ function init_cmake_mutable_options { -DXPU_SDK_ROOT=$BAIDU_XPU_SDK_ROOT \ -DLITE_WITH_TRAIN=$WITH_TRAIN \ -DLITE_WITH_IMAGINATION_NNA=$WITH_IMAGINATION_NNA \ + -DLITE_WITH_INTEL_FPGA=$WITH_INTEL_FPGA \ -DIMAGINATION_NNA_SDK_ROOT=${IMAGINATION_NNA_SDK_ROOT}" } @@ -341,6 +344,11 @@ function main { BAIDU_XPU_SDK_ROOT="${i#*=}" shift ;; + # compiling lib which can operate on intel fpga. + --with_intel_fpga=*) + WITH_INTEL_FPGA="${i#*=}" + shift + ;; # ON or OFF, default OFF --with_train=*) WITH_TRAIN="${i#*=}" From a9685c4c54ea0eeaaefed2d487d60dc97db35cd4 Mon Sep 17 00:00:00 2001 From: xbeu Date: Mon, 22 Mar 2021 09:12:50 +0000 Subject: [PATCH 06/19] test=develop --- CMakeLists.txt | 4 + cmake/device/intel_fpga.cmake | 48 +++++ lite/backends/intel_fpga/CMakeLists.txt | 20 +- .../intel_fpga/lldrv/intelfpgadrv.cpp | 192 ------------------ lite/backends/intel_fpga/lldrv/intelfpgadrv.h | 186 ----------------- lite/backends/intel_fpga/lldrv/utils.cpp | 72 ------- lite/backends/intel_fpga/lldrv/utils.h | 33 --- lite/backends/intel_fpga/target_wrapper.cpp | 7 +- lite/backends/intel_fpga/target_wrapper.h | 1 + lite/kernels/intel_fpga/CMakeLists.txt | 6 +- lite/tools/build_linux.sh | 14 +- 11 files changed, 70 insertions(+), 513 deletions(-) create mode 100644 cmake/device/intel_fpga.cmake delete mode 100644 lite/backends/intel_fpga/lldrv/intelfpgadrv.cpp delete mode 100644 lite/backends/intel_fpga/lldrv/intelfpgadrv.h delete mode 100644 lite/backends/intel_fpga/lldrv/utils.cpp delete mode 100644 lite/backends/intel_fpga/lldrv/utils.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 12deaf69752..2960fb0b44d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -185,6 +185,10 @@ if(LITE_WITH_IMAGINATION_NNA) include(device/imagination_nna) endif() +if(LITE_WITH_INTEL_FPGA) + include(device/intel_fpga) +endif() + # flatbuffer module for loading model 
if(LITE_UPDATE_FBS_HEAD) include(external/flatbuffers) diff --git a/cmake/device/intel_fpga.cmake b/cmake/device/intel_fpga.cmake new file mode 100644 index 00000000000..498f58bfbdc --- /dev/null +++ b/cmake/device/intel_fpga.cmake @@ -0,0 +1,48 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT LITE_WITH_INTEL_FPGA) + return() +endif() + +if(NOT DEFINED INTEL_FPGA_SDK_ROOT) + set(INTEL_FPGA_SDK_ROOT $ENV{INTEL_FPGA_SDK_ROOT}) + if(NOT INTEL_FPGA_SDK_ROOT) + message(FATAL_ERROR "Must set INTEL_FPGA_SDK_ROOT or env INTEL_FPGA_SDK_ROOT when LITE_WITH_INTEL_FPGA=ON") + endif() +endif() + +message(STATUS "INTEL_FPGA_SDK_ROOT: ${INTEL_FPGA_SDK_ROOT}") + +set(INTEL_FPGA_SDK_INC "${INTEL_FPGA_SDK_ROOT}/include") +set(INTEL_FPGA_SDK_LIB "${INTEL_FPGA_SDK_ROOT}/lib/libvnna.so") + +include_directories("${INTEL_FPGA_SDK_INC}") + +find_library(INTEL_FPGA_LIB_FILE NAMES vnna + PATHS ${INTEL_FPGA_SDK_ROOT}/lib) + +if(NOT INTEL_FPGA_LIB_FILE) + message(FATAL_ERROR "Can not find INTEL_FPGA_LIB_FILE in ${INTEL_FPGA_SDK_ROOT}") +else() + message(STATUS "Found INTEL_FPGA VNNA Library: ${INTEL_FPGA_LIB_FILE}") + add_library(vnna SHARED IMPORTED) + set_property(TARGET vnna PROPERTY IMPORTED_LOCATION ${INTEL_FPGA_LIB_FILE}) +endif() + +#link_directories("${INTEL_FPGA_SDK_ROOT}/lib") +#add_library(vnna SHARED IMPORTED GLOBAL) +#set_property(TARGET vnna PROPERTY IMPORTED_LOCATION ${INTEL_FPGA_SDK_LIB}) + 
+set(intel_fpga_runtime_libs vnna CACHE INTERNAL "intel fpga sdk runtime libs") diff --git a/lite/backends/intel_fpga/CMakeLists.txt b/lite/backends/intel_fpga/CMakeLists.txt index c47a33be007..24a8044d240 100644 --- a/lite/backends/intel_fpga/CMakeLists.txt +++ b/lite/backends/intel_fpga/CMakeLists.txt @@ -2,22 +2,4 @@ if (NOT LITE_WITH_INTEL_FPGA) return() endif() -set(LITE_INTEL_FPGA_PATH "${PADDLE_SOURCE_DIR}/lite/backends/intel_fpga") -set(LITE_INTEL_FPGA_LLDRV_PATH "${PADDLE_SOURCE_DIR}/lite/backends/intel_fpga/lldrv") - -message("intel_fpga_path ${LITE_INTEL_FPGA_PATH}") -file(GLOB INTEL_FPGA_CPP "${LITE_INTEL_FPGA_PATH}/*.cpp") -file(GLOB LLDRV_CPP "${LITE_INTEL_FPGA_LLDRV_PATH}/*.cpp") -message("intel_fpga cpp: ${INTEL_FPGA_CPP}") -set(INTEL_FPGA_ALL_CPP "") -FOREACH(FILE_PATH ${LLDRV_CPP}) - STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH}) - list(APPEND INTEL_FPGA_ALL_CPP lldrv/${FILE_NAME}) -ENDFOREACH(FILE_PATH) -FOREACH(FILE_PATH ${INTELFPGA_CPP}) - STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH}) - list(APPEND INTEL_FPGA_ALL_CPP ${FILE_NAME}) -ENDFOREACH(FILE_PATH) -message("intel_fpga src: ${INTEL_FPGA_ALL_CPP}") -cc_library(kernel_intel_fpga SRCS ${INTEL_FPGA_ALL_CPP}) -cc_library(intel_fpga_target_wrapper SRCS target_wrapper.cpp DEPS kernel_intel_fpga) +lite_cc_library(intel_fpga_target_wrapper SRCS target_wrapper.cpp DEPS ${intel_fpga_runtime_libs}) diff --git a/lite/backends/intel_fpga/lldrv/intelfpgadrv.cpp b/lite/backends/intel_fpga/lldrv/intelfpgadrv.cpp deleted file mode 100644 index cc188e483cf..00000000000 --- a/lite/backends/intel_fpga/lldrv/intelfpgadrv.cpp +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright (c) 2020 AWCloud. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "lite/backends/intel_fpga/lldrv/intelfpgadrv.h" - -namespace paddle { -namespace lite { -namespace intel_fpga { - -/// FD of intel_fpga -static int intel_fpga_fd = -1; - -/// Memory blocks -static struct intel_fpga_memblk_s mb, ms, mi, mk, mo; - -int intel_fpga_open() { - if (intel_fpga_fd < 0) { - intel_fpga_fd = open("/dev/intelfpgadrv0", O_RDWR); - if (intel_fpga_fd < 0) { - return -1; - } - memset(&mb, 0, sizeof(mb)); - memset(&ms, 0, sizeof(ms)); - memset(&mi, 0, sizeof(mi)); - memset(&mk, 0, sizeof(mk)); - memset(&mo, 0, sizeof(mo)); - } - - return 0; -} - -void intel_fpga_close() { - if (intel_fpga_fd < 0) return; - - if (mb.addr) { - free(mb.addr); - } - if (ms.addr) { - free(ms.addr); - } - if (mi.addr) { - free(mi.addr); - } - if (mk.addr) { - free(mk.addr); - } - if (mo.addr) { - free(mo.addr); - } - close(intel_fpga_fd); - intel_fpga_fd = -1; -} - -/// memory management; -void* intel_fpga_malloc(size_t size) { return malloc(size); } - -void intel_fpga_free(void* ptr) { free(ptr); } - -void* intel_fpga_mbias(size_t size) { - if (mb.addr) { - if (mb.size >= size) { - return mb.addr; - } - free(mb.addr); - } - mb.addr = malloc(size); - if (mb.addr) { - mb.size = size; - } - return mb.addr; -} - -void* intel_fpga_mscale(size_t size) { - if (ms.addr) { - if (ms.size >= size) { - return ms.addr; - } - free(ms.addr); - } - ms.addr = malloc(size); - if (ms.addr) { - ms.size = size; - } - - return ms.addr; -} - -void* intel_fpga_minput(size_t 
size) { - if (mi.addr) { - if (mi.size >= size) { - return mi.addr; - } - free(mi.addr); - } - mi.addr = malloc(size); - if (mi.addr) { - mi.size = size; - } - - return mi.addr; -} - -void* intel_fpga_mkernel(size_t size) { - if (mk.addr) { - if (mk.size >= size) { - return mk.addr; - } - free(mk.addr); - } - mk.addr = malloc(size); - if (mk.addr) { - mk.size = size; - } - - return mk.addr; -} - -void* intel_fpga_moutput(size_t size) { - if (mo.addr) { - if (mo.size >= size) { - return mo.addr; - } - free(mo.addr); - } - mo.addr = malloc(size); - if (mo.addr) { - mo.size = size; - } - - return mo.addr; -} - -void intel_fpga_copy(void* dst, void* src, int size) { memcpy(dst, src, size); } - -int intel_fpga_info(struct intel_fpga_info_s* args) { - int cmd = INTEL_FPGA_IOCTL_MAKE(INTEL_FPGA_CMD_INFO); - - if (intel_fpga_open()) return -1; - - return ioctl(intel_fpga_fd, cmd, args); -} - -int intel_fpga_conv(struct intel_fpga_conv_s* args) { - int cmd = INTEL_FPGA_IOCTL_MAKE(INTEL_FPGA_CMD_CONV); - - if (intel_fpga_open()) return -1; - - return ioctl(intel_fpga_fd, cmd, args); -} - -int intel_fpga_pooling(struct intel_fpga_pool_s* args) { - int cmd = INTEL_FPGA_IOCTL_MAKE(INTEL_FPGA_CMD_POOL); - - if (intel_fpga_open()) return -1; - - return ioctl(intel_fpga_fd, cmd, args); -} - -int intel_fpga_fullconnect(struct intel_fpga_fcon_s* args) { - int cmd = INTEL_FPGA_IOCTL_MAKE(INTEL_FPGA_CMD_FCON); - - if (intel_fpga_open()) return -1; - - return ioctl(intel_fpga_fd, cmd, args); -} - -} // namespace intel_fpga -} // namespace lite -} // namespace paddle diff --git a/lite/backends/intel_fpga/lldrv/intelfpgadrv.h b/lite/backends/intel_fpga/lldrv/intelfpgadrv.h deleted file mode 100644 index 0a162e7af9d..00000000000 --- a/lite/backends/intel_fpga/lldrv/intelfpgadrv.h +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2020 AWCloud. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _LLDRV_INTEL_FPGA_H_ -#define _LLDRV_INTEL_FPGA_H_ - -#pragma once - -#include -#include -#include -#include - -namespace paddle { -namespace lite { -namespace intel_fpga { - -// Activation type -enum intel_fpga_act_e { - ACT_NONE = 0, - ACT_RELU = 1, -}; - -// Device information -struct intel_fpga_info_s { - uint32_t ver; // Version, 00.00.0000 -}; - -struct intel_fpga_reset_s { - uint32_t val; // reset command, N/A -}; - -// Memory copy -struct intel_fpga_mcopy_s { - void* src; // source address - void* dst; // destination adddress - size_t size; // size in bytes -}; - -// Memory block -struct intel_fpga_memblk_s { - void* addr; // base address - size_t size; // size in bytes -}; - -// Kernel -struct intel_fpga_kernel_s { - uint32_t kw; // width - uint32_t kh; // height - uint32_t ws; // width stride(s) - uint32_t hs; // height stride(s) -}; - -// Input parameters, nchw -struct intel_fpga_input_s { - uint32_t in; // nbr of batch {1} - uint32_t ic; // nbr of channels {1} - uint32_t iw; // width - uint32_t ih; // height - uint32_t pl; // padding x in bytes {0} - uint32_t pr; // padding x in bytes {0} - uint32_t pt; // padding y in bytes {0} - uint32_t pb; // padding y in bytes {0} - uint32_t dx; // dilation for x {1} - uint32_t dy; // dilation for y {1} -}; - -// Output parameters, nchw -struct intel_fpga_output_s { - uint32_t on; // nbr of batch {1} - uint32_t oc; // nbr of channels {1} - uint32_t ow; // width - uint32_t oh; 
// height -}; - -// Basic convolution -struct intel_fpga_conv_s { - uint32_t at; // activation type {0}, None=0, RELU=1 - uint32_t ng; // nbr of groups {1} - int8_t* ia; // input address, INT8[N,Ci,Hi,Wi] - int8_t* ka; // kernel address, INT32[Co,Ci,Hk,Wk] - int32_t* ba; // bias address, INT32[Co,1] - int32_t* oa; // output address, INT32[N,Co,Ho,Wo] - struct intel_fpga_input_s i; // input - struct intel_fpga_kernel_s k; // kernel - struct intel_fpga_output_s o; // output -}; - -// Pooling convolution -struct intel_fpga_pool_s { - uint32_t gp : 1; // global pooling {0} - uint32_t pm : 1; // pooling mode {0}, Max=0, AVG=1 - uint32_t cm : 1; // ceil mode {0}, ceil=0, floor=1 - uint32_t ex : 1; // exclusive {1}, if ignore padding in avg pooling - uint32_t reserved : 28; // reserved {0} - int32_t* ia; // input address, INT32[N,Ci,Hi,Wi] - int32_t* oa; // output address, INT32[N,Ci,Ho,Wo] - struct intel_fpga_input_s i; // input - struct intel_fpga_kernel_s k; // kernel - struct intel_fpga_output_s o; // output -}; - -// Full connection -struct intel_fpga_fcon_s { - uint32_t at; // activation type {0}, None=0, RELU=1 - int8_t* ia; // input address, INT8[M,K] - int8_t* ka; // kernel address, INT8[K,N] - int32_t* ba; // bias address, INT32[M,N] - int32_t* oa; // output address, INT32[M,N] = ia[M,K] * wa[K,N] + ba[M,N] - int m, n, k; // dims -}; - -// Regisger access -struct intel_fpga_creg_s { - uint32_t addr; - uint32_t data; -}; - -#define INTEL_FPGA_MAGIC_ID (('A' + 'L' + 'T' + 'R') / 4) - -/* Ioctls */ -#define INTEL_FPGA_IOCTL_MAKE(cmd) (_IO(INTEL_FPGA_MAGIC_ID, cmd)) -#define INTEL_FPGA_IOCTL_GET(cmd) (_IOC_NR(cmd)) -#define INTEL_FPGA_IOCTL_VALID(cmd) \ - ((_IOC_TYPE(cmd) == INTEL_FPGA_MAGIC_ID) ? 
1 : 0) - -#define INTEL_FPGA_CMD_INFO 0x00 // struct intel_fpga_info_s -#define INTEL_FPGA_CMD_RESET 0x01 // struct intel_fpga_reset_s - -#define INTEL_FPGA_CMD_MCOPY 0x10 // struct intel_fpga_mcopy_s -#define INTEL_FPGA_CMD_INVAL 0x11 // struct intel_fpga_cache_s -#define INTEL_FPGA_CMD_FLUSH 0x12 // struct intel_fpga_cache_s - -#define INTEL_FPGA_CMD_CONV 0x20 // struct intel_fpga_conv_s -#define INTEL_FPGA_CMD_POOL 0x21 // struct intel_fpga_pool_s -#define INTEL_FPGA_CMD_FCON 0x22 // struct intel_fpga_fcon_s - -#define INTEL_FPGA_CMD_REGRD 0xC0 // struct intel_fpga_register_s -#define INTEL_FPGA_CMD_REGWR 0xC1 // struct intel_fpga_register_s - -//--------------------------------------------------------------------------- - -// device open/close -int intel_fpga_open(); -void intel_fpga_close(); - -void intel_fpga_reset(struct intel_fpga_reset_s* args); - -// memory management -void* intel_fpga_malloc(size_t size); -void intel_fpga_free(void* ptr); - -void* intel_fpga_mbias(size_t size); -void* intel_fpga_mscale(size_t size); -void* intel_fpga_minput(size_t size); -void* intel_fpga_mkernel(size_t size); -void* intel_fpga_moutput(size_t size); - -void intel_fpga_copy(void* dst, void* src, int size); -int intel_fpga_flush(void* addr, size_t size); -int intel_fpga_invalidate(void* addr, size_t size); - -// device information -int intel_fpga_info(struct intel_fpga_info_s* args); - -// convolution process -int intel_fpga_conv(struct intel_fpga_conv_s* args); -int intel_fpga_pooling(struct intel_fpga_pool_s* args); -int intel_fpga_fullconnect(struct intel_fpga_fcon_s* args); - -} // namespace intel_fpga -} // namespace lite -} // namespace paddle - -#endif // _LLDRV_INTEL_FPGA_H_ diff --git a/lite/backends/intel_fpga/lldrv/utils.cpp b/lite/backends/intel_fpga/lldrv/utils.cpp deleted file mode 100644 index 380e79e4d31..00000000000 --- a/lite/backends/intel_fpga/lldrv/utils.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2020 AWCloud. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include - -#include "lite/backends/intel_fpga/lldrv/utils.h" - -namespace paddle { -namespace lite { -namespace intel_fpga { - -float find_max(const float* data, int size) { - float max = 0.0; - - for (size_t i = 0; i < size; ++i) { - float value = data[i]; - float abs = value > 0.0 ? value : -value; - - max = std::max(max, abs); - } - - return max; -} - -void quantize_s8(const float* src, int8_t* dst, int size, float factor) { - float fdata; - - for (size_t i = 0; i < size; i++) { - fdata = src[i] * factor; - - if (fdata < 0.0) { - fdata -= 0.5; - } else { - fdata += 0.5; - } - - dst[i] = (int8_t)fdata; - } -} - -void quantize_s32(const float* src, int32_t* dst, int size, float factor) { - float fdata; - - for (size_t i = 0; i < size; i++) { - fdata = src[i] * factor; - - if (fdata < 0.0) { - fdata -= 0.5; - } else { - fdata += 0.5; - } - - dst[i] = (int32_t)fdata; - } -} -} // namespace intel_fpga -} // namespace lite -} // namespace paddle diff --git a/lite/backends/intel_fpga/lldrv/utils.h b/lite/backends/intel_fpga/lldrv/utils.h deleted file mode 100644 index ad8e403afd8..00000000000 --- a/lite/backends/intel_fpga/lldrv/utils.h +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2020 AWCloud. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -namespace paddle { -namespace lite { -namespace intel_fpga { - -float find_max(const float* data, int size); - -void quantize_s8(const float* src, int8_t* dst, int size, float factor); -void quantize_s32(const float* src, int32_t* dst, int size, float factor); - -} // namespace intel_fpga -} // namespace lite -} // namespace paddle diff --git a/lite/backends/intel_fpga/target_wrapper.cpp b/lite/backends/intel_fpga/target_wrapper.cpp index 0d567016c91..89d6bee61d5 100644 --- a/lite/backends/intel_fpga/target_wrapper.cpp +++ b/lite/backends/intel_fpga/target_wrapper.cpp @@ -13,19 +13,16 @@ // limitations under the License. 
#include "lite/backends/intel_fpga/target_wrapper.h" -#include "lite/backends/intel_fpga/lldrv/intelfpgadrv.h" #include "lite/utils/all.h" namespace paddle { namespace lite { void* TargetWrapper::Malloc(size_t size) { - return intel_fpga::intel_fpga_malloc(size); + return intelfpga_malloc(size); } -void TargetWrapper::Free(void* ptr) { - intel_fpga::intel_fpga_free(ptr); -} +void TargetWrapper::Free(void* ptr) { intelfpga_free(ptr); } void TargetWrapper::MemcpySync(void* dst, const void* src, diff --git a/lite/backends/intel_fpga/target_wrapper.h b/lite/backends/intel_fpga/target_wrapper.h index ee60348f10f..e91bc7c5f6e 100644 --- a/lite/backends/intel_fpga/target_wrapper.h +++ b/lite/backends/intel_fpga/target_wrapper.h @@ -15,6 +15,7 @@ #pragma once #include +#include "intelfpga.h" // NOLINT #include "lite/core/target_wrapper.h" namespace paddle { diff --git a/lite/kernels/intel_fpga/CMakeLists.txt b/lite/kernels/intel_fpga/CMakeLists.txt index 276f4cb7e54..f7747dddeb6 100755 --- a/lite/kernels/intel_fpga/CMakeLists.txt +++ b/lite/kernels/intel_fpga/CMakeLists.txt @@ -2,8 +2,10 @@ if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_W return() endif() -set(intel_fpga_deps intel_fpga_target_wrapper kernel_intel_fpga) +set(intel_fpga_deps intel_fpga_target_wrapper) + +#lite_cc_library(kernel_intel_fpga_vnna SRCS conv_depthwise.cc conv_gemmlike.cc DEPS ${intel_fpga_runtime_libs}) add_kernel(conv_depthwise_intel_fpga INTEL_FPGA basic SRCS conv_depthwise.cc DEPS ${intel_fpga_deps}) -add_kernel(conv_gemmlike_intel_fpga INTEL_FPGA basic SRCS conv_gemmlike.cc DEPS ${intel_fpga_deps}) +add_kernel(conv_gemmlike_intel_fpga INTEL_FPGA basic SRCS conv_gemmlike.cc DEPS ${intel_fpga_deps} ${intel_fpga_runtime_libs}) add_kernel(conv_compute_intel_fpga INTEL_FPGA basic SRCS conv_compute.cc DEPS ${intel_fpga_deps} conv_depthwise_intel_fpga conv_gemmlike_intel_fpga) diff --git a/lite/tools/build_linux.sh b/lite/tools/build_linux.sh index 
0857df30bbd..0a62f90793a 100755 --- a/lite/tools/build_linux.sh +++ b/lite/tools/build_linux.sh @@ -5,11 +5,11 @@ set -e # 1. global variables, you can change them according to your requirements ##################################################################################################### # armv8 or armv7hf or armv7, default armv8. -ARCH=armv8 +ARCH=armv7hf # gcc or clang, default gcc. TOOLCHAIN=gcc # ON or OFF, default OFF. -WITH_EXTRA=OFF +WITH_EXTRA=ON # controls whether to compile python lib, default is OFF. WITH_PYTHON=OFF PY_VERSION="" @@ -34,7 +34,8 @@ IMAGINATION_NNA_SDK_ROOT="$(pwd)/imagination_nna_sdk" WITH_BAIDU_XPU=OFF BAIDU_XPU_SDK_ROOT="" # options of compiling intel fpga. -WITH_INTEL_FPGA=OFF +WITH_INTEL_FPGA=ON +INTEL_FPGA_SDK_ROOT="$(pwd)/intelfpga_sdk" # options of adding training ops WITH_TRAIN=OFF # num of threads used during compiling.. @@ -77,8 +78,9 @@ function init_cmake_mutable_options { -DXPU_SDK_ROOT=$BAIDU_XPU_SDK_ROOT \ -DLITE_WITH_TRAIN=$WITH_TRAIN \ -DLITE_WITH_IMAGINATION_NNA=$WITH_IMAGINATION_NNA \ + -DIMAGINATION_NNA_SDK_ROOT=${IMAGINATION_NNA_SDK_ROOT} \ -DLITE_WITH_INTEL_FPGA=$WITH_INTEL_FPGA \ - -DIMAGINATION_NNA_SDK_ROOT=${IMAGINATION_NNA_SDK_ROOT}" + -DINTEL_FPGA_SDK_ROOT=${INTEL_FPGA_SDK_ROOT}" } ##################################################################################################### @@ -349,6 +351,10 @@ function main { WITH_INTEL_FPGA="${i#*=}" shift ;; + --intel_fpga_sdk_root=*) + INTEL_FPGA_SDK_ROOT="${i#*=}" + shift + ;; # ON or OFF, default OFF --with_train=*) WITH_TRAIN="${i#*=}" From 4c3196ef6deebeb696ef6b536fb3a1b9ad951fb8 Mon Sep 17 00:00:00 2001 From: xbeu Date: Tue, 23 Mar 2021 12:42:10 +0000 Subject: [PATCH 07/19] test=develop --- cmake/device/intel_fpga.cmake | 23 +++-- lite/CMakeLists.txt | 4 + lite/api/CMakeLists.txt | 11 ++- lite/backends/CMakeLists.txt | 2 +- lite/kernels/intel_fpga/CMakeLists.txt | 10 +- lite/kernels/intel_fpga/conv_gemmlike.cc | 119 ++++++++--------------- 
lite/kernels/intel_fpga/conv_gemmlike.h | 1 + 7 files changed, 71 insertions(+), 99 deletions(-) diff --git a/cmake/device/intel_fpga.cmake b/cmake/device/intel_fpga.cmake index 498f58bfbdc..afde0d8796c 100644 --- a/cmake/device/intel_fpga.cmake +++ b/cmake/device/intel_fpga.cmake @@ -25,24 +25,23 @@ endif() message(STATUS "INTEL_FPGA_SDK_ROOT: ${INTEL_FPGA_SDK_ROOT}") -set(INTEL_FPGA_SDK_INC "${INTEL_FPGA_SDK_ROOT}/include") -set(INTEL_FPGA_SDK_LIB "${INTEL_FPGA_SDK_ROOT}/lib/libvnna.so") +find_path(INTEL_FPGA_SDK_INC NAMES intelfpga.h + PATHS ${INTEL_FPGA_SDK_ROOT}/include NO_DEFAULT_PATH) +if (NOT INTEL_FPGA_SDK_INC) + message(FATAL_ERROR "Can not find intelfpga.h in ${INTEL_FPGA_SDK_INC}/include") +endif() include_directories("${INTEL_FPGA_SDK_INC}") -find_library(INTEL_FPGA_LIB_FILE NAMES vnna +find_library(INTEL_FPGA_SDK_LIB NAMES vnna PATHS ${INTEL_FPGA_SDK_ROOT}/lib) -if(NOT INTEL_FPGA_LIB_FILE) - message(FATAL_ERROR "Can not find INTEL_FPGA_LIB_FILE in ${INTEL_FPGA_SDK_ROOT}") +if(NOT INTEL_FPGA_SDK_LIB) + message(FATAL_ERROR "Can not find INTEL_FPGA_LIB_FILE in ${INTEL_FPGA_SDK_ROOT}/lib") else() - message(STATUS "Found INTEL_FPGA VNNA Library: ${INTEL_FPGA_LIB_FILE}") - add_library(vnna SHARED IMPORTED) - set_property(TARGET vnna PROPERTY IMPORTED_LOCATION ${INTEL_FPGA_LIB_FILE}) + message(STATUS "Found INTEL_FPGA_SDK Library: ${INTEL_FPGA_SDK_LIB}") + add_library(vnna SHARED IMPORTED GLOBAL) + set_property(TARGET vnna PROPERTY IMPORTED_LOCATION ${INTEL_FPGA_SDK_LIB}) endif() -#link_directories("${INTEL_FPGA_SDK_ROOT}/lib") -#add_library(vnna SHARED IMPORTED GLOBAL) -#set_property(TARGET vnna PROPERTY IMPORTED_LOCATION ${INTEL_FPGA_SDK_LIB}) - set(intel_fpga_runtime_libs vnna CACHE INTERNAL "intel fpga sdk runtime libs") diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 732cd2d29d3..4a4f8af5848 100755 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -11,6 +11,7 @@ message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") message(STATUS 
"LITE_WITH_APU:\t${LITE_WITH_APU}") message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") +message(STATUS "LITE_WITH_INTEL_FPGA:\t${LITE_WITH_INTEL_FPGA}") message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}") message(STATUS "LITE_WITH_HUAWEI_ASCEND_NPU:\t${LITE_WITH_HUAWEI_ASCEND_NPU}") message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}") @@ -133,6 +134,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (LITE_WITH_IMAGINATION_NNA) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.nna") endif(LITE_WITH_IMAGINATION_NNA) + if (LITE_WITH_INTEL_FPGA) + set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.intelfpga") + endif(LITE_WITH_INTEL_FPGA) else() set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib") endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 64b68cc0c02..ee2a40cb691 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -37,7 +37,6 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH if(LITE_WITH_CV) target_link_libraries(paddle_full_api_shared "-Wl,--whole-archive" paddle_cv_arm "-Wl,--no-whole-archive") endif(LITE_WITH_CV) - #light api dynamic library lite_cc_library(paddle_light_api_shared SHARED SRCS paddle_api.cc light_api.cc light_api_impl.cc DEPS ${light_lib_DEPS} @@ -47,6 +46,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH APU_DEPS ${apu_kernels} RKNPU_DEPS ${rknpu_kernels} IMAGINATION_NNA_DEPS ${imagination_nna_kernels} + INTEL_FPGA_DEPS ${intel_fpga_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} ) @@ -177,10 +177,6 @@ if(LITE_WITH_FPGA) set(light_api_deps ${light_api_deps} ${fpga_deps}) set(cxx_api_deps ${cxx_api_deps} ${fpga_deps}) endif() -if(LITE_WITH_INTEL_FPGA) - set(light_api_deps ${light_api_deps} ${intel_fpga_deps}) - set(cxx_api_deps ${cxx_api_deps} ${intel_fpga_deps}) -endif() if(LITE_WITH_BM) set(light_api_deps ${light_api_deps} 
${bm_deps}) set(cxx_api_deps ${cxx_api_deps} ${bm_deps}) @@ -196,6 +192,11 @@ if(LITE_WITH_IMAGINATION_NNA) set(cxx_api_deps ${cxx_api_deps} ${imagination_nna_deps}) endif() +if(LITE_WITH_INTEL_FPGA) + set(light_api_deps ${light_api_deps} ${intel_fpga_deps}) + set(cxx_api_deps ${cxx_api_deps} ${intel_fpga_deps}) +endif() + if(LITE_WITH_HUAWEI_ASCEND_NPU) set(light_api_deps ${light_api_deps} ${huawei_ascend_npu_deps}) set(cxx_api_deps ${cxx_api_deps} ${huawei_ascend_npu_deps}) diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt index 7c05e6138f1..c15d07c0904 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -12,4 +12,4 @@ add_subdirectory(apu) add_subdirectory(rknpu) add_subdirectory(huawei_ascend_npu) add_subdirectory(imagination_nna) -add_subdirectory(intel_fpga) +#add_subdirectory(intel_fpga) diff --git a/lite/kernels/intel_fpga/CMakeLists.txt b/lite/kernels/intel_fpga/CMakeLists.txt index f7747dddeb6..748a6f08c9d 100755 --- a/lite/kernels/intel_fpga/CMakeLists.txt +++ b/lite/kernels/intel_fpga/CMakeLists.txt @@ -2,10 +2,12 @@ if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_W return() endif() -set(intel_fpga_deps intel_fpga_target_wrapper) +set(intel_fpga_deps ${lite_kernel_deps} ${intel_fpga_runtime_libs}) -#lite_cc_library(kernel_intel_fpga_vnna SRCS conv_depthwise.cc conv_gemmlike.cc DEPS ${intel_fpga_runtime_libs}) +#lite_cc_library(dwconv_intel_fpga SRCS conv_depthwise.cc DEPS ${lite_kernel_deps}) +#lite_cc_library(gmconv_intel_fpga SRCS conv_gemmlike.cc DEPS ${lite_kernel_deps}) +#set(conv_intel_fpga ${intel_fpga_runtime_libs} dwconv_intel_fpga gmconv_intel_fpga) add_kernel(conv_depthwise_intel_fpga INTEL_FPGA basic SRCS conv_depthwise.cc DEPS ${intel_fpga_deps}) -add_kernel(conv_gemmlike_intel_fpga INTEL_FPGA basic SRCS conv_gemmlike.cc DEPS ${intel_fpga_deps} ${intel_fpga_runtime_libs}) -add_kernel(conv_compute_intel_fpga INTEL_FPGA basic SRCS conv_compute.cc DEPS 
${intel_fpga_deps} conv_depthwise_intel_fpga conv_gemmlike_intel_fpga) +add_kernel(conv_gemmlike_intel_fpga INTEL_FPGA basic SRCS conv_gemmlike.cc DEPS ${intel_fpga_deps}) +add_kernel(conv_compute_intel_fpga INTEL_FPGA basic SRCS conv_compute.cc DEPS ${intel_fpga_deps} ${conv_depthwise_intel_fpga} ${conv_gemmlike_intel_fpga}) diff --git a/lite/kernels/intel_fpga/conv_gemmlike.cc b/lite/kernels/intel_fpga/conv_gemmlike.cc index bc9b6f68014..849dabc3dcf 100644 --- a/lite/kernels/intel_fpga/conv_gemmlike.cc +++ b/lite/kernels/intel_fpga/conv_gemmlike.cc @@ -16,8 +16,6 @@ #include #include "lite/backends/arm/math/gemm_prepacked_int8.h" #include "lite/backends/arm/math/packed_sgemm.h" -#include "lite/backends/intel_fpga/lldrv/intelfpgadrv.h" -#include "lite/backends/intel_fpga/lldrv/utils.h" namespace paddle { namespace lite { @@ -67,84 +65,51 @@ void GemmLikeConv::Run() { if (kh > 1 && kw > 1) { int i, j, il, kl, ol, l, m, n, k; - lite::intel_fpga::intel_fpga_conv_s conv; + intelfpga_conv2d_s conv; conv.at = static_cast(param.activation_param.active_type); - if (conv.at == 4) { - alpha = param.activation_param.Leaky_relu_alpha; - } conv.ng = param.groups; - - conv.i.in = i_dims[0]; - conv.i.ic = i_dims[1]; - conv.i.ih = i_dims[2]; - conv.i.iw = i_dims[3]; - conv.i.pl = paddings[2]; // left - conv.i.pr = paddings[3]; // right - conv.i.pt = paddings[0]; // top - conv.i.pb = paddings[1]; // bottom - conv.i.dy = dilations[0]; - conv.i.dx = dilations[1]; - - conv.k.kh = w_dims[2]; - conv.k.kw = w_dims[3]; - conv.k.hs = param.strides[0]; - conv.k.ws = param.strides[1]; - - conv.o.on = o_dims[0]; - conv.o.oc = o_dims[1]; - conv.o.oh = o_dims[2]; - conv.o.ow = o_dims[3]; - - il = conv.i.in * conv.i.ic * conv.i.ih * conv.i.iw; - kl = conv.o.oc * conv.i.ic * conv.k.kh * conv.k.kw; - ol = conv.o.on * conv.o.oc * conv.o.oh * conv.o.ow; - conv.ia = static_cast( - lite::intel_fpga::intel_fpga_minput(il * sizeof(int8_t))); - conv.ka = static_cast( - 
lite::intel_fpga::intel_fpga_mkernel(kl * sizeof(int8_t))); - conv.oa = static_cast( - lite::intel_fpga::intel_fpga_moutput(ol * sizeof(int32_t))); - if (conv.ia && conv.ka && conv.oa) { - float fd = lite::intel_fpga::find_max(i_data, il); - float fw = lite::intel_fpga::find_max(w_data, kl); - - fd = 127.0 / fd; - fw = 127.0 / fw; - - // y = 127.0 / fmax - // y = x * scale; - lite::intel_fpga::quantize_s8(i_data, conv.ia, il, fd); - lite::intel_fpga::quantize_s8(w_data, conv.ka, kl, fw); - - // perform conv2d - if (lite::intel_fpga::intel_fpga_conv(&conv)) { - std::cout << "intel_fpga_conv error" << std::endl; - } - // Convert int32 back to fp32, [n,c,h,w] - // 1. y = x / scale - // 2. y = x + b - // 3. y = f(x) - int hw = conv.o.oh * conv.o.ow; - for (i = 0; i < conv.o.on; i++) { - for (j = 0; j < conv.o.oc; j++) { - m = i * conv.o.oc + j; - n = m * hw; - for (l = 0; l < hw; l++) { - k = n + l; - o_data[k] = static_cast(conv.oa[k] / fd / fw); - if (b_data) o_data[k] += b_data[j]; - if (conv.at == 1) { // relu - o_data[k] = o_data[k] > 0.0 ? o_data[k] : 0.0; - } else if (conv.at == 2) { // relu6 - o_data[k] = o_data[k] > 0.0 ? o_data[k] : 0.0; - o_data[k] = o_data[k] > 6.0 ? 
6.0 : o_data[k]; - } else if (conv.at == 4) { // leakyRelu - if (o_data[k] < 0.0) o_data[k] = o_data[k] * alpha; - } - } - } - } + switch (conv.at) { + case 1: + conv.at = INTELFPGA_ACT_RELU; + break; + case 2: + conv.at = INTELFPGA_ACT_RELU6; + break; + case 4: + conv.at = INTELFPGA_ACT_LEAKYRELU; + conv.alpha = param.activation_param.Leaky_relu_alpha; + break; + default: + conv.at = INTELFPGA_ACT_NONE; + break; + } + conv.ia = const_cast(i_data); + conv.ka = const_cast(w_data); + conv.ba = const_cast(b_data); + conv.oa = const_cast(o_data); + conv.ip.in = i_dims[0]; + conv.ip.ic = i_dims[1]; + conv.ip.ih = i_dims[2]; + conv.ip.iw = i_dims[3]; + conv.ip.pl = paddings[2]; // left + conv.ip.pr = paddings[3]; // right + conv.ip.pt = paddings[0]; // top + conv.ip.pb = paddings[1]; // bottom + conv.ip.dy = dilations[0]; + conv.ip.dx = dilations[1]; + + conv.kp.kh = w_dims[2]; + conv.kp.kw = w_dims[3]; + conv.kp.hs = param.strides[0]; + conv.kp.ws = param.strides[1]; + + conv.op.on = o_dims[0]; + conv.op.oc = o_dims[1]; + conv.op.oh = o_dims[2]; + conv.op.ow = o_dims[3]; + if (intelfpga_conv2d(&conv)) { + std::cout << "intel_fpga_conv error" << std::endl; } } else { if (flag_1x1gemm_) { diff --git a/lite/kernels/intel_fpga/conv_gemmlike.h b/lite/kernels/intel_fpga/conv_gemmlike.h index 338a711983c..bad897c3800 100644 --- a/lite/kernels/intel_fpga/conv_gemmlike.h +++ b/lite/kernels/intel_fpga/conv_gemmlike.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include From 1428da369c39525b9e6aa8d203149ca392f9aa49 Mon Sep 17 00:00:00 2001 From: xbeu Date: Wed, 24 Mar 2021 03:35:30 +0000 Subject: [PATCH 08/19] test=develop --- cmake/device/intel_fpga.cmake | 3 +-- lite/api/CMakeLists.txt | 16 ++++++++++++++++ lite/backends/CMakeLists.txt | 2 +- lite/kernels/intel_fpga/CMakeLists.txt | 6 +----- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/cmake/device/intel_fpga.cmake b/cmake/device/intel_fpga.cmake index afde0d8796c..e753b88a5ca 100644 --- 
a/cmake/device/intel_fpga.cmake +++ b/cmake/device/intel_fpga.cmake @@ -40,8 +40,7 @@ if(NOT INTEL_FPGA_SDK_LIB) message(FATAL_ERROR "Can not find INTEL_FPGA_LIB_FILE in ${INTEL_FPGA_SDK_ROOT}/lib") else() message(STATUS "Found INTEL_FPGA_SDK Library: ${INTEL_FPGA_SDK_LIB}") - add_library(vnna SHARED IMPORTED GLOBAL) - set_property(TARGET vnna PROPERTY IMPORTED_LOCATION ${INTEL_FPGA_SDK_LIB}) + link_directories(${INTEL_FPGA_SDK_ROOT}/lib) endif() set(intel_fpga_runtime_libs vnna CACHE INTERNAL "intel fpga sdk runtime libs") diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index ee2a40cb691..a061fa0234d 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -37,6 +37,10 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH if(LITE_WITH_CV) target_link_libraries(paddle_full_api_shared "-Wl,--whole-archive" paddle_cv_arm "-Wl,--no-whole-archive") endif(LITE_WITH_CV) + if (LITE_WITH_INTEL_FPGA) + # Need to add INTEL_FPGA runtime libs dependency + target_link_libraries(paddle_full_api_shared ${intel_fpga_runtime_libs}) + endif(LITE_WITH_INTEL_FPGA) #light api dynamic library lite_cc_library(paddle_light_api_shared SHARED SRCS paddle_api.cc light_api.cc light_api_impl.cc DEPS ${light_lib_DEPS} @@ -138,6 +142,10 @@ else() #target_link_libraries(paddle_light_api_shared ${nna_builder_libs} ${nna_runtime_libs}) endif() + if (LITE_WITH_INTEL_FPGA) + # Need to add INTEL_FPGA runtime libs dependency + target_link_libraries(paddle_light_api_shared ${intel_fpga_runtime_libs}) + endif(LITE_WITH_INTEL_FPGA) # 3. produce java lib from `PADDLELITE_OBJS` if LITE_WITH_JAVA=ON if (LITE_WITH_JAVA) add_library(paddle_lite_jni SHARED $ android/jni/native/paddle_lite_jni.cc android/jni/native/tensor_jni.cc) @@ -467,8 +475,16 @@ if (NOT LITE_ON_TINY_PUBLISH) # The final inference library for just MobileConfig. 
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api) target_link_libraries(paddle_api_full ${cuda_deps}) + if (LITE_WITH_INTEL_FPGA) + # Need to add INTEL_FPGA runtime libs dependency + target_link_libraries(paddle_api_full ${intel_fpga_runtime_libs}) + endif() get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) bundle_static_library(paddle_api_light paddle_api_light_bundled bundle_light_api) + if (LITE_WITH_INTEL_FPGA) + # Need to add INTEL_FPGA runtime libs dependency + target_link_libraries(paddle_api_light ${intel_fpga_runtime_libs}) + endif() endif() #----------------------------------------------------------------------------------------------------- diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt index c15d07c0904..7c05e6138f1 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -12,4 +12,4 @@ add_subdirectory(apu) add_subdirectory(rknpu) add_subdirectory(huawei_ascend_npu) add_subdirectory(imagination_nna) -#add_subdirectory(intel_fpga) +add_subdirectory(intel_fpga) diff --git a/lite/kernels/intel_fpga/CMakeLists.txt b/lite/kernels/intel_fpga/CMakeLists.txt index 748a6f08c9d..7f1979b4543 100755 --- a/lite/kernels/intel_fpga/CMakeLists.txt +++ b/lite/kernels/intel_fpga/CMakeLists.txt @@ -4,10 +4,6 @@ endif() set(intel_fpga_deps ${lite_kernel_deps} ${intel_fpga_runtime_libs}) -#lite_cc_library(dwconv_intel_fpga SRCS conv_depthwise.cc DEPS ${lite_kernel_deps}) -#lite_cc_library(gmconv_intel_fpga SRCS conv_gemmlike.cc DEPS ${lite_kernel_deps}) -#set(conv_intel_fpga ${intel_fpga_runtime_libs} dwconv_intel_fpga gmconv_intel_fpga) - add_kernel(conv_depthwise_intel_fpga INTEL_FPGA basic SRCS conv_depthwise.cc DEPS ${intel_fpga_deps}) add_kernel(conv_gemmlike_intel_fpga INTEL_FPGA basic SRCS conv_gemmlike.cc DEPS ${intel_fpga_deps}) -add_kernel(conv_compute_intel_fpga INTEL_FPGA basic SRCS conv_compute.cc DEPS ${intel_fpga_deps} ${conv_depthwise_intel_fpga} 
${conv_gemmlike_intel_fpga}) +add_kernel(conv_compute_intel_fpga INTEL_FPGA basic SRCS conv_compute.cc DEPS ${intel_fpga_deps} conv_depthwise_intel_fpga conv_gemmlike_intel_fpga) From 4f11189b99c59ae3114eefa45cde2013c7103fa8 Mon Sep 17 00:00:00 2001 From: xbeu Date: Wed, 24 Mar 2021 05:18:36 +0000 Subject: [PATCH 09/19] test=develop --- docs/demo_guides/intel_fpga.md | 234 ++++++++++++++++++++------------- 1 file changed, 141 insertions(+), 93 deletions(-) diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md index b76920bd134..2d4dc721e53 100644 --- a/docs/demo_guides/intel_fpga.md +++ b/docs/demo_guides/intel_fpga.md @@ -1,107 +1,155 @@ -# PaddleLite使用IntelFPGA预测部署 +# PaddleLite使用英特尔FPGA预测部署 -Paddle Lite支持基于arm的IntelFPGA C5的模型预测,提供armv7hf的交叉编译 +PaddleLite已支持英特尔FPGA平台的预测部署,PaddleLite通过调用底层驱动实现对FPGA硬件的调度。 -PaddleLite通过调用底层驱动实现对FPGA硬件的调度,以及对应的API接口。 +## PaddleLite实现英特尔FPGA简介 -## Lite实现IntelFPGA简介 +PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特性如下: -Lite支持IntelFPGA作为后端硬件进行模型推理,其主要特性如下: +- PaddleLite中英特尔FPGA的kernel,weights和bias仍为FP32、NCHW的格式,在提升计算速度的同时能做到用户对数据格式无感知 +- 对于英特尔FPGA暂不支持的kernel,均会切回arm端运行,实现arm+FPGA混合布署运行 +- 目前英特尔FPGA成本功耗都较低,可作为边缘设备首选硬件 -- Lite中IntelFPGA的kernel均以FP32、NCHW的格式作为输入输出格式 +## 支持现状 -- 对于IntelFPGA暂不支持的kernel,均会切回ARM端运行,实现ARM+FPGA混合布署运行 +### 已支持的芯片 -## 支持芯片 -- [Cyclone V](https://www.intel.cn/content/dam/altera-www/global/en_US/pdfs/literature/hb/cyclone-v/cv_51002.pdf) +- 英特尔FPGA Cyclone V系列芯片 -### 已支持(或部分支持)的Paddle算子 +### 已支持的设备 -- relu/relu6/leakyrelu -- conv2d -- depthwise_conv2d +- 海运捷讯C5MB开发板 ### 已支持的Paddle模型 -- [SSD_MobileNet_V1](https://paddlemodels.bj.bcebos.com/object_detection/ssd_mobilenet_v1_coco_pretrained.tar) - -## 编译 - -需要提前准备带有intelfpgadrv.ko的IntelFPGA开发板C5MB/C5TB和Lite代码 - -CMAKE编译选项: - -- 设置`LITE_WITH_INTEL_FPGA=ON`和`LITE_WITH_ARM=ON` - -其他编译选项与ARM编译相同,可以参考[“Paddle Lite在Docker下的ARM编译”](../source_compile/compile_linux)。 - -示例如下: -```shell - cmake .. 
\ - -DWITH_GPU=OFF \ - -DWITH_MKL=OFF \ - -DWITH_LITE=ON \ - -DLITE_WITH_CUDA=OFF \ - -DLITE_WITH_X86=OFF \ - -DLITE_WITH_ARM=ON \ - -DLITE_WITH_OPENMP=ON \ - -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ - -DWITH_TESTING=OFF \ - -DLITE_WITH_INTEL_FPGA=ON \ - -DARM_TARGET_OS=armlinux - make publish_inference -j2 -``` -Lite提供IntelFPGA编译脚本,位于lite/tools/build_intel_fpga.sh full_publish,在Lite根目录执行该脚本即可编译 - -## 运行示例 - -- **运行文件准备** - -下面以SSD模型为例,介绍如何使用C5MB/C5TB开发板实现模型运行 +- [全量化MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/ssd_mobilenet_v1.tar.gz) -```bash -#打开串口调试工具,如Putty或SecureCRT,选择对应的调试串口,并设置串口属性, -#波特率:115200,数据位:8,停止位:1,奇偶校验:无[主机上执行] -#上电C5MB开发板,并在串口调试工具中登录 -awcloud login: root -Password: #密码:Awcloud@123 -#进入/opt目录[开发板执行] -cd /opt -#在运行模型前需要加载FPGA驱动[开发板执行] -insmod driver/intelfpgadrv.ko -``` - -- **使用IntelFPGA进行模型预测** - -```bash -#以下命令均在开发板上运行,在开发板上已经部署了对应的输入图片,模型,驱动程序,执行程序等 -#运行SSD测试程序,输入图片为/opt/images/dog.jpg,输出图片为/opt/dog_result.jpg -./run_ssd.sh -``` - -## 如何在Code中使用 - -在Lite中使用IntelFPGA与ARM相似,具体的区别如下: - -- 由于IntelFPGA运行模式为FP32精度、NCHW布局,所以需要修改相应的`valid_place` - -代码示例: -```cpp -lite::Predictor predictor; -std::vector valid_places( - {Place{TARGET(kIntelFPGA), PRECISION(kFloat), DATALAYOUT(kNCHW)},Place{TARGET(kARM)}); - -predictor.Build(model_dir, "", "", valid_places); - -auto* input_tensor = predictor.GetInput(0); -input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); -auto* data = input_tensor->mutable_data(); -auto item_size = input_tensor->dims().production(); -//假设设置输入数据全为1 -for (int i = 0; i < item_size; i++) { - data[i] = 1; -} +### 已支持(或部分支持)的Paddle算子 -predictor.Run(); -auto* out = predictor.GetOutput(0); -``` +- relu/relu6/leakyrelu +- conv2d +- depthwise_conv2d +- pool2d +- fc + +## 准备工作 + +开发板C5MB可以通过串口线进行连接,也可以通过ssh进行连接,初次使用请参考[文档](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/AIGO_C5MB_UG.pdf) + +## 参考示例演示 + +### 测试设备(Roc1开发板) + +![c5mb_front](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/c5mb_front.jpg) + 
+![c5mb_back](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/c5mb_back.jpg) + +### 准备设备环境 + +- 提前准备带有intelfpgadrv.ko的英特尔FPGA开发板(如C5MB); +- 确定能够通过SSH方式远程登录C5MB开发板; +- 由于C5MB的ARM能力较弱,示例程序和PaddleLite库的编译均采用交叉编译方式。 + +### 准备交叉编译环境 + +- 按照以下两种方式配置交叉编译环境: + - Docker交叉编译环境:由于C5MB运行环境为Ubuntu16.04,因此不能直接使用[编译环境准备](../source_compile/compile_env)中的docker image,而需要按照如下方式在Host机器上手动构建Ubuntu16.04的docker image; + + ``` + $ wget https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/Dockerfile + $ docker build --network=host -t paddlepaddle/paddle-lite-ubuntu16_04:1.0 . + $ docker run --name paddle-lite-ubuntu16_04 --net=host -it --privileged -v $PWD:/Work -w /Work paddlepaddle/paddle-lite-ubuntu16_04:1.0 /bin/bash + ``` + +- Ubuntu交叉编译环境:要求Host为Ubuntu16.04系统,参考[编译环境准备](../source_compile/compile_env)中的"交叉编译ARM Linux"步骤安装交叉编译工具链。 +- 由于需要通过scp和ssh命令将交叉编译生成的PaddleLite库和示例程序传输到设备上执行,因此,在进入Docker容器后还需要安装如下软件: + + ``` + # apt-get install openssh-client sshpass + ``` + +### 运行图像检测示例程序 + +- 下载示例程序[PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/PaddleLite-linux-demo.tar.gz),解压后清单如下: + + ```shell + - PaddleLite-linux-demo + - ssd_detection + - assets + - images + - dog.jpg # 测试图片 + - dog.raw # 已处理成raw数据的测试图片 + - labels + - pascalvoc_label_list # 检测label文件 + - models + - ssd_mobilenet_v1 # Non-combined格式的、SSD量化模型 + - __model__ # 已通过opt转好的拓扑信息模型文件 + - __params__ # 已通过opt转好的参数信息模型文件 + - shell + - CMakeLists.txt # 示例程序CMake脚本 + - build + - ssd_detection # 已编译好的示例程序 + - ssd_detection.cc # 示例程序源码 + - convert_to_raw_image.py # 将测试图片保存为raw数据的python脚本 + - build.sh # 示例程序编译脚本 + - run.sh # 示例程序运行脚本 + - libs + - PaddleLite + - armhf + - include # PaddleLite头文件 + - lib + - libvnna.so # 英特尔FPGA接口库 + - libpaddle_full_api_shared.so # 用于直接加载Paddle模型进行测试和Debug的预编译PaddleLite库(full publish模式下编译生成的库) + ``` + +- 按照以下命令运行转换后的ARM+FPGA模型 + + ```shell + 注意: + 1)run.sh必须在Host机器上运行,且执行前需要配置目标设备的IP地址、SSH账号和密码; + 2)build.sh建议在docker环境中执行,目前英特尔FPGA在PaddleLite上只支持armhf。 + + 
运行适用于英特尔FPGA的mobilenetv1全量化模型 + $ cd PaddleLite-linux-demo/ssd_detection/shell + $ vim ./run.sh + MODEL_NAME设置为ssd_mobilenet_v1 + $ ./run.sh + iter 0 cost: 3079.443115 ms + iter 1 cost: 3072.508057 ms + iter 2 cost: 3063.342041 ms + warmup: 1 repeat: 3, average: 3071.764404 ms, max: 3079.443115 ms, min: 3063.342041 ms + results: 3 + [0] bicycle - 0.997817 0.163673,0.217786,0.721802,0.786120 + [1] car - 0.943994 0.597238,0.131665,0.905698,0.297017 + [2] dog - 0.959329 0.157911,0.334807,0.431497,0.920035 + Preprocess time: 114.061000 ms + Prediction time: 3071.764404 ms + Postprocess time: 13.166000 ms + ``` + +- 如果需要更改测试图片,可通过convert_to_raw_image.py工具生成; +- 如果需要重新编译示例程序,直接运行./build.sh即可,注意:build.sh的执行建议在docker环境中,否则可能编译出错。 + +### 更新支持英特尔FPGA的PaddleLite库 + +- 下载PaddleLite源码和英特尔FPGA的SDK + + ```shell + $ git clone https://github.com/PaddlePaddle/Paddle-Lite.git + $ cd Paddle-Lite + $ git checkout + $ curl -L https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/intelfpga_sdk.tar.gz -o - | tar -zx + ``` + +- 编译并生成PaddleLite+IntelFPGA的部署库 + + ```shell + For C5MB + full_publish + $ ./lite/tools/build_linux.sh --arch=armv7hf --with_extra=ON --with_log=ON --with_intel_fpga=ON --intel_fpga_sdk_root=./intelfpga_sdk full_publish + ``` + +- 将编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intelfpga/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/armhf/include目录; +- 将full_publish模式下编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intelfpga/cxx/lib/libpaddle_full_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_full_api_shared.so文件。 + +## 其它说明 From d716f590977f0c01dfda06c4278c60c957357f8c Mon Sep 17 00:00:00 2001 From: xbeu Date: Wed, 24 Mar 2021 05:28:19 +0000 Subject: [PATCH 10/19] test=develop --- docs/demo_guides/intel_fpga.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md index 2d4dc721e53..4f644e838a6 100644 --- 
a/docs/demo_guides/intel_fpga.md +++ b/docs/demo_guides/intel_fpga.md @@ -93,6 +93,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 - convert_to_raw_image.py # 将测试图片保存为raw数据的python脚本 - build.sh # 示例程序编译脚本 - run.sh # 示例程序运行脚本 + - intelfpgadrv.ko # 英特尔FPGA启动程序 - libs - PaddleLite - armhf From 51533df13363ffaa74c301e91aca3d90cecb1969 Mon Sep 17 00:00:00 2001 From: xbeu Date: Wed, 24 Mar 2021 09:17:28 +0000 Subject: [PATCH 11/19] test=develop --- cmake/device/intel_fpga.cmake | 5 +++-- lite/api/CMakeLists.txt | 17 +---------------- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/cmake/device/intel_fpga.cmake b/cmake/device/intel_fpga.cmake index e753b88a5ca..4665438eb5a 100644 --- a/cmake/device/intel_fpga.cmake +++ b/cmake/device/intel_fpga.cmake @@ -40,7 +40,8 @@ if(NOT INTEL_FPGA_SDK_LIB) message(FATAL_ERROR "Can not find INTEL_FPGA_LIB_FILE in ${INTEL_FPGA_SDK_ROOT}/lib") else() message(STATUS "Found INTEL_FPGA_SDK Library: ${INTEL_FPGA_SDK_LIB}") - link_directories(${INTEL_FPGA_SDK_ROOT}/lib) + add_library(intel_fpga_vnna SHARED IMPORTED GLOBAL) + set_property(TARGET intel_fpga_vnna PROPERTY IMPORTED_LOCATION ${INTEL_FPGA_SDK_LIB}) endif() -set(intel_fpga_runtime_libs vnna CACHE INTERNAL "intel fpga sdk runtime libs") +set(intel_fpga_runtime_libs intel_fpga_vnna CACHE INTERNAL "intel fpga sdk runtime libs") diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index a061fa0234d..8eda4984f75 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -37,10 +37,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH if(LITE_WITH_CV) target_link_libraries(paddle_full_api_shared "-Wl,--whole-archive" paddle_cv_arm "-Wl,--no-whole-archive") endif(LITE_WITH_CV) - if (LITE_WITH_INTEL_FPGA) - # Need to add INTEL_FPGA runtime libs dependency - target_link_libraries(paddle_full_api_shared ${intel_fpga_runtime_libs}) - endif(LITE_WITH_INTEL_FPGA) + #light api dynamic library lite_cc_library(paddle_light_api_shared 
SHARED SRCS paddle_api.cc light_api.cc light_api_impl.cc DEPS ${light_lib_DEPS} @@ -142,10 +139,6 @@ else() #target_link_libraries(paddle_light_api_shared ${nna_builder_libs} ${nna_runtime_libs}) endif() - if (LITE_WITH_INTEL_FPGA) - # Need to add INTEL_FPGA runtime libs dependency - target_link_libraries(paddle_light_api_shared ${intel_fpga_runtime_libs}) - endif(LITE_WITH_INTEL_FPGA) # 3. produce java lib from `PADDLELITE_OBJS` if LITE_WITH_JAVA=ON if (LITE_WITH_JAVA) add_library(paddle_lite_jni SHARED $ android/jni/native/paddle_lite_jni.cc android/jni/native/tensor_jni.cc) @@ -475,16 +468,8 @@ if (NOT LITE_ON_TINY_PUBLISH) # The final inference library for just MobileConfig. bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api) target_link_libraries(paddle_api_full ${cuda_deps}) - if (LITE_WITH_INTEL_FPGA) - # Need to add INTEL_FPGA runtime libs dependency - target_link_libraries(paddle_api_full ${intel_fpga_runtime_libs}) - endif() get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) bundle_static_library(paddle_api_light paddle_api_light_bundled bundle_light_api) - if (LITE_WITH_INTEL_FPGA) - # Need to add INTEL_FPGA runtime libs dependency - target_link_libraries(paddle_api_light ${intel_fpga_runtime_libs}) - endif() endif() #----------------------------------------------------------------------------------------------------- From aaf4dfd26ab87fc23b06db1ea538a143c06bba57 Mon Sep 17 00:00:00 2001 From: xbeu Date: Wed, 24 Mar 2021 12:47:37 +0000 Subject: [PATCH 12/19] test=develop --- docs/demo_guides/intel_fpga.md | 36 +++++++++++-------- lite/CMakeLists.txt | 2 +- lite/api/opt.cc | 11 ++++-- lite/api/opt_base.cc | 11 ++++-- lite/api/python/pybind/pybind.cc | 1 + lite/core/arena/CMakeLists.txt | 2 +- lite/tools/build_linux.sh | 6 ++-- .../cmake_tools/record_supported_kernel_op.py | 3 +- 8 files changed, 46 insertions(+), 26 deletions(-) diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md index 
4f644e838a6..98ff5f1b18d 100644 --- a/docs/demo_guides/intel_fpga.md +++ b/docs/demo_guides/intel_fpga.md @@ -22,7 +22,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 ### 已支持的Paddle模型 -- [全量化MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/ssd_mobilenet_v1.tar.gz) +- [全量化MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/ssd_mobilenet_v1.tar.gz) ### 已支持(或部分支持)的Paddle算子 @@ -34,15 +34,15 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 ## 准备工作 -开发板C5MB可以通过串口线进行连接,也可以通过ssh进行连接,初次使用请参考[文档](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/AIGO_C5MB_UG.pdf) +开发板C5MB可以通过串口线进行连接,也可以通过ssh进行连接,初次使用请参考[文档](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/AIGO_C5MB_UG.pdf) ## 参考示例演示 -### 测试设备(Roc1开发板) +### 测试设备(C5MB开发板) -![c5mb_front](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/c5mb_front.jpg) +![c5mb_front](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/c5mb_front.jpg) -![c5mb_back](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/c5mb_back.jpg) +![c5mb_back](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/c5mb_back.jpg) ### 准备设备环境 @@ -53,7 +53,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 ### 准备交叉编译环境 - 按照以下两种方式配置交叉编译环境: - - Docker交叉编译环境:由于C5MB运行环境为Ubuntu16.04,因此不能直接使用[编译环境准备](../source_compile/compile_env)中的docker image,而需要按照如下方式在Host机器上手动构建Ubuntu16.04的docker image; + - Docker交叉编译环境:由于C5MB运行环境为Ubuntu,因此不能直接使用[编译环境准备](../source_compile/compile_env)中的docker image,而需要按照如下方式在Host机器上手动构建Ubuntu的docker image; ``` $ wget https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/Dockerfile @@ -70,7 +70,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 ### 运行图像检测示例程序 -- 下载示例程序[PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/PaddleLite-linux-demo.tar.gz),解压后清单如下: +- 下载示例程序[PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/PaddleLite-linux-demo.tar.gz),解压后清单如下: ```shell - PaddleLite-linux-demo @@ -93,13 +93,14 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 
- convert_to_raw_image.py # 将测试图片保存为raw数据的python脚本 - build.sh # 示例程序编译脚本 - run.sh # 示例程序运行脚本 - - intelfpgadrv.ko # 英特尔FPGA启动程序 + - intelfpgadrv.ko # 英特尔FPGA内核驱动程序 - libs - PaddleLite - armhf - include # PaddleLite头文件 - lib - libvnna.so # 英特尔FPGA接口库 + - libpaddle_light_api_shared.so # 用于最终移动端部署的预编译PaddleLite库(tiny publish模式下编译生成的库) - libpaddle_full_api_shared.so # 用于直接加载Paddle模型进行测试和Debug的预编译PaddleLite库(full publish模式下编译生成的库) ``` @@ -139,18 +140,25 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 $ git clone https://github.com/PaddlePaddle/Paddle-Lite.git $ cd Paddle-Lite $ git checkout - $ curl -L https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/intelfpga_sdk.tar.gz -o - | tar -zx + $ curl -L https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/intel_fpga_sdk.tar.gz -o - | tar -zx ``` - 编译并生成PaddleLite+IntelFPGA的部署库 - ```shell For C5MB - full_publish - $ ./lite/tools/build_linux.sh --arch=armv7hf --with_extra=ON --with_log=ON --with_intel_fpga=ON --intel_fpga_sdk_root=./intelfpga_sdk full_publish + - tiny_publish编译方式 + ```shell + $ ./lite/tools/build_linux.sh --arch=armv7hf --with_extra=ON --with_log=ON --with_intel_fpga=ON --intel_fpga_sdk_root=./intel_fpga_sdk + + 将tiny_publish模式下编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intel_fpga/cxx/lib/libpaddle_light_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_light_api_shared.so文件; + ``` + - full_publish编译方式 + ```shell + $ ./lite/tools/build_linux.sh --arch=armv7hf --with_extra=ON --with_log=ON --with_intel_fpga=ON --intel_fpga_sdk_root=./intel_fpga_sdk full_publish + + 将full_publish模式下编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intel_fpga/cxx/lib/libpaddle_full_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_full_api_shared.so文件。 ``` -- 将编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intelfpga/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/armhf/include目录; -- 
将full_publish模式下编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intelfpga/cxx/lib/libpaddle_full_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_full_api_shared.so文件。 + - 将编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intelfpga/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/armhf/include目录; ## 其它说明 diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 4a4f8af5848..47911ad5c52 100755 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -135,7 +135,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.nna") endif(LITE_WITH_IMAGINATION_NNA) if (LITE_WITH_INTEL_FPGA) - set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.intelfpga") + set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.intel_fpga") endif(LITE_WITH_INTEL_FPGA) else() set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib") diff --git a/lite/api/opt.cc b/lite/api/opt.cc index 215beddda46..fa8d75d3e87 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -155,6 +155,10 @@ std::vector ParserValidPlaces(bool enable_fp16) { valid_places.emplace_back(TARGET(kImaginationNNA)); valid_places.emplace_back( Place{TARGET(kImaginationNNA), PRECISION(kInt8), DATALAYOUT(kNCHW)}); + } else if (target_repr == "intel_fpga") { + valid_places.emplace_back(TARGET(kIntelFPGA)); + valid_places.emplace_back( + Place{TARGET(kIntelFPGA), PRECISION(kFloat), DATALAYOUT(kNCHW)}); } else { LOG(FATAL) << lite::string_format( "Wrong target '%s' found, please check the command flag " @@ -245,6 +249,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { "kAPU", "kHuaweiAscendNPU", "kImaginationNNA", + "kIntelFPGA", "kAny", "kUnk"}; size_t maximum_optype_length = 0; @@ -316,7 +321,7 @@ void PrintHelpInfo() { " " "`--valid_targets=(arm|opencl|x86|x86_opencl|npu|xpu|rknpu|apu|huawei_" "ascend_npu|" - "imagination_nna)`\n" + "imagination_nna|intel_fpga)`\n" " 
`--record_tailoring_info=(true|false)`\n" " Arguments of mode quantization in opt:\n" " `--quant_model=(true|false)`\n" @@ -329,13 +334,13 @@ void PrintHelpInfo() { " `--print_supported_ops=true " "--valid_targets=(arm|opencl|x86|x86_opencl|npu|xpu|rknpu|apu|huawei_" "ascend_npu|" - "imagination_nna)" + "imagination_nna|intel_fpga)" "`" " Display valid operators of input targets\n" " `--print_model_ops=true --model_dir= " "--valid_targets=(arm|opencl|x86|x86_opencl|npu|xpu|rknpu|apu|huawei_" "ascend_npu|" - "imagination_nna)" + "imagination_nna|intel_fpga)" "`" " Display operators in the input model\n"; std::cout << "opt version:" << opt_version << std::endl diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc index 540d5c831c5..f3e4b21269c 100644 --- a/lite/api/opt_base.cc +++ b/lite/api/opt_base.cc @@ -107,6 +107,10 @@ void OptBase::SetValidPlaces(const std::string& valid_places, valid_places_.emplace_back(TARGET(kImaginationNNA)); valid_places_.emplace_back( Place{TARGET(kImaginationNNA), PRECISION(kInt8), DATALAYOUT(kNCHW)}); + } else if (target_repr == "intel_fpga") { + valid_places.emplace_back(TARGET(kIntelFPGA)); + valid_places.emplace_back( + Place{TARGET(kIntelFPGA), PRECISION(kFloat), DATALAYOUT(kNCHW)}); } else { LOG(FATAL) << lite::string_format( "Wrong target '%s' found, please check the command flag " @@ -304,7 +308,7 @@ void OptBase::PrintExecutableBinHelpInfo() { " `--optimize_out=`\n" " " "`--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu|imagination_" - "nna)`\n" + "nna|intel_fpga)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of mode quantization in opt:\n" " `--quant_model=(true|false)`\n" @@ -316,11 +320,11 @@ void OptBase::PrintExecutableBinHelpInfo() { "Paddle-Lite\n" " `--print_supported_ops=true " "--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu|imagination_" - "nna)`" + "nna|intel_fpga)`" " Display valid operators of input targets\n" " `--print_model_ops=true --model_dir= " 
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu|imagination_" - "nna)`" + "nna|intel_fpga)`" " Display operators in the input model\n"; std::cout << "paddlelite opt version:" << opt_version << std::endl << help_info << std::endl; @@ -340,6 +344,7 @@ void OptBase::PrintOpsInfo(const std::set& valid_ops) { "kAPU", "kHuaweiAscendNPU", "kImaginationNNA", + "kIntelFPGA", "kAny", "kUnk"}; // Get the lengh of the first column: maximum length of the op_type diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc index c4b9ce6d523..68feb551535 100644 --- a/lite/api/python/pybind/pybind.cc +++ b/lite/api/python/pybind/pybind.cc @@ -202,6 +202,7 @@ void BindLitePlace(py::module *m) { .value("APU", TargetType::kAPU) .value("HUAWEI_ASCEND_NPU", TargetType::kHuaweiAscendNPU) .value("IMAGINATION_NNA", TargetType::kImaginationNNA) + .value("INTEL_FPGA", TargetType::kIntelFPGA) .value("Any", TargetType::kAny); // PrecisionType diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index 9c86bf2649d..6441c1eee25 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -6,5 +6,5 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${apu_kernels} ${huawei_ascend_npu_kernels} ${imagination_nna_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${apu_kernels} ${huawei_ascend_npu_kernels} ${imagination_nna_kernels} ${intel_fpga_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git 
a/lite/tools/build_linux.sh b/lite/tools/build_linux.sh index 0a62f90793a..7e23cf08baf 100755 --- a/lite/tools/build_linux.sh +++ b/lite/tools/build_linux.sh @@ -5,7 +5,7 @@ set -e # 1. global variables, you can change them according to your requirements ##################################################################################################### # armv8 or armv7hf or armv7, default armv8. -ARCH=armv7hf +ARCH=armv8 # gcc or clang, default gcc. TOOLCHAIN=gcc # ON or OFF, default OFF. @@ -34,8 +34,8 @@ IMAGINATION_NNA_SDK_ROOT="$(pwd)/imagination_nna_sdk" WITH_BAIDU_XPU=OFF BAIDU_XPU_SDK_ROOT="" # options of compiling intel fpga. -WITH_INTEL_FPGA=ON -INTEL_FPGA_SDK_ROOT="$(pwd)/intelfpga_sdk" +WITH_INTEL_FPGA=OFF +INTEL_FPGA_SDK_ROOT="$(pwd)/intel_fpga_sdk" # options of adding training ops WITH_TRAIN=OFF # num of threads used during compiling.. diff --git a/lite/tools/cmake_tools/record_supported_kernel_op.py b/lite/tools/cmake_tools/record_supported_kernel_op.py index 0e36569f501..36343f3292b 100644 --- a/lite/tools/cmake_tools/record_supported_kernel_op.py +++ b/lite/tools/cmake_tools/record_supported_kernel_op.py @@ -56,7 +56,7 @@ ops_lines = [] # valid targets and valid_ops -valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU", "kHuaweiAscendNPU", "kImaginationNNA"] +valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU", "kHuaweiAscendNPU", "kImaginationNNA","KIntelFPGA"] valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]] class TargetType: kUnk = 0 @@ -75,6 +75,7 @@ class TargetType: kAPU = 13 kHuaweiAscendNPU = 14 kImaginationNNA = 15 + kIntelFPGA = 16 # record op_info of valid kernels into `valid_ops` according to different target type From 063e302e01ccabc656a01b8d4cd056b547dd5679 Mon Sep 17 00:00:00 2001 From: xbeu Date: Thu, 25 Mar 2021 05:39:36 +0000 Subject: [PATCH 13/19] 
test=develop --- docs/demo_guides/intel_fpga.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md index 98ff5f1b18d..f564fc8707d 100644 --- a/docs/demo_guides/intel_fpga.md +++ b/docs/demo_guides/intel_fpga.md @@ -18,11 +18,15 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 ### 已支持的设备 -- 海运捷讯C5MB开发板 +- 海运捷讯C5MB(英特尔FPGA Cyclone V)开发板 +- 海运捷讯C5CB(英特尔FPGA Cyclone V)开发板 +- 海运捷讯C5TB(英特尔FPGA Cyclone V)开发板 ### 已支持的Paddle模型 -- [全量化MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/ssd_mobilenet_v1.tar.gz) +- [全量化MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/mobilenet_v1.tar.gz) +- [全量化SSD_MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/ssd_mobilenet_v1.tar.gz) +- [全量化YOLOV3](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/yolov3.tar.gz) ### 已支持(或部分支持)的Paddle算子 @@ -56,7 +60,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 - Docker交叉编译环境:由于C5MB运行环境为Ubuntu,因此不能直接使用[编译环境准备](../source_compile/compile_env)中的docker image,而需要按照如下方式在Host机器上手动构建Ubuntu的docker image; ``` - $ wget https://paddlelite-demo.bj.bcebos.com/devices/intelfpga/Dockerfile + $ wget https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/Dockerfile $ docker build --network=host -t paddlepaddle/paddle-lite-ubuntu16_04:1.0 . 
$ docker run --name paddle-lite-ubuntu16_04 --net=host -it --privileged -v $PWD:/Work -w /Work paddlepaddle/paddle-lite-ubuntu16_04:1.0 /bin/bash ``` @@ -82,9 +86,10 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 - labels - pascalvoc_label_list # 检测label文件 - models - - ssd_mobilenet_v1 # Non-combined格式的、SSD量化模型 + - ssd_mobilenet_v1 # Combined格式的protobuf量化模型 - __model__ # 已通过opt转好的拓扑信息模型文件 - __params__ # 已通过opt转好的参数信息模型文件 + - ssd_mobilenet_v1.nb # 已通过opt转好的、适合ARM CPU的naive_buffer量化模型 - shell - CMakeLists.txt # 示例程序CMake脚本 - build @@ -99,7 +104,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 - armhf - include # PaddleLite头文件 - lib - - libvnna.so # 英特尔FPGA接口库 + - libvnna.so # 英特尔FPGA推理运行时库 - libpaddle_light_api_shared.so # 用于最终移动端部署的预编译PaddleLite库(tiny publish模式下编译生成的库) - libpaddle_full_api_shared.so # 用于直接加载Paddle模型进行测试和Debug的预编译PaddleLite库(full publish模式下编译生成的库) ``` From 71568a2388857f3a95b020a82d50075338620f6b Mon Sep 17 00:00:00 2001 From: xbeu Date: Thu, 25 Mar 2021 05:54:47 +0000 Subject: [PATCH 14/19] test=develop --- docs/demo_guides/intel_fpga.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md index f564fc8707d..8e0cc037e87 100644 --- a/docs/demo_guides/intel_fpga.md +++ b/docs/demo_guides/intel_fpga.md @@ -155,7 +155,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 ```shell $ ./lite/tools/build_linux.sh --arch=armv7hf --with_extra=ON --with_log=ON --with_intel_fpga=ON --intel_fpga_sdk_root=./intel_fpga_sdk - 将tiny_publish模式下编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intel_fpga/cxx/lib/libpaddle_light_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_light_api_shared.so文件; + 将tiny_publish模式下编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intel_fpga/cxx/lib/libpaddle_light_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_light_api_shared.so文件; ``` - full_publish编译方式 ```shell From 
de84566917250f92f2916452420ab8d6cd957d7d Mon Sep 17 00:00:00 2001 From: xbeu Date: Fri, 26 Mar 2021 10:39:50 +0000 Subject: [PATCH 15/19] test=develop --- docs/demo_guides/intel_fpga.md | 53 ++++++++++--------- lite/api/opt_base.cc | 2 +- .../cmake_tools/record_supported_kernel_op.py | 4 +- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md index 8e0cc037e87..b9f59000874 100644 --- a/docs/demo_guides/intel_fpga.md +++ b/docs/demo_guides/intel_fpga.md @@ -24,9 +24,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 ### 已支持的Paddle模型 -- [全量化MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/mobilenet_v1.tar.gz) -- [全量化SSD_MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/ssd_mobilenet_v1.tar.gz) -- [全量化YOLOV3](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/yolov3.tar.gz) +- [ssd_mobilenet_v1_pascalvoc](https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz) ### 已支持(或部分支持)的Paddle算子 @@ -38,15 +36,15 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 ## 准备工作 -开发板C5MB可以通过串口线进行连接,也可以通过ssh进行连接,初次使用请参考[文档](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/AIGO_C5MB_UG.pdf) +开发板C5MB可以通过串口线进行连接,也可以通过ssh进行连接,初次使用请参考[文档](https://paddlelite-demo.bj.bcebos.com/devices/intel/AIGO_C5MB_UG.pdf) ## 参考示例演示 ### 测试设备(C5MB开发板) -![c5mb_front](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/c5mb_front.jpg) +![c5mb_front](https://paddlelite-demo.bj.bcebos.com/devices/intel/c5mb_front.jpg) -![c5mb_back](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/c5mb_back.jpg) +![c5mb_back](https://paddlelite-demo.bj.bcebos.com/devices/intel/c5mb_back.jpg) ### 准备设备环境 @@ -56,16 +54,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 ### 准备交叉编译环境 -- 按照以下两种方式配置交叉编译环境: - - Docker交叉编译环境:由于C5MB运行环境为Ubuntu,因此不能直接使用[编译环境准备](../source_compile/compile_env)中的docker image,而需要按照如下方式在Host机器上手动构建Ubuntu的docker image; - - ``` - $ wget
https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/Dockerfile - $ docker build --network=host -t paddlepaddle/paddle-lite-ubuntu16_04:1.0 . - $ docker run --name paddle-lite-ubuntu16_04 --net=host -it --privileged -v $PWD:/Work -w /Work paddlepaddle/paddle-lite-ubuntu16_04:1.0 /bin/bash - ``` - -- Ubuntu交叉编译环境:要求Host为Ubuntu16.04系统,参考[编译环境准备](../source_compile/compile_env)中的"交叉编译ARM Linux"步骤安装交叉编译工具链。 +- 为了保证编译环境一致,建议参考[编译环境准备](../source_compile/compile_env)中的Docker开发环境进行配置; - 由于需要通过scp和ssh命令将交叉编译生成的PaddleLite库和示例程序传输到设备上执行,因此,在进入Docker容器后还需要安装如下软件: ``` @@ -74,7 +63,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 ### 运行图像检测示例程序 -- 下载示例程序[PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/PaddleLite-linux-demo.tar.gz),解压后清单如下: +- 下载示例程序[PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/intel/PaddleLite-linux-demo.tar.gz),解压后清单如下: ```shell - PaddleLite-linux-demo @@ -86,10 +75,8 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 - labels - pascalvoc_label_list # 检测label文件 - models - - ssd_mobilenet_v1 # Combined格式的protobuf量化模型 - - __model__ # 已通过opt转好的拓扑信息模型文件 - - __params__ # 已通过opt转好的参数信息模型文件 - - ssd_mobilenet_v1.nb # 已通过opt转好的、适合ARM CPU的naive_buffer量化模型 + - ssd_mobilenet_v1_fp32_300_for_intel_fpga + - model.nb # 已通过opt转好的、适合英特尔FPGA的mobilenetv1量化模型 - shell - CMakeLists.txt # 示例程序CMake脚本 - build @@ -116,10 +103,10 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 1)run.sh必须在Host机器上运行,且执行前需要配置目标设备的IP地址、SSH账号和密码; 2)build.sh建议在docker环境中执行,目前英特尔FPGA在PaddleLite上只支持armhf。 - 运行适用于英特尔FPGA的mobilenetv1全量化模型 + 运行适用于英特尔FPGA的ssd_mobilenet_v1量化模型 $ cd PaddleLite-linux-demo/ssd_detection/shell $ vim ./run.sh - MODEL_NAME设置为ssd_mobilenet_v1 + MODEL_NAME设置为ssd_mobilenet_v1_fp32_300_for_intel_fpga $ ./run.sh iter 0 cost: 3079.443115 ms iter 1 cost: 3072.508057 ms @@ -137,6 +124,22 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 - 如果需要更改测试图片,可通过convert_to_raw_image.py工具生成; - 如果需要重新编译示例程序,直接运行./build.sh即可,注意:build.sh的执行建议在docker环境中,否则可能编译出错。 +### 更新模型 + 
+- 通过Paddle Fluid训练,或X2Paddle转换得到MobileNetv1 float32模型[ssd_mobilenet_v1_fp32_300_fluid](https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz); +- 参考[模型转化方法](../user_guides/model_optimize_tool),利用opt工具转换生成英特尔FPGA模型,仅需要将valid_targets设置为intel_fpga,arm即可。 + ```shell + $ ./opt --model_dir=ssd_mobilenet_v1_fp32_300_for_intel_fpga \ + --optimize_out_type=naive_buffer \ + --optimize_out=opt_model \ + --valid_targets=intel_fpga,arm + + 替换自带的英特尔FPGA模型 + $ cp opt_model.nb ssd_mobilenet_v1_fp32_300_for_intel_fpga/model.nb + ``` + +- 注意:opt生成的模型只是标记了英特尔FPGA支持的Paddle算子,并没有真正生成英特尔FPGA模型,只有在执行时才会将标记的Paddle算子转成英特尔FPGA的APIs,最终生成并执行模型。 + ### 更新支持英特尔FPGA的PaddleLite库 - 下载PaddleLite源码和英特尔FPGA的SDK @@ -145,7 +148,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 $ git clone https://github.com/PaddlePaddle/Paddle-Lite.git $ cd Paddle-Lite $ git checkout - $ curl -L https://paddlelite-demo.bj.bcebos.com/devices/intel_fpga/intel_fpga_sdk.tar.gz -o - | tar -zx + $ curl -L https://paddlelite-demo.bj.bcebos.com/devices/intel/intel_fpga_sdk.tar.gz -o - | tar -zx ``` - 编译并生成PaddleLite+IntelFPGA的部署库 @@ -164,6 +167,6 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 将full_publish模式下编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intel_fpga/cxx/lib/libpaddle_full_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_full_api_shared.so文件。 ``` - - 将编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intelfpga/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/armhf/include目录; + - 将编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intel_fpga/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/armhf/include目录; ## 其它说明 diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc index f3e4b21269c..09a4ac30d90 100644 --- a/lite/api/opt_base.cc +++ b/lite/api/opt_base.cc @@ -269,7 +269,7 @@ void OptBase::PrintHelpInfo() { " `set_lite_out(output_optimize_model_dir)`\n" " " 
"`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu|" - "imagination_nna)`\n" + "imagination_nna|intel_fpga)`\n" " `record_model_info(false|true)`: refer to whether to record ops " "info for striping lib, false by default`\n" " `run() : start model transformation`\n" diff --git a/lite/tools/cmake_tools/record_supported_kernel_op.py b/lite/tools/cmake_tools/record_supported_kernel_op.py index 36343f3292b..4ec0d3a2689 100644 --- a/lite/tools/cmake_tools/record_supported_kernel_op.py +++ b/lite/tools/cmake_tools/record_supported_kernel_op.py @@ -56,8 +56,8 @@ ops_lines = [] # valid targets and valid_ops -valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU", "kHuaweiAscendNPU", "kImaginationNNA","KIntelFPGA"] -valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]] +valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU", "kHuaweiAscendNPU", "kImaginationNNA","kIntelFPGA"] +valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]] class TargetType: kUnk = 0 kHost = 1 From e39781d91e7210dcfd254aa2abeea3383a46681d Mon Sep 17 00:00:00 2001 From: xbeu Date: Mon, 29 Mar 2021 09:53:59 +0000 Subject: [PATCH 16/19] test=develop --- lite/kernels/intel_fpga/conv_compute.cc | 4 ++-- lite/kernels/intel_fpga/conv_gemmlike.cc | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lite/kernels/intel_fpga/conv_compute.cc b/lite/kernels/intel_fpga/conv_compute.cc index 763ca83c7a2..4d2d55feca3 100644 --- a/lite/kernels/intel_fpga/conv_compute.cc +++ b/lite/kernels/intel_fpga/conv_compute.cc @@ -59,10 +59,10 @@ void ConvCompute::PrepareForRun() { /// select conv impl if (param.groups == ic && ic == oc && ks_equal && no_dilation && flag_dw) { impl_ = new DepthwiseConv; - // VLOG(3) << "invoking dw conv"; + VLOG(3) << "[IntelFPGA] invoking depthwise conv"; } else { impl_ = new 
GemmLikeConv; - // VLOG(3) << "invoking gemm like conv"; + VLOG(3) << "[IntelFPGA] invoking common conv"; } if (!arm_cxt_) { arm_cxt_ = ContextScheduler::Global().NewContext(TargetType::kARM); diff --git a/lite/kernels/intel_fpga/conv_gemmlike.cc b/lite/kernels/intel_fpga/conv_gemmlike.cc index 849dabc3dcf..8dfcde783b9 100644 --- a/lite/kernels/intel_fpga/conv_gemmlike.cc +++ b/lite/kernels/intel_fpga/conv_gemmlike.cc @@ -16,6 +16,7 @@ #include #include "lite/backends/arm/math/gemm_prepacked_int8.h" #include "lite/backends/arm/math/packed_sgemm.h" +#include "lite/utils/logging.h" namespace paddle { namespace lite { @@ -109,7 +110,7 @@ void GemmLikeConv::Run() { conv.op.oh = o_dims[2]; conv.op.ow = o_dims[3]; if (intelfpga_conv2d(&conv)) { - std::cout << "intel_fpga_conv error" << std::endl; + LOG(WARNING) << "[IntelFPGA] Conv_Compute failed"; } } else { if (flag_1x1gemm_) { From f5a0046884a53e55bb88d64a8831ee7f89375d2e Mon Sep 17 00:00:00 2001 From: xbeu Date: Mon, 29 Mar 2021 11:23:07 +0000 Subject: [PATCH 17/19] test=develop --- docs/demo_guides/intel_fpga.md | 13 +- docs/introduction/support_hardware.md | 5 + docs/introduction/support_operation_list.md | 412 ++++++++++---------- 3 files changed, 218 insertions(+), 212 deletions(-) diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md index b9f59000874..187f82f563b 100644 --- a/docs/demo_guides/intel_fpga.md +++ b/docs/demo_guides/intel_fpga.md @@ -28,15 +28,16 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 ### 已支持(或部分支持)的Paddle算子 -- relu/relu6/leakyrelu - conv2d - depthwise_conv2d -- pool2d -- fc ## 准备工作 开发板C5MB可以通过串口线进行连接,也可以通过ssh进行连接,初次使用请参考[文档](https://paddlelite-demo.bj.bcebos.com/devices/intel/AIGO_C5MB_UG.pdf) +可以通过串口完成C5MB开发板的IP修改: + ``` + $ vi /etc/network/interfaces # 设备网络配置文件,将对应的address,netmask,和gateway设置为指定的地址即可。 + ``` ## 参考示例演示 @@ -76,7 +77,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 - pascalvoc_label_list # 检测label文件 - models - ssd_mobilenet_v1_fp32_300_for_intel_fpga - - model.nb # 
已通过opt转好的、适合英特尔FPGA的mobilenetv1量化模型 + - ssd_mobilenet_v1_fp32_300_for_intel_fpga.nb # 已通过opt转好的、适合英特尔FPGA的mobilenetv1量化模型 - shell - CMakeLists.txt # 示例程序CMake脚本 - build @@ -85,7 +86,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 - convert_to_raw_image.py # 将测试图片保存为raw数据的python脚本 - build.sh # 示例程序编译脚本 - run.sh # 示例程序运行脚本 - - intelfpgadrv.ko # 英特尔FPGA内核驱动程序 + - intelfpgadrv.ko # 英特尔FPGA内核驱动程序 - libs - PaddleLite - armhf @@ -135,7 +136,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 --valid_targets=intel_fpga,arm 替换自带的英特尔FPGA模型 - $ cp opt_model.nb ssd_mobilenet_v1_fp32_300_for_intel_fpga/model.nb + $ cp opt_model.nb ssd_mobilenet_v1_fp32_300_for_intel_fpga/ssd_mobilenet_v1_fp32_300_for_intel_fpga.nb ``` - 注意:opt生成的模型只是标记了英特尔FPGA支持的Paddle算子,并没有真正生成英特尔FPGA模型,只有在执行时才会将标记的Paddle算子转成英特尔FPGA的APIs,最终生成并执行模型。 diff --git a/docs/introduction/support_hardware.md b/docs/introduction/support_hardware.md index 3b72212eb87..59e103db752 100644 --- a/docs/introduction/support_hardware.md +++ b/docs/introduction/support_hardware.md @@ -56,6 +56,11 @@ Paddle Lite支持 瑞芯微 (Rockchip) NPU,支持列表如下: - 支持芯片:RK1808, RK1806,暂不支持RK3399Pro - 支持设备:RK1808/1806 EVB,TB-RK1808S0 +## 英特尔 (Intel) FPGA +Paddle Lite支持 英特尔 (Inel) FPGA,支持列表如下: +- 支持芯片:Cyclone V +- 支持设备:C5MB,C5TB和C5CB + ## 联发科 (MediaTek) APU Paddle Lite支持 联发科 (MediaTek) APU,支持列表如下: - 支持芯片:MT8168/MT8175,及其他智能芯片 diff --git a/docs/introduction/support_operation_list.md b/docs/introduction/support_operation_list.md index 14d49dc7c9c..4605e67e699 100644 --- a/docs/introduction/support_operation_list.md +++ b/docs/introduction/support_operation_list.md @@ -10,217 +10,217 @@ Host端Kernel是算子在任意CPU上纯C/C++的具体实现,具有可移植 举例PaddleLite在ARM上部署模型,如果模型中某个算子没有ARM端Kernel,但是有Host端Kerenel,那么模型优化阶段该算子会选择Host端Kerenel,该模型还是可以顺利部署。 -| OP Name | Host | X86 | CUDA | ARM | OpenCL | FPGA | 华为NPU | 百度XPU | 瑞芯微NPU | 联发科APU | 颖脉NNA | -|-:|-|-|-|-|-|-|-|-|-|-|-| -| affine_channel |   |   |   | Y |   |   |   |   |   |   |  | -| affine_grid |   |   |   | Y |   |   |   |   |   |   |  | -| 
arg_max |   |   |   | Y |   |   |   |   |   |   |  | -| assign_value |   |   | Y | Y |   |   |   |   |   |   |  | -| batch_norm |   | Y |   | Y |   |   | Y | Y | Y |   |  | -| bilinear_interp |   |   | Y | Y | Y |   | Y |   |   |   |  | -| box_coder |   |   |   | Y | Y |   |   |   |   |   |  | -| calib |   |   | Y | Y |   | Y |   |   |   |   |  | -| cast |   | Y |   | Y |   |   |   | Y |   |   |  | -| concat |   | Y | Y | Y | Y |   | Y |   | Y | Y | | -| conv2d |   | Y | Y | Y | Y | Y | Y | Y | Y | Y | Y | -| conv2d_transpose |   |   |   | Y |   |   | Y |   |   | Y |  | -| density_prior_box |   |   |   | Y |   |   |   |   |   |   |  | -| depthwise_conv2d |   | Y | Y | Y | Y | Y | Y | Y | Y | Y | Y | -| depthwise_conv2d_transpose |   |   |   |   |   |   |   |   |   |   |  | -| dropout |   | Y | Y | Y | Y | Y | Y | Y |   |   |  | -| elementwise_add |   | Y | Y | Y | Y | Y | Y | Y | Y | Y |  | -| elementwise_div |   |   |   | Y |   |   | Y |   | Y |   |  | -| elementwise_max |   |   |   | Y |   |   |   |   |   |   |  | -| elementwise_mod |   |   |   | Y |   |   |   |   |   |   |  | -| elementwise_mul |   | Y | Y | Y | Y | Y | Y |   | Y | Y |  | -| elementwise_pow |   |   |   |   |   |   |   |   |   |   |  | -| elementwise_sub |   | Y | Y | Y | Y |   | Y |   | Y |   |  | -| elu |   |   |   | Y |   |   |   |   |   |   |  | -| expand | Y |   |   |   | Y |   | Y |   |   |   |  | -| expand_as | Y |   |   |   |   |   |   |   |   |   |  | -| fc |   | Y | Y | Y | Y | Y | Y |   | Y | Y | Y | -| feed | Y |   | Y |   |   | Y |   |   |   |   |  | -| fetch | Y |   |   |   |   | Y |   |   |   |   |  | -| fill_constant | Y |   |   |   |   |   |   |   |   |   |  | -| fill_constant_batch_size_like | Y | Y |   |   |   |   |   |   |   |   |  | -| flatten | Y |   |   |   | Y |   |   |   | Y |   |  | -| flatten2 | Y |   |   |   | Y |   |   |   | Y |   |  | -| fusion_elementwise_add_activation |   |   | Y | Y | Y | Y | Y |   |   | Y |  | -| fusion_elementwise_div_activation |   |   |   | Y 
|   |   | Y |   |   |   |  | -| fusion_elementwise_max_activation |   |   |   | Y |   |   |   |   |   |   |  | -| fusion_elementwise_mul_activation |   |   | Y | Y |   |   | Y |   |   |   |  | -| fusion_elementwise_sub_activation |   |   | Y | Y | Y |   | Y |   |   |   |  | -| grid_sampler |   |   |   | Y | Y |   |   |   |   |   |  | -| instance_norm |   |   |   | Y | Y |   | Y |   |   |   |  | -| io_copy |   |   | Y |   | Y | Y |   |   |   |   |  | -| io_copy_once |   |   | Y |   | Y | Y |   |   |   |   |  | -| layout |   |   | Y | Y | Y | Y |   |   |   |   |  | -| leaky_relu |   | Y | Y | Y | Y |   | Y |   |   |   |  | -| matmul |   | Y | Y | Y |   |   | Y | Y |   |   |  | -| mul |   | Y | Y | Y |   |   | Y | Y |   |   |  | -| multiclass_nms | Y |   |   |   |   | Y |   |   |   |   |  | -| multiclass_nms2 | Y |   |   |   |   |   |   |   |   |   |  | -| nearest_interp |   |   | Y | Y | Y |   | Y |   |   |   |  | -| pad2d |   |   |   | Y | Y |   | Y |   | Y |   |  | -| pool2d |   | Y | Y | Y | Y | Y | Y | Y | Y | Y | Y | -| prelu |   |   |   | Y |   |   |   |   |   |   |  | -| prior_box |   |   |   | Y |   | Y |   |   |   |   |  | -| range |   |   |   | Y |   |   |   |   |   |   |  | -| reduce_mean |   |   |   | Y |   |   | Y |   |   |   |  | -| relu |   | Y | Y | Y | Y |   | Y |   | Y | Y | Y | -| relu6 |   |   |   | Y | Y |   | Y |   | Y |   |  | -| reshape | Y | Y |   |   | Y |   | Y | Y |   |   |  | -| reshape2 | Y | Y |   |   | Y |   | Y | Y | Y |   |  | -| scale |   | Y | Y | Y | Y | Y | Y | Y | Y |   |  | -| search_fc |   | Y | Y |   |   |   |   |   |   |   |  | -| sequence_topk_avg_pooling |   | Y | Y |   |   |   |   |   |   |   |  | -| shuffle_channel |   |   |   | Y |   |   | Y |   |   |   |  | -| sigmoid |   | Y | Y | Y | Y |   | Y |   | Y |   |  | -| slice |   | Y |   | Y | Y |   |   | Y |   |   |  | -| softmax |   | Y | Y | Y |   |   | Y | Y | Y | Y |  | -| split |   |   |   | Y |   |   | Y |   |   |   |  | -| squeeze | Y |   |   |   |   |   |   |   |   
|   |  | -| squeeze2 | Y |   |   |   |   |   |   |   |   |   |  | -| stack |   | Y |   | Y |   |   |   | Y |   |   |  | -| subgraph |   |   |   |   |   |   | Y | Y | Y | Y |  | -| tanh |   | Y | Y | Y | Y |   | Y | Y |   |   |  | -| thresholded_relu |   |   |   | Y |   |   | Y |   |   |   |  | -| transpose |   | Y | Y | Y | Y |   | Y | Y |   |   |  | -| transpose2 |   | Y | Y | Y | Y |   | Y | Y | Y |   |  | -| unsqueeze | Y |   |   |   |   |   | Y |   |   |   |  | -| unsqueeze2 | Y |   |   |   |   |   | Y |   |   |   |  | -| yolo_box |   |   | Y | Y |   |   |   | Y |   |   |  | +| OP Name | Host | X86 | CUDA | ARM | OpenCL | FPGA | 华为NPU | 百度XPU | 瑞芯微NPU | 联发科APU | 颖脉NNA | 英特尔FPGA | +|-:|-|-|-|-|-|-|-|-|-|-|-|-| +| affine_channel |   |   |   | Y |   |   |   |   |   |   |  |  | +| affine_grid |   |   |   | Y |   |   |   |   |   |   |  |  | +| arg_max |   |   |   | Y |   |   |   |   |   |   |  |  | +| assign_value |   |   | Y | Y |   |   |   |   |   |   |  |  | +| batch_norm |   | Y |   | Y |   |   | Y | Y | Y |   |  |  | +| bilinear_interp |   |   | Y | Y | Y |   | Y |   |   |   |  |  | +| box_coder |   |   |   | Y | Y |   |   |   |   |   |  |  | +| calib |   |   | Y | Y |   | Y |   |   |   |   |  |  | +| cast |   | Y |   | Y |   |   |   | Y |   |   |  |  | +| concat |   | Y | Y | Y | Y |   | Y |   | Y | Y | |  | +| conv2d |   | Y | Y | Y | Y | Y | Y | Y | Y | Y | Y | Y | +| conv2d_transpose |   |   |   | Y |   |   | Y |   |   | Y |  |  | +| density_prior_box |   |   |   | Y |   |   |   |   |   |   |  |  | +| depthwise_conv2d |   | Y | Y | Y | Y | Y | Y | Y | Y | Y | Y | Y | +| depthwise_conv2d_transpose |   |   |   |   |   |   |   |   |   |   |  |  | +| dropout |   | Y | Y | Y | Y | Y | Y | Y |   |   |  |  | +| elementwise_add |   | Y | Y | Y | Y | Y | Y | Y | Y | Y |  |  | +| elementwise_div |   |   |   | Y |   |   | Y |   | Y |   |  |  | +| elementwise_max |   |   |   | Y |   |   |   |   |   |   |  |  | +| elementwise_mod |   |   |   | Y |   |   |   |   |   |   
|  |  | +| elementwise_mul |   | Y | Y | Y | Y | Y | Y |   | Y | Y |  |  | +| elementwise_pow |   |   |   |   |   |   |   |   |   |   |  |  | +| elementwise_sub |   | Y | Y | Y | Y |   | Y |   | Y |   |  |  | +| elu |   |   |   | Y |   |   |   |   |   |   |  |  | +| expand | Y |   |   |   | Y |   | Y |   |   |   |  |  | +| expand_as | Y |   |   |   |   |   |   |   |   |   |  |  | +| fc |   | Y | Y | Y | Y | Y | Y |   | Y | Y | Y |  | +| feed | Y |   | Y |   |   | Y |   |   |   |   |  |  | +| fetch | Y |   |   |   |   | Y |   |   |   |   |  |  | +| fill_constant | Y |   |   |   |   |   |   |   |   |   |  |  | +| fill_constant_batch_size_like | Y | Y |   |   |   |   |   |   |   |   |  |  | +| flatten | Y |   |   |   | Y |   |   |   | Y |   |  |  | +| flatten2 | Y |   |   |   | Y |   |   |   | Y |   |  |  | +| fusion_elementwise_add_activation |   |   | Y | Y | Y | Y | Y |   |   | Y |  |  | +| fusion_elementwise_div_activation |   |   |   | Y |   |   | Y |   |   |   |  |  | +| fusion_elementwise_max_activation |   |   |   | Y |   |   |   |   |   |   |  |  | +| fusion_elementwise_mul_activation |   |   | Y | Y |   |   | Y |   |   |   |  |  | +| fusion_elementwise_sub_activation |   |   | Y | Y | Y |   | Y |   |   |   |  |  | +| grid_sampler |   |   |   | Y | Y |   |   |   |   |   |  |  | +| instance_norm |   |   |   | Y | Y |   | Y |   |   |   |  |  | +| io_copy |   |   | Y |   | Y | Y |   |   |   |   |  |  | +| io_copy_once |   |   | Y |   | Y | Y |   |   |   |   |  |  | +| layout |   |   | Y | Y | Y | Y |   |   |   |   |  |  | +| leaky_relu |   | Y | Y | Y | Y |   | Y |   |   |   |  |  | +| matmul |   | Y | Y | Y |   |   | Y | Y |   |   |  |  | +| mul |   | Y | Y | Y |   |   | Y | Y |   |   |  |  | +| multiclass_nms | Y |   |   |   |   | Y |   |   |   |   |  |  | +| multiclass_nms2 | Y |   |   |   |   |   |   |   |   |   |  |  | +| nearest_interp |   |   | Y | Y | Y |   | Y |   |   |   |  |  | +| pad2d |   |   |   | Y | Y |   | Y |   | Y |   |  |  | +| pool2d |   | Y 
| Y | Y | Y | Y | Y | Y | Y | Y | Y |  | +| prelu |   |   |   | Y |   |   |   |   |   |   |  |  | +| prior_box |   |   |   | Y |   | Y |   |   |   |   |  |  | +| range |   |   |   | Y |   |   |   |   |   |   |  |  | +| reduce_mean |   |   |   | Y |   |   | Y |   |   |   |  |  | +| relu |   | Y | Y | Y | Y |   | Y |   | Y | Y | Y |  | +| relu6 |   |   |   | Y | Y |   | Y |   | Y |   |  |  | +| reshape | Y | Y |   |   | Y |   | Y | Y |   |   |  |  | +| reshape2 | Y | Y |   |   | Y |   | Y | Y | Y |   |  |  | +| scale |   | Y | Y | Y | Y | Y | Y | Y | Y |   |  |  | +| search_fc |   | Y | Y |   |   |   |   |   |   |   |  |  | +| sequence_topk_avg_pooling |   | Y | Y |   |   |   |   |   |   |   |  |  | +| shuffle_channel |   |   |   | Y |   |   | Y |   |   |   |  |  | +| sigmoid |   | Y | Y | Y | Y |   | Y |   | Y |   |  |  | +| slice |   | Y |   | Y | Y |   |   | Y |   |   |  |  | +| softmax |   | Y | Y | Y |   |   | Y | Y | Y | Y |  |  | +| split |   |   |   | Y |   |   | Y |   |   |   |  |  | +| squeeze | Y |   |   |   |   |   |   |   |   |   |  |  | +| squeeze2 | Y |   |   |   |   |   |   |   |   |   |  |  | +| stack |   | Y |   | Y |   |   |   | Y |   |   |  |  | +| subgraph |   |   |   |   |   |   | Y | Y | Y | Y |  |  | +| tanh |   | Y | Y | Y | Y |   | Y | Y |   |   |  |  | +| thresholded_relu |   |   |   | Y |   |   | Y |   |   |   |  |  | +| transpose |   | Y | Y | Y | Y |   | Y | Y |   |   |  |  | +| transpose2 |   | Y | Y | Y | Y |   | Y | Y | Y |   |  |  | +| unsqueeze | Y |   |   |   |   |   | Y |   |   |   |  |  | +| unsqueeze2 | Y |   |   |   |   |   | Y |   |   |   |  |  | +| yolo_box |   |   | Y | Y |   |   |   | Y |   |   |  |  | ### 附加算子 附加算子共计127个,需要在编译时打开`--build_extra=ON`开关才会编译,具体请参考[参数详情](../source_compile/library)。 -| OP Name | Host | X86 | CUDA | ARM | OpenCL | FPGA | 华为NPU | 百度XPU | 瑞芯微NPU | 联发科APU | -|-:|-|-|-|-|-|-|-|-|-|-| -| abs |   |   | Y | Y |   |   |   |   |   |   | -| anchor_generator |   |   |   | Y |   |   |   |   |   |   | +| OP 
Name | Host | X86 | CUDA | ARM | OpenCL | FPGA | 华为NPU | 百度XPU | 瑞芯微NPU | 联发科APU | 英特尔FPGA | +|-:|-|-|-|-|-|-|-|-|-|-|-| +| abs |   |   | Y | Y |   |   |   |   |   |   |   | +| anchor_generator |   |   |   | Y |   |   |   |   |   |   |   | | assign | Y |   |   |   |   |   |   |   |   |   | -| attention_padding_mask |   |   |   |   |   |   |   |   |   |   | -| axpy |   |   |   | Y |   |   |   |   |   |   | -| beam_search_decode |   |   |   | Y |   |   |   |   |   |   | -| beam_search_decode |   |   |   | Y |   |   |   |   |   |   | -| box_clip |   |   |   | Y |   |   |   |   |   |   | -| calib_once |   |   | Y | Y |   | Y |   |   |   |   | -| clip |   |   |   | Y |   |   |   |   |   |   | -| collect_fpn_proposals |   |   |   | Y |   |   |   |   |   |   | -| conditional_block | Y |   |   |   |   |   |   |   |   |   | -| crf_decoding | Y |   |   |   |   |   |   |   |   |   | -| crop |   |   |   | Y |   |   |   |   |   |   | -| ctc_align | Y |   |   |   |   |   |   |   |   |   | -| decode_bboxes |   |   |   | Y |   |   |   |   |   |   | -| deformable_conv |   |   |   | Y |   |   |   |   |   |   | -| distribute_fpn_proposals |   |   |   | Y |   |   |   |   |   |   | -| equal | Y |   |   |   |   |   |   |   |   |   | -| exp |   |   |   | Y | Y |   |   |   |   |   | -| fake_channel_wise_dequantize_max_abs |   |   |   |   |   |   |   |   |   |   | -| fake_dequantize_max_abs |   |   |   |   |   |   |   |   |   |   | -| fake_quantize_abs_max |   |   |   |   |   |   |   |   |   |   | -| fake_quantize_dequantize_abs_max |   |   |   |   |   |   |   |   |   |   | -| fake_quantize_dequantize_moving_average_abs_max |   |   |   |   |   |   |   |   |   |   | -| fake_quantize_moving_average_abs_max |   |   |   |   |   |   |   |   |   |   | -| fake_quantize_range_abs_max |   |   |   |   |   |   |   |   |   |   | -| floor |   |   |   | Y |   |   |   |   |   |   | -| gather |   | Y |   | Y |   |   |   | Y |   |   | -| gelu |   | Y |   |   |   |   |   |   |   |   | -| generate_proposals 
|   |   |   | Y |   |   |   |   |   |   | -| greater_equal | Y |   |   |   |   |   |   |   |   |   | -| greater_than | Y |   |   |   |   |   |   |   |   |   | -| group_norm |   |   |   | Y |   |   |   |   |   |   | -| gru |   | Y | Y | Y |   | Y |   |   |   |   | -| gru_unit |   |   |   | Y |   |   |   |   |   |   | -| hard_sigmoid |   |   |   | Y | Y |   | Y |   |   |   | -| hard_swish |   |   |   | Y |   |   |   |   |   |   | -| im2sequence |   |   |   | Y |   |   |   |   |   |   | -| increment |   |   |   | Y |   |   | Y |   |   |   | -| is_empty | Y |   |   |   |   |   |   |   |   |   | -| layer_norm |   | Y |   | Y |   |   | Y | Y |   |   | -| layout_once |   |   | Y | Y |   | Y |   |   |   |   | -| less_equal | Y |   |   |   |   |   |   |   |   |   | -| less_than | Y |   |   |   |   |   | Y |   |   |   | -| lod_reset |   |   |   | Y |   |   |   |   |   |   | -| log |   |   |   | Y |   |   | Y |   |   |   | -| logical_and | Y |   |   |   |   |   |   |   |   |   | -| logical_not | Y |   |   |   |   |   |   |   |   |   | -| logical_or | Y |   |   |   |   |   |   |   |   |   | -| logical_xor | Y |   |   |   |   |   |   |   |   |   | -| lookup_table |   | Y | Y | Y |   |   |   | Y |   |   | -| lookup_table_dequant |   |   |   | Y |   |   |   |   |   |   | -| lookup_table_v2 |   | Y | Y | Y |   |   |   |   |   |   | -| lrn |   |   |   | Y | Y |   |   |   |   |   | -| lstm |   |   |   | Y |   |   |   |   |   |   | -| match_matrix_tensor |   | Y | Y |   |   |   |   |   |   |   | -| max_pool2d_with_index |   |   |   |   |   |   |   |   |   |   | -| mean |   |   |   | Y |   |   |   |   |   |   | -| merge_lod_tensor |   |   |   | Y |   |   |   |   |   |   | -| negative |   |   |   | Y |   |   |   |   |   |   | -| norm |   |   |   | Y |   | Y |   |   |   |   | -| not_equal | Y |   |   |   |   |   |   |   |   |   | -| one_hot | Y |   |   |   |   |   |   |   |   |   | -| pixel_shuffle | Y |   |   | Y | Y |   |   |   |   |   | -| pow |   |   |   | Y |   |   |   |   |   |   
| -| print | Y |   |   |   |   |   |   |   |   |   | -| read_from_array | Y |   |   |   |   |   |   |   |   |   | -| reciprocal |   |   |   | Y |   |   |   |   |   |   | -| reduce_max |   |   |   | Y |   |   |   |   |   |   | -| reduce_prod |   |   |   | Y |   |   |   |   |   |   | -| reduce_sum |   | Y |   |   |   |   |   | Y |   |   | -| relu_clipped |   |   |   | Y |   |   | Y |   |   |   | -| retinanet_detection_output | Y |   |   |   |   |   |   |   |   |   | -| roi_align |   |   |   | Y |   |   |   |   |   |   | -| rsqrt |   |   |   | Y |   |   |   |   |   |   | -| search_aligned_mat_mul |   | Y | Y |   |   |   |   |   |   |   | -| search_attention_padding_mask |   | Y | Y |   |   |   |   |   |   |   | -| search_grnn |   | Y | Y |   |   |   |   |   |   |   | -| search_group_padding |   | Y | Y |   |   |   |   |   |   |   | -| search_seq_arithmetic |   | Y | Y |   |   |   |   |   |   |   | -| search_seq_depadding |   | Y | Y |   |   |   |   |   |   |   | -| search_seq_fc |   | Y | Y |   |   |   |   |   |   |   | -| search_seq_softmax |   | Y | Y |   |   |   |   |   |   |   | -| sequence_arithmetic |   | Y | Y |   |   |   |   |   |   |   | -| sequence_concat |   | Y | Y |   |   |   |   |   |   |   | -| sequence_conv |   | Y |   | Y |   |   |   |   |   |   | -| sequence_expand |   |   |   | Y |   |   |   |   |   |   | -| sequence_expand_as |   | Y |   |   |   |   |   |   |   |   | -| sequence_mask |   |   | Y |   |   |   |   |   |   |   | -| sequence_pad |   |   | Y |   |   |   |   |   |   |   | -| sequence_pool |   | Y | Y | Y |   |   |   |   |   |   | -| sequence_pool_concat |   |   | Y |   |   |   |   |   |   |   | -| sequence_reshape |   | Y |   |   |   |   |   |   |   |   | -| sequence_reverse |   | Y | Y |   |   |   |   |   |   |   | -| sequence_reverse_embedding |   |   | Y |   |   |   |   |   |   |   | -| sequence_softmax |   |   |   | Y |   |   |   |   |   |   | -| sequence_unpad | Y | | Y |   |   |   |   |   |   |   | -| shape | Y | Y |   |   |   |   | 
  |   |   |   | -| sign |   |   |   |   |   |   |   |   |   |   | -| softsign |   | Y |   |   |   |   | Y |   |   |   | -| split_lod_tensor |   |   |   | Y |   |   |   |   |   |   | -| sqrt |   |   |   |   |   |   | Y |   |   |   | -| square |   | Y |   | Y |   |   | Y |   |   |   | -| swish |   |   |   | Y | Y |   |   |   |   |   | -| top_k |   |   |   | Y |   |   |   |   |   |   | -| topk_pooling |   |   | Y |   |   |   |   |   |   |   | -| uniform_random |   |   |   |   |   |   |   |   |   |   | -| var_conv_2d |   | Y | Y |   |   |   |   |   |   |   | -| where_index | Y |   |   |   |   |   |   |   |   |   | -| while | Y |   |   |   |   |   |   |   |   |   | -| write_to_array | Y |   |   |   |   |   |   |   |   |   | -| __xpu__conv2d |   |   |   |   |   |   |   | Y |   |   | -| __xpu__embedding_with_eltwise_add |   |   |   |   |   |   |   | Y |   |   | -| __xpu__fc |   |   |   |   |   |   |   | Y |   |   | -| __xpu__mmdnn_bid_emb_att |   |   |   |   |   |   |   | Y |   |   | -| __xpu__mmdnn_bid_emb_grnn_att |   |   |   |   |   |   |   | Y |   |   | -| __xpu__mmdnn_bid_emb_grnn_att2 |   |   |   |   |   |   |   | Y |   |   | -| __xpu__mmdnn_match_conv_topk |   |   |   |   |   |   |   | Y |   |   | +| attention_padding_mask |   |   |   |   |   |   |   |   |   |   |   | +| axpy |   |   |   | Y |   |   |   |   |   |   |   | +| beam_search_decode |   |   |   | Y |   |   |   |   |   |   |   | +| beam_search_decode |   |   |   | Y |   |   |   |   |   |   |   | +| box_clip |   |   |   | Y |   |   |   |   |   |   |   | +| calib_once |   |   | Y | Y |   | Y |   |   |   |   |   | +| clip |   |   |   | Y |   |   |   |   |   |   |   | +| collect_fpn_proposals |   |   |   | Y |   |   |   |   |   |   |   | +| conditional_block | Y |   |   |   |   |   |   |   |   |   |   | +| crf_decoding | Y |   |   |   |   |   |   |   |   |   |   | +| crop |   |   |   | Y |   |   |   |   |   |   |   | +| ctc_align | Y |   |   |   |   |   |   |   |   |   |   | +| decode_bboxes |   |   |   | Y |  
 |   |   |   |   |   |   | +| deformable_conv |   |   |   | Y |   |   |   |   |   |   |   | +| distribute_fpn_proposals |   |   |   | Y |   |   |   |   |   |   |   | +| equal | Y |   |   |   |   |   |   |   |   |   |   | +| exp |   |   |   | Y | Y |   |   |   |   |   |   | +| fake_channel_wise_dequantize_max_abs |   |   |   |   |   |   |   |   |   |   |   | +| fake_dequantize_max_abs |   |   |   |   |   |   |   |   |   |   |   | +| fake_quantize_abs_max |   |   |   |   |   |   |   |   |   |   |   | +| fake_quantize_dequantize_abs_max |   |   |   |   |   |   |   |   |   |   |   | +| fake_quantize_dequantize_moving_average_abs_max |   |   |   |   |   |   |   |   |   |   |   | +| fake_quantize_moving_average_abs_max |   |   |   |   |   |   |   |   |   |   |   | +| fake_quantize_range_abs_max |   |   |   |   |   |   |   |   |   |   |   | +| floor |   |   |   | Y |   |   |   |   |   |   |   | +| gather |   | Y |   | Y |   |   |   | Y |   |   |   | +| gelu |   | Y |   |   |   |   |   |   |   |   |   | +| generate_proposals |   |   |   | Y |   |   |   |   |   |   |   | +| greater_equal | Y |   |   |   |   |   |   |   |   |   |   | +| greater_than | Y |   |   |   |   |   |   |   |   |   |   | +| group_norm |   |   |   | Y |   |   |   |   |   |   |   | +| gru |   | Y | Y | Y |   | Y |   |   |   |   |   | +| gru_unit |   |   |   | Y |   |   |   |   |   |   |   | +| hard_sigmoid |   |   |   | Y | Y |   | Y |   |   |   |   | +| hard_swish |   |   |   | Y |   |   |   |   |   |   |   | +| im2sequence |   |   |   | Y |   |   |   |   |   |   |   | +| increment |   |   |   | Y |   |   | Y |   |   |   |   | +| is_empty | Y |   |   |   |   |   |   |   |   |   |   | +| layer_norm |   | Y |   | Y |   |   | Y | Y |   |   |   | +| layout_once |   |   | Y | Y |   | Y |   |   |   |   |   | +| less_equal | Y |   |   |   |   |   |   |   |   |   |   | +| less_than | Y |   |   |   |   |   | Y |   |   |   |   | +| lod_reset |   |   |   | Y |   |   |   |   |   |   |   | +| log |   |   |   | Y |  
 |   | Y |   |   |   |   | +| logical_and | Y |   |   |   |   |   |   |   |   |   |   | +| logical_not | Y |   |   |   |   |   |   |   |   |   |   | +| logical_or | Y |   |   |   |   |   |   |   |   |   |   | +| logical_xor | Y |   |   |   |   |   |   |   |   |   |   | +| lookup_table |   | Y | Y | Y |   |   |   | Y |   |   |   | +| lookup_table_dequant |   |   |   | Y |   |   |   |   |   |   |   | +| lookup_table_v2 |   | Y | Y | Y |   |   |   |   |   |   |   | +| lrn |   |   |   | Y | Y |   |   |   |   |   |   | +| lstm |   |   |   | Y |   |   |   |   |   |   |   | +| match_matrix_tensor |   | Y | Y |   |   |   |   |   |   |   |   | +| max_pool2d_with_index |   |   |   |   |   |   |   |   |   |   |   | +| mean |   |   |   | Y |   |   |   |   |   |   |   | +| merge_lod_tensor |   |   |   | Y |   |   |   |   |   |   |   | +| negative |   |   |   | Y |   |   |   |   |   |   |   | +| norm |   |   |   | Y |   | Y |   |   |   |   |   | +| not_equal | Y |   |   |   |   |   |   |   |   |   |   | +| one_hot | Y |   |   |   |   |   |   |   |   |   |   | +| pixel_shuffle | Y |   |   | Y | Y |   |   |   |   |   |   | +| pow |   |   |   | Y |   |   |   |   |   |   |   | +| print | Y |   |   |   |   |   |   |   |   |   |   | +| read_from_array | Y |   |   |   |   |   |   |   |   |   |   | +| reciprocal |   |   |   | Y |   |   |   |   |   |   |   | +| reduce_max |   |   |   | Y |   |   |   |   |   |   |   | +| reduce_prod |   |   |   | Y |   |   |   |   |   |   |   | +| reduce_sum |   | Y |   |   |   |   |   | Y |   |   |   | +| relu_clipped |   |   |   | Y |   |   | Y |   |   |   |   | +| retinanet_detection_output | Y |   |   |   |   |   |   |   |   |   |   | +| roi_align |   |   |   | Y |   |   |   |   |   |   |   | +| rsqrt |   |   |   | Y |   |   |   |   |   |   |   | +| search_aligned_mat_mul |   | Y | Y |   |   |   |   |   |   |   |   | +| search_attention_padding_mask |   | Y | Y |   |   |   |   |   |   |   |   | +| search_grnn |   | Y | Y |   |   |   |   |   |   |   |  
 | +| search_group_padding |   | Y | Y |   |   |   |   |   |   |   |   | +| search_seq_arithmetic |   | Y | Y |   |   |   |   |   |   |   |   | +| search_seq_depadding |   | Y | Y |   |   |   |   |   |   |   |   | +| search_seq_fc |   | Y | Y |   |   |   |   |   |   |   |   | +| search_seq_softmax |   | Y | Y |   |   |   |   |   |   |   |   | +| sequence_arithmetic |   | Y | Y |   |   |   |   |   |   |   |   | +| sequence_concat |   | Y | Y |   |   |   |   |   |   |   |   | +| sequence_conv |   | Y |   | Y |   |   |   |   |   |   |   | +| sequence_expand |   |   |   | Y |   |   |   |   |   |   |   | +| sequence_expand_as |   | Y |   |   |   |   |   |   |   |   |   | +| sequence_mask |   |   | Y |   |   |   |   |   |   |   |   | +| sequence_pad |   |   | Y |   |   |   |   |   |   |   |   | +| sequence_pool |   | Y | Y | Y |   |   |   |   |   |   |   | +| sequence_pool_concat |   |   | Y |   |   |   |   |   |   |   |   | +| sequence_reshape |   | Y |   |   |   |   |   |   |   |   |   | +| sequence_reverse |   | Y | Y |   |   |   |   |   |   |   |   | +| sequence_reverse_embedding |   |   | Y |   |   |   |   |   |   |   |   | +| sequence_softmax |   |   |   | Y |   |   |   |   |   |   |   | +| sequence_unpad | Y | | Y |   |   |   |   |   |   |   |   | +| shape | Y | Y |   |   |   |   |   |   |   |   |   | +| sign |   |   |   |   |   |   |   |   |   |   |   | +| softsign |   | Y |   |   |   |   | Y |   |   |   |   | +| split_lod_tensor |   |   |   | Y |   |   |   |   |   |   |   | +| sqrt |   |   |   |   |   |   | Y |   |   |   |   | +| square |   | Y |   | Y |   |   | Y |   |   |   |   | +| swish |   |   |   | Y | Y |   |   |   |   |   |   | +| top_k |   |   |   | Y |   |   |   |   |   |   |   | +| topk_pooling |   |   | Y |   |   |   |   |   |   |   |   | +| uniform_random |   |   |   |   |   |   |   |   |   |   |   | +| var_conv_2d |   | Y | Y |   |   |   |   |   |   |   |   | +| where_index | Y |   |   |   |   |   |   |   |   |   |   | +| while | Y |   |   |   |   
|   |   |   |   |   |   | +| write_to_array | Y |   |   |   |   |   |   |   |   |   |   | +| __xpu__conv2d |   |   |   |   |   |   |   | Y |   |   |   | +| __xpu__embedding_with_eltwise_add |   |   |   |   |   |   |   | Y |   |   |   | +| __xpu__fc |   |   |   |   |   |   |   | Y |   |   |   | +| __xpu__mmdnn_bid_emb_att |   |   |   |   |   |   |   | Y |   |   |   | +| __xpu__mmdnn_bid_emb_grnn_att |   |   |   |   |   |   |   | Y |   |   |   | +| __xpu__mmdnn_bid_emb_grnn_att2 |   |   |   |   |   |   |   | Y |   |   |   | +| __xpu__mmdnn_match_conv_topk |   |   |   |   |   |   |   | Y |   |   |   | | __xpu__mmdnn_merge_all |   |   |   |   |   |   |   | Y |   |   | -| __xpu__mmdnn_search_attention |   |   |   |   |   |   |   | Y |   |   | -| __xpu__multi_encoder |   |   |   |   |   |   |   | Y |   |   | -| __xpu__resnet_cbam |   |   |   |   |   |   |   | Y |   |   | -| __xpu__resnet50 |   |   |   |   |   |   |   | Y |   |   | -| __xpu__sfa_head |   |   |   |   |   |   |   | Y |   |   | -| matrix_nms | Y |   |   |   |   |   |   |   |   |   | +| __xpu__mmdnn_search_attention |   |   |   |   |   |   |   | Y |   |   |   | +| __xpu__multi_encoder |   |   |   |   |   |   |   | Y |   |   |   | +| __xpu__resnet_cbam |   |   |   |   |   |   |   | Y |   |   |   | +| __xpu__resnet50 |   |   |   |   |   |   |   | Y |   |   |   | +| __xpu__sfa_head |   |   |   |   |   |   |   | Y |   |   |   | +| matrix_nms | Y |   |   |   |   |   |   |   |   |   |   | From c22d58b23f63c4549aee6f3838156661b149fbc5 Mon Sep 17 00:00:00 2001 From: xbeu Date: Tue, 30 Mar 2021 09:17:34 +0000 Subject: [PATCH 18/19] test=develop --- docs/demo_guides/intel_fpga.md | 4 ++-- lite/tools/build_linux.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md index 187f82f563b..0c388c1c453 100644 --- a/docs/demo_guides/intel_fpga.md +++ b/docs/demo_guides/intel_fpga.md @@ -24,7 +24,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 ### 
已支持的Paddle模型 -- [ssd_mobilenet_v1_pascalvoc](https://https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz) +- [ssd_mobilenet_v1_pascalvoc](https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz) ### 已支持(或部分支持)的Paddle算子 @@ -149,7 +149,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 $ git clone https://github.com/PaddlePaddle/Paddle-Lite.git $ cd Paddle-Lite $ git checkout <release-version-tag> - $ curl -L https://paddlelite-demo.bj.bcebos.com/devices/intel/intel_fpga_sdk.tar.gz -o - | tar -zx + $ curl -L https://paddlelite-demo.bj.bcebos.com/devices/intel/intel_fpga_sdk_1.0.0.tar.gz -o - | tar -zx ``` - 编译并生成PaddleLite+IntelFPGA的部署库 diff --git a/lite/tools/build_linux.sh b/lite/tools/build_linux.sh index 7e23cf08baf..77c628d96e0 100755 --- a/lite/tools/build_linux.sh +++ b/lite/tools/build_linux.sh @@ -9,7 +9,7 @@ ARCH=armv8 # gcc or clang, default gcc. TOOLCHAIN=gcc # ON or OFF, default OFF. -WITH_EXTRA=ON +WITH_EXTRA=OFF # controls whether to compile python lib, default is OFF.
WITH_PYTHON=OFF PY_VERSION="" From 709796428b54d9f920038450003f3022d6835c7f Mon Sep 17 00:00:00 2001 From: YIQUAN YOU <79895409+xbeu@users.noreply.github.com> Date: Tue, 30 Mar 2021 19:06:59 +0800 Subject: [PATCH 19/19] Update intel_fpga.md --- docs/demo_guides/intel_fpga.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/demo_guides/intel_fpga.md b/docs/demo_guides/intel_fpga.md index 0c388c1c453..2813893af17 100644 --- a/docs/demo_guides/intel_fpga.md +++ b/docs/demo_guides/intel_fpga.md @@ -76,8 +76,8 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 - labels - pascalvoc_label_list # 检测label文件 - models - - ssd_mobilenet_v1_fp32_300_for_intel_fpga - - ssd_mobilenet_v1_fp32_300_for_intel_fpga.nb # 已通过opt转好的、适合英特尔FPGA的mobilenetv1量化模型 + - ssd_mobilenet_v1_fp32_300_for_intel_fpga + - ssd_mobilenet_v1_fp32_300_for_intel_fpga.nb # 已通过opt转好的、适合英特尔FPGA的mobilenetv1量化模型 - shell - CMakeLists.txt # 示例程序CMake脚本 - build @@ -93,7 +93,7 @@ PaddleLite支持英特尔FPGA作为后端硬件进行模型推理,其主要特 - include # PaddleLite头文件 - lib - libvnna.so # 英特尔FPGA推理运行时库 - - libpaddle_light_api_shared.so # 用于最终移动端部署的预编译PaddleLite库(tiny publish模式下编译生成的库) + - libpaddle_light_api_shared.so # 用于最终移动端部署的预编译PaddleLite库(tiny publish模式下编译生成的库) - libpaddle_full_api_shared.so # 用于直接加载Paddle模型进行测试和Debug的预编译PaddleLite库(full publish模式下编译生成的库) ```