From 93c39779b49a40df5e05c503dba423e381d58daf Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Mon, 16 Nov 2020 10:43:51 +0800 Subject: [PATCH 01/56] open a part of GPU unittest for windows (#28378) * open a part of GPU unittest for windows * open a part of GPU unittest for windows --- CMakeLists.txt | 30 +++-- cmake/init.cmake | 10 +- paddle/scripts/paddle_build.bat | 223 +++++++++++++++++++++++--------- 3 files changed, 184 insertions(+), 79 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 91820123da4831..2faa0a2bbbcb3f 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,25 +74,39 @@ if(WIN32) endforeach(flag_var) endif() - # windows build turn off warnings. + # windows build turn off warnings, use parallel compiling. foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") + set(${flag_var} "${${flag_var}} /MP /bigobj") endforeach(flag_var) foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) set(${flag_var} "${${flag_var}} /w") endforeach(flag_var) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP") - message(STATUS "Using parallel compiling (/MP)") - set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221") - set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + # Windows Remove /Zi, /ZI for Release, MinSizeRel builds + foreach(flag_var + CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL) + if(${flag_var} MATCHES "/Z[iI]") + string(REGEX REPLACE "/Z[iI]" "" ${flag_var} "${${flag_var}}") + endif() + endforeach(flag_var) + + foreach(flag_var + CMAKE_STATIC_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS + CMAKE_EXE_LINKER_FLAGS) + set(${flag_var} "${${flag_var}} /IGNORE:4006 /IGNORE:4098 /ignore:4049 /IGNORE:4217 /IGNORE:4221") + if(${flag_var} MATCHES "/INCREMENTAL" AND NOT ${flag_var} MATCHES "/INCREMENTAL:NO") + string(REGEX REPLACE "/INCREMENTAL" "/INCREMENTAL:NO" ${flag_var} "${${flag_var}}") + endif() + endforeach(flag_var) + + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") else(WIN32) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations") endif(WIN32) diff --git a/cmake/init.cmake b/cmake/init.cmake index 5f36a9adf1ae63..aea02088750df4 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -1,7 +1,7 @@ # Attention: cmake will append these flags to compile command automatically. 
# So if you want to add global option, change this file rather than flags.cmake -# NOT WIN32 +# Linux # DEBUG: default: "-g" # RELEASE: default: "-O3 -DNDEBUG" # RELWITHDEBINFO: default: "-O2 -g -DNDEBUG" @@ -17,6 +17,8 @@ if(NOT WIN32) set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") +else() + set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) endif() if(WITH_GPU) @@ -25,9 +27,3 @@ if(WITH_GPU) set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG") endif() - -if(WIN32) - set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -Os -DNDEBUG") -endif() - diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index d557cad1c4c6fc..450cb7546fd4c3 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -1,3 +1,4 @@ +@ECHO OFF rem Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. rem rem Licensed under the Apache License, Version 2.0 (the "License"); @@ -22,15 +23,16 @@ setlocal rem -------clean up environment----------- set work_dir=%cd% set cache_dir=%work_dir:Paddle=cache% +if not exist %cache_dir%\tools ( + git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools +) taskkill /f /im op_function_generator.exe wmic process where name="op_function_generator.exe" call terminate rem ------initialize common variable------ -if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0" if not defined BRANCH set BRANCH=develop if not defined TENSORRT_ROOT set TENSORRT_ROOT="C:/TensorRT-5.1.5.0" if not defined WITH_MKL set WITH_MKL=ON -if not defined WITH_GPU set WITH_GPU=OFF if not defined WITH_AVX set WITH_AVX=ON if not defined WITH_TESTING set WITH_TESTING=ON if not defined WITH_PYTHON set WITH_PYTHON=ON @@ -60,7 +62,7 @@ setlocal enabledelayedexpansion git show-ref --verify --quiet refs/heads/last_pr if %ERRORLEVEL% EQU 0 ( git diff HEAD last_pr --stat --name-only - git diff HEAD last_pr --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat" + git diff HEAD last_pr --stat --name-only | findstr ".cmake CMakeLists.txt paddle_build.bat" if !ERRORLEVEL! 
EQU 0 ( rmdir build /s/q ) @@ -71,19 +73,19 @@ if %ERRORLEVEL% EQU 0 ( git branch last_pr ) -for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# -set day_now=%datetime:~6,2% -set day_before=-1 -set /p day_before=< %cache_dir%\day.txt -if %day_now% NEQ %day_before% ( - echo %day_now% > %cache_dir%\day.txt - type %cache_dir%\day.txt - rmdir build /s/q - goto :mkbuild -) +:: for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# +:: set day_now=%datetime:~6,2% +:: set day_before=-1 +:: set /p day_before=< %cache_dir%\day.txt +:: if %day_now% NEQ %day_before% ( +:: echo %day_now% > %cache_dir%\day.txt +:: type %cache_dir%\day.txt +:: rmdir build /s/q +:: goto :mkbuild +:: ) :: git diff HEAD origin/develop --stat --name-only -:: git diff HEAD origin/develop --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat" +:: git diff HEAD origin/develop --stat --name-only | findstr ".cmake CMakeLists.txt paddle_build.bat" :: if %ERRORLEVEL% EQU 0 ( :: rmdir build /s/q :: ) @@ -117,13 +119,12 @@ pip install gym --user pip install -U -r %work_dir%\python\requirements.txt --user pip install -U -r %work_dir%\python\unittest_py\requirements.txt --user if %ERRORLEVEL% NEQ 0 ( - call paddle_winci\Scripts\deactivate.bat 2>NUL echo pip install requirements.txt failed! exit /b 7 ) rem ------pre install clcache and init config---------- -pip install clcache +pip install clcache --user :: set USE_CLCACHE to enable clcache set USE_CLCACHE=1 :: In some scenarios, CLCACHE_HARDLINK can save one file copy. @@ -133,29 +134,9 @@ set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 :: set maximum cache size to 20G clcache.exe -M 21474836480 -rem ------set cache third_party------ -if not exist %cache_dir%\tools ( - git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools -) - -if "%WITH_TPCACHE%"=="OFF" ( - set THIRD_PARTY_PATH=%work_dir:\=/%/build/third_party - goto :CASE_%1 -) - -echo set -ex > cache.sh -echo md5_content=$(cat %work_dir:\=/%/cmake/external/*.cmake ^|md5sum ^| awk '{print $1}') >> cache.sh -echo echo ${md5_content}^>md5.txt >> cache.sh - -%cache_dir%\tools\busybox64.exe cat cache.sh -%cache_dir%\tools\busybox64.exe bash cache.sh - -set /p md5=< md5.txt -if "%WITH_GPU%"=="ON" ( - set THIRD_PARTY_PATH=%cache_dir:\=/%/third_party_GPU/%md5% -) else ( - set THIRD_PARTY_PATH=%cache_dir:\=/%/third_party/%md5% -) +rem ------show summary of current environment---------- +python %work_dir%\tools\summary_env.py +%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\get_cpu_info.sh goto :CASE_%1 @@ -166,52 +147,88 @@ echo "wincheck_openbals: run Windows OPENBLAS/CPU CI tasks on Windows" exit /b 1 :CASE_wincheck_mkl + +rem ------initialize cmake variable for mkl------ set WITH_MKL=ON set WITH_GPU=OFF set MSVC_STATIC_CRT=ON + call :cmake || goto cmake_error call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error call :unit_test || goto unit_test_error call :test_inference || goto test_inference_error -call :check_change_of_unittest || goto check_change_of_unittest_error +:: call :check_change_of_unittest || goto check_change_of_unittest_error goto:success :CASE_wincheck_openblas -set WITH_MKL=OFF + +rem ------initialize cmake variable for openblas------ +set WITH_MKL=ON set WITH_GPU=ON set MSVC_STATIC_CRT=OFF rem Temporarily turn off WITH_INFERENCE_API_TEST on GPU due to compile hang set WITH_INFERENCE_API_TEST=OFF + call :cmake || goto cmake_error call :build || goto build_error call :test_whl_pacakage || goto 
test_whl_pacakage_error +:: call :unit_test || goto unit_test_error :: call :test_inference || goto test_inference_error +:: call :check_change_of_unittest || goto check_change_of_unittest_error goto:success rem "Other configurations are added here" rem :CASE_wincheck_others rem call ... - rem --------------------------------------------------------------------------------------------- :cmake echo ======================================== echo Step 1. Cmake ... echo ======================================== + call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# set start=%start:~4,10% + +@ECHO ON +if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0 +set PATH=%CUDA_TOOLKIT_ROOT_DIR%\bin;%CUDA_TOOLKIT_ROOT_DIR%\libnvvp;%PATH% +set CUDA_PATH=%CUDA_TOOLKIT_ROOT_DIR% + +rem ------set third_party cache dir------ + +if "%WITH_TPCACHE%"=="OFF" ( + set THIRD_PARTY_PATH=%work_dir:\=/%/build/third_party + goto :cmake_impl +) + +echo set -ex > cache.sh +echo md5_content=$(cat %work_dir:\=/%/cmake/external/*.cmake ^|md5sum ^| awk '{print $1}') >> cache.sh +echo echo ${md5_content}^>md5.txt >> cache.sh + +%cache_dir%\tools\busybox64.exe cat cache.sh +%cache_dir%\tools\busybox64.exe bash cache.sh + +set /p md5=< md5.txt +if "%WITH_GPU%"=="ON" ( + set THIRD_PARTY_PATH=%cache_dir:\=/%/third_party_GPU/%md5% +) else ( + set THIRD_PARTY_PATH=%cache_dir:\=/%/third_party/%md5% +) + +:cmake_impl echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ --DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ --DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ +-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DON_INFER=%ON_INFER% ^ +-DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ -DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ --DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ --DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ +-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DON_INFER=%ON_INFER% ^ +-DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ -DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% goto:eof @@ -224,6 +241,7 @@ exit /b 7 rem --------------------------------------------------------------------------------------------- :build +@ECHO OFF echo ======================================== echo Step 2. Buile Paddle ... echo ======================================== @@ -270,6 +288,7 @@ exit /b 7 rem --------------------------------------------------------------------------------------------- :test_whl_pacakage +@ECHO OFF echo ======================================== echo Step 3. Test pip install whl package ... 
echo ======================================== @@ -282,7 +301,7 @@ call :timestamp "%start%" "%end%" "Build" tree /F %cd%\paddle_inference_install_dir\paddle %cache_dir%\tools\busybox64.exe du -h -d 0 -k %cd%\paddle_inference_install_dir\paddle\lib > lib_size.txt set /p libsize=< lib_size.txt -@ECHO OFF + for /F %%i in ("%libsize%") do ( set /a libsize_m=%%i/1024 echo "Windows Paddle_Inference Size: !libsize_m!M" @@ -303,17 +322,19 @@ if %ERRORLEVEL% NEQ 0 ( exit /b 1 ) +set CUDA_VISIBLE_DEVICES=0 python %work_dir%\paddle\scripts\installation_validate.py goto:eof :test_whl_pacakage_error -echo 1 > %cache_dir%\error_code.txt -type %cache_dir%\error_code.txt +::echo 1 > %cache_dir%\error_code.txt +::type %cache_dir%\error_code.txt echo Test import paddle failed, will exit! exit /b 1 rem --------------------------------------------------------------------------------------------- :unit_test +@ECHO OFF echo ======================================== echo Step 4. Running unit tests ... echo ======================================== @@ -339,6 +360,7 @@ if %errorlevel%==0 ( set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;^ %THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;^ %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH% + if "%NIGHTLY_MODE%"=="ON" ( set nightly_label="()" ) else ( @@ -348,12 +370,82 @@ if "%NIGHTLY_MODE%"=="ON" ( echo ======================================== ) +if "%WITH_GPU%"=="ON" ( + goto:parallel_test_base_gpu +) else ( + goto:parallel_test_base_cpu +) + +:parallel_test_base_gpu +echo ======================================== +echo Running GPU unit tests in parallel way ... +echo ======================================== + +set FLAGS_fraction_of_gpu_memory_to_use=0.75 + +nvidia-smi -L +for /F %%# in ('nvidia-smi -L ^| findstr "GPU" /C /I') do set CUDA_DEVICE_COUNT=%%# +if !errorlevel! 
NEQ 0 exit /b 8 + +rem TODO: fix these unittest that is bound to fail +rem /*==================Disabled Windows==============================*/ +set diable_wingpu_test=tensor_util_test^|lod_tensor_test^|selected_rows_test^|broadcast_op_test^|fused_broadcast_op_test^|assign_op_test^|save_load_op_test^|save_load_combine_op_test^|im2col_test^|^ +beam_search_test^|test_analysis_predictor^|test_model^|test_add_reader_dependency^|test_bilateral_slice_op^|test_buffer_shared_memory_reuse_pass^|test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass^|^ +test_cholesky_op^|test_dataloader_early_reset^|test_dataloader_keep_order^|test_dataloader_unkeep_order^|test_decoupled_py_reader^|test_decoupled_py_reader_data_check^|test_eager_deletion_delete_vars^|^ +test_eager_deletion_while_op^|test_feed_data_check_shape_type^|test_fetch_lod_tensor_array^|test_fetch_unmerged^|test_fleet_base_single^|test_fuse_all_reduce_pass^|test_fuse_elewise_add_act_pass^|^ +test_fuse_optimizer_pass^|test_generator_dataloader^|test_gpu_package_without_gpu_device^|test_ir_memory_optimize_ifelse_op^|test_ir_memory_optimize_nlp^|test_lr_scheduler^|^ +test_multiprocess_dataloader_iterable_dataset_dynamic^|test_multiprocess_dataloader_iterable_dataset_static^|test_nvprof^|test_parallel_dygraph_sync_batch_norm^|test_parallel_executor_drop_scope^|^ +test_parallel_executor_dry_run^|test_parallel_executor_feed_persistable_var^|test_parallel_executor_fetch_isolated_var^|test_parallel_executor_inference_feed_partial_data^|test_parallel_executor_mnist^|^ +test_parallel_executor_seresnext_base_gpu^|test_parallel_executor_seresnext_with_fuse_all_reduce_gpu^|test_parallel_executor_seresnext_with_reduce_gpu^|test_parallel_executor_test_while_train^|^ +test_parallel_ssa_graph_inference_feed_partial_data^|test_partial_eager_deletion_transformer^|test_program_prune_backward^|test_prune^|test_py_reader_combination^|test_py_reader_pin_memory^|^ +test_py_reader_push_pop^|test_py_reader_using_executor^|test_reader_reset^|test_sync_batch_norm_op^|test_update_loss_scaling_op^|test_imperative_static_runner_while^|test_parallel_executor_crf^|^ +test_parallel_executor_profiler^|test_parallel_executor_transformer^|test_parallel_executor_transformer_auto_growth^|test_parallel_executor_seresnext_base_cpu^|test_yolov3^|^ +test_parallel_executor_seresnext_with_reduce_cpu^|test_parallel_executor_seresnext_with_fuse_all_reduce_cpu^|test_flags_use_mkldnn^|test_spawn_and_init_parallel_env^|test_train_recognize_digits^|^ +test_optimizer_in_control_flow^|test_fuse_bn_act_pass^|test_fuse_bn_add_act_pass^|test_activation_mkldnn_op^|test_tsm +rem /*===============================================================*/ + +rem these unittest that cost long time, diabled temporarily, greater than 10s +set long_time_test=test_trilinear_interp_v2_op^|best_fit_allocator_test^|timer_test^|best_fit_allocator_test^|test_image_classification^|test_recognize_digits^|decorator_test^|test_callbacks^|^ +test_dataset_cifar^|test_dataset_imdb^|test_dataset_movielens^|test_datasets^|test_pretrained_model^|test_concat_op^|test_elementwise_add_op^|test_elementwise_sub_op^|test_gather_op^|test_gather_nd_op^|^ +test_sequence_concat^|test_sequence_conv^|test_sequence_pool^|test_sequence_slice_op^|test_space_to_depth_op^|test_activation_nn_grad^|test_activation_op^|test_auto_growth_gpu_memory_limit^|^ 
+test_bicubic_interp_op^|test_bicubic_interp_v2_op^|test_bilinear_interp_v2_op^|test_conv2d_op^|test_conv3d_op^|test_conv3d_transpose_part2_op^|test_conv_nn_grad^|test_crop_tensor_op^|^ +test_cross_entropy2_op^|test_cross_op^|test_deformable_conv_v1_op^|test_dropout_op^|test_dygraph_multi_forward^|test_elementwise_div_op^|test_elementwise_nn_grad^|test_empty_op^|^ +test_fused_elemwise_activation_op^|test_group_norm_op^|test_gru_op^|test_gru_unit_op^|test_imperative_lod_tensor_to_selected_rows^|test_imperative_optimizer^|test_imperative_ptb_rnn^|^ +test_imperative_save_load^|test_imperative_selected_rows_to_lod_tensor^|test_imperative_star_gan_with_gradient_penalty^|test_imperative_transformer_sorted_gradient^|test_layer_norm_op^|^ +test_lstm_cudnn_op^|test_masked_select_op^|test_matmul_v2_op^|test_multiclass_nms_op^|test_naive_best_fit_gpu_memory_limit^|test_nearest_interp_v2_op^|test_nn_grad^|test_norm_nn_grad^|^ +test_normal^|test_pool3d_op^|test_pool2d_op^|test_prroi_pool_op^|test_regularizer^|test_regularizer_api^|test_sgd_op^|test_softmax_with_cross_entropy_op^|test_static_save_load^|^ +test_trilinear_interp_op^|test_trilinear_interp_v2_op^|test_weight_decay^|test_bilinear_interp_op^|test_nearest_interp_op^|test_sequence_conv^|test_transformer^|test_imperative_out_scale^|^ +test_imperative_qat^|test_imperative_qat_channelwise^|test_quantization_pass^|test_beam_search_decoder^|test_argsort_op^|test_eager_deletion_gru_net^|test_lstmp_op^|test_label_semantic_roles^|^ +test_graph^|test_user_defined_quantization + +set /a end=CUDA_DEVICE_COUNT-1 + +set parallel_test='' + +for /L %%# in (0,1,%end%) do ( + set CUDA_VISIBLE_DEVICES=%%# + ctest.exe -I %%#,,%CUDA_DEVICE_COUNT% -R %parallel_test% -E "%disable_ut_quickly%|%diable_wingpu_test%|%long_time_test%" -LE %nightly_label% --output-on-failure -C Release -j 2 --repeat until-pass:4 after-timeout:4 + if !errorlevel! NEQ 0 exit /b 8 +) + +for /L %%# in (0,1,%end%) do ( + set CUDA_VISIBLE_DEVICES=%%# + ctest.exe -I %%#,,%CUDA_DEVICE_COUNT% -E "%disable_ut_quickly%|%parallel_test%|%diable_wingpu_test%|%long_time_test%" -LE %nightly_label% --output-on-failure -C Release -j 1 --repeat until-pass:4 after-timeout:4 + if !errorlevel! NEQ 0 exit /b 8 +) +goto:eof + +:parallel_test_base_cpu +echo ======================================== +echo Running CPU unit tests in parallel way ... +echo ======================================== ctest.exe -E "(%disable_ut_quickly%)" -LE %nightly_label% --output-on-failure -C Release -j 8 --repeat until-pass:4 after-timeout:4 + goto:eof :unit_test_error -echo 8 > %cache_dir%\ -type %cache_dir%\error_code.txt +:: echo 8 > %cache_dir%\error_code.txt +:: type %cache_dir%\error_code.txt for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%# set end=%end:~4,10% call :timestamp "%start%" "%end%" "1 card TestCases Total" @@ -363,6 +455,7 @@ exit /b 8 rem --------------------------------------------------------------------------------------------- :test_inference +@ECHO OFF echo ======================================== echo Step 5. Testing fluid library for inference ... echo ======================================== @@ -377,18 +470,18 @@ cd %work_dir%\paddle\fluid\inference\api\demo_ci goto:eof :test_inference_error -echo 1 > %cache_dir%\error_code.txt -type %cache_dir%\error_code.txt +::echo 1 > %cache_dir%\error_code.txt +::type %cache_dir%\error_code.txt echo Testing fluid library for inference failed! 
exit /b 1 rem --------------------------------------------------------------------------------------------- :check_change_of_unittest +@ECHO OFF echo ======================================== echo Step 6. Check whether deleting a unit test ... echo ======================================== -@ECHO OFF cd /d %work_dir%\build echo set -e> check_change_of_unittest.sh echo set +x>> check_change_of_unittest.sh @@ -398,6 +491,7 @@ echo BRANCH=%BRANCH%>> check_change_of_unittest.sh echo if [ "${GITHUB_API_TOKEN}" == "" ] ^|^| [ "${GIT_PR_ID}" == "" ];then>> check_change_of_unittest.sh echo exit 0 >> check_change_of_unittest.sh echo fi>> check_change_of_unittest.sh +echo set -x>> check_change_of_unittest.sh echo cat ^<^> check_change_of_unittest.sh echo ============================================ >> check_change_of_unittest.sh echo Generate unit tests.spec of this PR. >> check_change_of_unittest.sh @@ -411,8 +505,8 @@ echo UPSTREAM_URL='https://github.com/PaddlePaddle/Paddle'>> check_change_of_un echo origin_upstream_url=`git remote -v ^| awk '{print $1, $2}' ^| uniq ^| grep upstream ^| awk '{print $2}'`>> check_change_of_unittest.sh echo if [ "$origin_upstream_url" == "" ]; then>> check_change_of_unittest.sh echo git remote add upstream $UPSTREAM_URL.git>> check_change_of_unittest.sh -echo elif [ "$origin_upstream_url" != "$UPSTREAM_URL" ] \>> check_change_of_unittest.sh -echo ^&^& [ "$origin_upstream_url" != "$UPSTREAM_URL.git" ]; then>> check_change_of_unittest.sh +echo elif [ "$origin_upstream_url" ^!= "$UPSTREAM_URL" ] ^\>> check_change_of_unittest.sh +echo ^&^& [ "$origin_upstream_url" ^!= "$UPSTREAM_URL.git" ]; then>> check_change_of_unittest.sh echo git remote remove upstream>> check_change_of_unittest.sh echo git remote add upstream $UPSTREAM_URL.git>> check_change_of_unittest.sh echo fi>> check_change_of_unittest.sh @@ -422,9 +516,10 @@ echo fi>> check_change_of_unittest.sh echo git checkout -b origin_pr >> check_change_of_unittest.sh echo git checkout -f $BRANCH >> check_change_of_unittest.sh echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ --DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ --DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ --DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% >> check_change_of_unittest.sh +-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DON_INFER=%ON_INFER% ^ +-DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ +-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ +-DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% >> check_change_of_unittest.sh echo cat ^<^> check_change_of_unittest.sh echo ============================================ >> check_change_of_unittest.sh echo Generate unit tests.spec of develop. 
>> check_change_of_unittest.sh @@ -433,10 +528,11 @@ echo EOF>> check_change_of_unittest.sh echo spec_path=$(pwd)/UNITTEST_DEV.spec>> check_change_of_unittest.sh echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>> check_change_of_unittest.sh echo unittest_spec_diff=`python $(pwd)/../tools/diff_unittest.py $(pwd)/UNITTEST_DEV.spec $(pwd)/UNITTEST_PR.spec`>> check_change_of_unittest.sh -echo if [ "$unittest_spec_diff" != "" ]; then>> check_change_of_unittest.sh -echo # approval_user_list: XiaoguangHu01 46782768,luotao1 6836917,phlrain 43953930,lanxianghit 47554610, zhouwei25 52485244, kolinwei 22165420>> check_change_of_unittest.sh +echo if [ "$unittest_spec_diff" ^!= "" ]; then>> check_change_of_unittest.sh +echo set +x>> check_change_of_unittest.sh echo approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`>> check_change_of_unittest.sh -echo if [ "$approval_line" != "" ]; then>> check_change_of_unittest.sh +echo set -x>> check_change_of_unittest.sh +echo if [ "$approval_line" ^!= "" ]; then>> check_change_of_unittest.sh echo APPROVALS=`echo ${approval_line} ^|python $(pwd)/../tools/check_pr_approval.py 1 22165420 52485244 6836917`>> check_change_of_unittest.sh echo echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}">> check_change_of_unittest.sh echo if [ "${APPROVALS}" == "FALSE" ]; then>> check_change_of_unittest.sh @@ -458,13 +554,12 @@ echo git checkout -f origin_pr >> check_change_of_unittest.sh goto:eof :check_change_of_unittest_error -echo 1 > %cache_dir%\error_code.txt -type %cache_dir%\error_code.txt exit /b 1 :timestamp setlocal enabledelayedexpansion +@ECHO OFF set start=%~1 set dd=%start:~2,2% set /a dd=100%dd%%%100 From a24d186814f8580fa7faa155bd5db14243fbc68b Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Mon, 16 Nov 2020 11:19:28 +0800 Subject: [PATCH 02/56] fix nccl init failed in parallel dygraph mode (#28497) --- paddle/fluid/imperative/nccl_context.cc | 34 ++++++++++++-------- python/paddle/distributed/parallel.py | 41 ++++++++++++++----------- 2 files changed, 44 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index abee311d08cf38..9c2c9925a34e80 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -49,16 +49,20 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep, address.sin_port = htons(port); int try_times = 0; + int retry_time = 0; while (true) { if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) { + retry_time = 3 * (try_times + 1); LOG(WARNING) << "Socket bind worker " << ep - << (try_times < 5 ? " failed, try again after 3 seconds." - : " failed, try again after 3 seconds. " - "Bind on endpoint %s failed. " - "Please confirm whether the " - "communication port or GPU card is " - "occupied."); - std::this_thread::sleep_for(std::chrono::seconds(3)); + << (try_times < 9 + ? " failed, try again after " + + std::to_string(retry_time) + " seconds." + : " failed, try again after " + + std::to_string(retry_time) + + " seconds. Bind on endpoint " + ep + + " failed. 
Please confirm whether the " + "communication port or GPU card is occupied."); + std::this_thread::sleep_for(std::chrono::seconds(retry_time)); ++try_times; continue; } @@ -129,16 +133,20 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep, } int try_times = 0; + int retry_time = 0; while (true) { if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { + retry_time = 3 * (try_times + 1); LOG(WARNING) << "Socket connect worker " << ep - << (try_times < 5 - ? " failed, try again after 3 seconds." - : " failed, try again after 3 seconds. Maybe that " - "some process is occupied the GPUs of this node " - "now, and you should kill those process manually."); - std::this_thread::sleep_for(std::chrono::seconds(3)); + << (try_times < 9 + ? " failed, try again after " + std::to_string(retry_time) + + " seconds." + : " failed, try again after " + std::to_string(retry_time) + + " seconds. Maybe that some process is occupied the " + "GPUs of this node now, and you should kill those " + "process manually."); + std::this_thread::sleep_for(std::chrono::seconds(retry_time)); ++try_times; continue; } diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 16b031e116acdc..9b6691dac7545a 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -125,7 +125,7 @@ def _check_var_exists(var_name): if ParallelEnv().world_size < 2: return - # 3: init gloo context + # 3: init gloo context (step 1: httpsever start) ep_rank_0 = ParallelEnv().trainer_endpoints[0].split(":") ep_rank = ParallelEnv().trainer_endpoints[ParallelEnv().rank].split(":") manager = Manager() @@ -138,22 +138,6 @@ def _check_var_exists(var_name): http_server.daemon = True http_server_d["running"] = True http_server.start() - wait_server_ready([ParallelEnv().trainer_endpoints[0]]) - - gloo_strategy = core.GlooParallelStrategy() - gloo_strategy.rank = ParallelEnv().rank - gloo_strategy.rank_num = ParallelEnv().world_size - gloo_strategy.ip_address = ep_rank_0[0] - gloo_strategy.ip_port = int(ep_rank_0[1]) - default_init_timeout_seconds = 3600 - default_run_timeout_seconds = 9999999 - gloo_strategy.init_seconds = default_init_timeout_seconds - gloo_strategy.run_seconds = default_run_timeout_seconds - gloo = core.GlooParallelContext(gloo_strategy) - gloo.init() - if ParallelEnv().rank == 0: - http_server_d["running"] = False - http_server.join() # 4. init NCCL ParallelStrategy strategy = ParallelStrategy() @@ -165,7 +149,7 @@ def _check_var_exists(var_name): strategy.current_endpoint = ParallelEnv().current_endpoint # NOTE(chenweihang): [ why config global place here? ] - # the dygraph mode will be set to default mode, + # the dygraph mode will be set to default mode, # users will not call `dygraph.guard` or `enable_dygraph` # directly, if they want to switch default place, # they need to call a function to change default place, @@ -177,6 +161,27 @@ def _check_var_exists(var_name): parallel_helper._set_parallel_ctx(core.NCCLParallelContext(strategy, place)) parallel_helper._init_parallel_ctx() + # 5: init gloo context (step 2: gloo init) + # dividing init_gloo into two part beacause nccl and gloo + # are separately looking for free ports which sometimes + # leads to port-conflict. 
+ wait_server_ready([ParallelEnv().trainer_endpoints[0]]) + + gloo_strategy = core.GlooParallelStrategy() + gloo_strategy.rank = ParallelEnv().rank + gloo_strategy.rank_num = ParallelEnv().world_size + gloo_strategy.ip_address = ep_rank_0[0] + gloo_strategy.ip_port = int(ep_rank_0[1]) + default_init_timeout_seconds = 3600 + default_run_timeout_seconds = 9999999 + gloo_strategy.init_seconds = default_init_timeout_seconds + gloo_strategy.run_seconds = default_run_timeout_seconds + gloo = core.GlooParallelContext(gloo_strategy) + gloo.init() + if ParallelEnv().rank == 0: + http_server_d["running"] = False + http_server.join() + def get_rank(): """ From 1de3cdd0abd947f2830915e5f2d9bedcb7297c98 Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Mon, 16 Nov 2020 11:26:56 +0800 Subject: [PATCH 03/56] Fix summary api for rnn gru lstm (#28566) * fix summary for rnn gru lstm --- python/paddle/hapi/model_summary.py | 3 +++ python/paddle/tests/test_model.py | 29 +++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py index c6288ea40c59e7..babbe962a95252 100644 --- a/python/paddle/hapi/model_summary.py +++ b/python/paddle/hapi/model_summary.py @@ -244,6 +244,9 @@ def hook(layer, input, output): (not (layer == model) or depth < 1)): hooks.append(layer.register_forward_post_hook(hook)) + # For rnn, gru and lstm layer + elif hasattr(layer, 'could_use_cudnn') and layer.could_use_cudnn: + hooks.append(layer.register_forward_post_hook(hook)) if isinstance(input_size, tuple): input_size = [input_size] diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index a3b33d6f253be1..ab7a3654e582c9 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -295,6 +295,12 @@ def test_predict_without_inputs(self): np.testing.assert_equal(output[0].shape[0], len(self.test_dataset)) fluid.disable_dygraph() + def test_summary_gpu(self): + paddle.disable_static(self.device) + rnn = paddle.nn.LSTM(16, 32, 2) + params_info = paddle.summary( + rnn, [(-1, 23, 16), ((2, None, 32), (2, -1, 32))]) + class MyModel(paddle.nn.Layer): def __init__(self): @@ -512,14 +518,33 @@ def _get_param_from_state_dict(state_dict): model.summary(input_size=(20), dtype='float32') def test_summary_nlp(self): - paddle.enable_static() + def _get_param_from_state_dict(state_dict): + params = 0 + for k, v in state_dict.items(): + params += np.prod(v.numpy().shape) + return params + nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3, direction="bidirectional") paddle.summary(nlp_net, (1, 1, 2)) + rnn = paddle.nn.LSTM(16, 32, 2) - paddle.summary(rnn, [(-1, 23, 16), ((2, None, 32), (2, -1, 32))]) + params_info = paddle.summary( + rnn, [(-1, 23, 16), ((2, None, 32), (2, -1, 32))]) + gt_params = _get_param_from_state_dict(rnn.state_dict()) + np.testing.assert_allclose(params_info['total_params'], gt_params / 2.0) + + rnn = paddle.nn.GRU(16, 32, 2, direction='bidirectional') + params_info = paddle.summary(rnn, (4, 23, 16)) + gt_params = _get_param_from_state_dict(rnn.state_dict()) + np.testing.assert_allclose(params_info['total_params'], gt_params / 2.0) + + rnn = paddle.nn.SimpleRNN(16, 32, 2, direction='bidirectional') + params_info = paddle.summary(rnn, (4, 23, 16)) + gt_params = _get_param_from_state_dict(rnn.state_dict()) + np.testing.assert_allclose(params_info['total_params'], gt_params / 2.0) def test_summary_dtype(self): 
input_shape = (3, 1) From 1c3eef4cee16b327c0a305c4eebe6dc369fd1121 Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Mon, 16 Nov 2020 11:28:03 +0800 Subject: [PATCH 04/56] Fix vgg error when num_classes is given (#28557) * fix vgg num classes --- python/paddle/tests/test_vision_models.py | 3 +++ python/paddle/vision/models/vgg.py | 5 +---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/tests/test_vision_models.py b/python/paddle/tests/test_vision_models.py index 5f35a1e0e5a4ba..a25a8f373c29c4 100644 --- a/python/paddle/tests/test_vision_models.py +++ b/python/paddle/tests/test_vision_models.py @@ -71,6 +71,9 @@ def test_resnet101(self): def test_resnet152(self): self.models_infer('resnet152') + def test_vgg16_num_classes(self): + vgg16 = models.__dict__['vgg16'](pretrained=False, num_classes=10) + def test_lenet(self): input = InputSpec([None, 1, 28, 28], 'float32', 'x') lenet = paddle.Model(models.__dict__['LeNet'](), input) diff --git a/python/paddle/vision/models/vgg.py b/python/paddle/vision/models/vgg.py index bb158569d3bc9f..00f6cccbdfe9f1 100644 --- a/python/paddle/vision/models/vgg.py +++ b/python/paddle/vision/models/vgg.py @@ -107,10 +107,7 @@ def make_layers(cfg, batch_norm=False): def _vgg(arch, cfg, batch_norm, pretrained, **kwargs): - model = VGG(make_layers( - cfgs[cfg], batch_norm=batch_norm), - num_classes=1000, - **kwargs) + model = VGG(make_layers(cfgs[cfg], batch_norm=batch_norm), **kwargs) if pretrained: assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format( From 90805e2df7b6fcd0bf78e8fa10fcbe98ef74c936 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 16 Nov 2020 11:28:52 +0800 Subject: [PATCH 05/56] Register op_version for new attribute use_addto (#28463) * register op_version for addto * upgrade pass capability * change eq to le * change eq to le * fix merge --- .../ir/conv_affine_channel_fuse_pass.cc | 4 +- .../fluid/framework/ir/conv_bn_fuse_pass.cc | 4 +- .../ir/conv_elementwise_add2_act_fuse_pass.cc | 4 +- .../ir/conv_elementwise_add_act_fuse_pass.cc | 3 +- .../ir/conv_elementwise_add_fuse_pass.cc | 3 +- .../conv_activation_mkldnn_fuse_pass.cc | 10 +-- .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc | 4 +- .../conv_concat_relu_mkldnn_fuse_pass.cc | 4 +- .../conv_elementwise_add_mkldnn_fuse_pass.cc | 71 ++++++++++--------- .../ir/mkldnn/depthwise_conv_mkldnn_pass.cc | 4 +- .../ir/quant_conv2d_dequant_fuse_pass.cc | 5 +- .../ir_passes/tensorrt_subgraph_pass.cc | 8 ++- paddle/fluid/operators/conv_op.cc | 35 +++++++++ 13 files changed, 106 insertions(+), 53 deletions(-) diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc index 9c984a23e377d7..c0ebf6de9de23b 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -238,11 +238,11 @@ REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d", 0) + .LE("conv2d", 1) .EQ("affine_channel", 0)); REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d", 0) + .LE("conv2d", 1) .EQ("elementwise_add", 0) .EQ("affine_channel", 0)); diff --git 
a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index a915015bf55bd8..72ac7c3b0e8ab8 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -383,11 +383,11 @@ REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass, REGISTER_PASS_CAPABILITY(conv_bn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d", 0) + .LE("conv2d", 1) .EQ("batch_norm", 0)); REGISTER_PASS_CAPABILITY(conv_eltwiseadd_bn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d", 0) + .LE("conv2d", 1) .EQ("elementwise_add", 0) .EQ("batch_norm", 0)); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index ad6af69ae02e4f..545beb34e78df5 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h" + #include + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -119,7 +121,7 @@ REGISTER_PASS(conv_elementwise_add2_act_fuse_pass, REGISTER_PASS_CAPABILITY(conv_elementwise_add2_act_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d", 0) + .LE("conv2d", 1) .EQ("elementwise_add", 0) .EQ("relu", 0) .EQ("identity", 0)); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc index 93e6e13ff7092c..d01a2f2622347c 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h" + #include #include "paddle/fluid/framework/ir/graph_viz_pass.h" @@ -107,7 +108,7 @@ REGISTER_PASS(conv_elementwise_add_act_fuse_pass, REGISTER_PASS_CAPABILITY(conv_elementwise_add_act_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d", 0) + .LE("conv2d", 1) .EQ("elementwise_add", 0) .EQ("relu", 0) .EQ("identity", 0)); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc index e4396f227f7f52..e34a2d96581531 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h" + #include #include "paddle/fluid/framework/ir/graph_viz_pass.h" @@ -93,5 +94,5 @@ REGISTER_PASS(conv_elementwise_add_fuse_pass, REGISTER_PASS_CAPABILITY(conv_elementwise_add_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d", 0) + .LE("conv2d", 1) .EQ("elementwise_add", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc index c33398553ecd2c..d0bdeb9ad8c460 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h" + #include + #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" @@ -107,7 +109,7 @@ REGISTER_PASS(conv_relu_mkldnn_fuse_pass, REGISTER_PASS_CAPABILITY(conv_relu_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d", 0) + .LE("conv2d", 1) .EQ("relu", 0)); REGISTER_PASS(conv_leaky_relu_mkldnn_fuse_pass, @@ -115,7 +117,7 @@ REGISTER_PASS(conv_leaky_relu_mkldnn_fuse_pass, REGISTER_PASS_CAPABILITY(conv_leaky_relu_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d", 0) + .LE("conv2d", 1) .LE("leaky_relu", 1)); REGISTER_PASS(conv_relu6_mkldnn_fuse_pass, @@ -123,7 +125,7 @@ REGISTER_PASS(conv_relu6_mkldnn_fuse_pass, REGISTER_PASS_CAPABILITY(conv_relu6_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d", 0) + .LE("conv2d", 1) .EQ("relu6", 0)); REGISTER_PASS(conv_swish_mkldnn_fuse_pass, @@ -131,5 +133,5 @@ REGISTER_PASS(conv_swish_mkldnn_fuse_pass, REGISTER_PASS_CAPABILITY(conv_swish_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d", 0) + .LE("conv2d", 1) .EQ("swish", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index 716c49dcb12d9b..b0849d74b6153f 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h" + #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" @@ -150,7 +152,7 @@ REGISTER_PASS(conv_bias_mkldnn_fuse_pass, REGISTER_PASS_CAPABILITY(conv_bias_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d", 0) + .LE("conv2d", 1) .EQ("elementwise_add", 0)); REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass, diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc index 76e10212550114..c4d7a12037293e 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h" + #include + #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" @@ -128,6 +130,6 @@ REGISTER_PASS(conv_concat_relu_mkldnn_fuse_pass, REGISTER_PASS_CAPABILITY(conv_concat_relu_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d", 0) + .LE("conv2d", 1) .EQ("concat", 0) .EQ("relu", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index 2fb131aceaad28..a837b42b3ead48 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -13,11 +13,13 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" + #include #include #include #include #include + #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -226,19 +228,20 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); conv_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) + auto get_node_from_elementwise_add = + [&elementwise_add_pattern]( + const GraphPatternDetector::subgraph_t& subgraph) -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_y, - elementwise_add_out); - }; + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + return std::make_tuple(elementwise_add_op, elementwise_add_y, + elementwise_add_out); + }; return ExecuteHandleOnGraph( &gpd, graph_with_stats, @@ -263,19 +266,20 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( conv_output); conv_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) + auto get_node_from_elementwise_add = + [&elementwise_add_pattern]( + const GraphPatternDetector::subgraph_t& subgraph) -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_x, - elementwise_add_out); - }; + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + return std::make_tuple(elementwise_add_op, elementwise_add_x, + elementwise_add_out); + }; return 
ExecuteHandleOnGraph( &gpd, graph_with_stats, @@ -302,16 +306,17 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( conv_x_output->AsIntermediate(); conv_y_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) + auto get_node_from_elementwise_add = + [&elementwise_add_pattern]( + const GraphPatternDetector::subgraph_t& subgraph) -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); - return std::make_tuple(elementwise_add_op, elementwise_add_out); - }; + return std::make_tuple(elementwise_add_op, elementwise_add_out); + }; return ExecuteHandleOnGraph( &gpd, graph_with_stats, @@ -345,5 +350,5 @@ REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass, REGISTER_PASS_CAPABILITY(conv_elementwise_add_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d", 0) + .LE("conv2d", 1) .EQ("elementwise_add", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc index b2c0afdc754fb7..39f47406a77ca9 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc @@ -63,5 +63,5 @@ REGISTER_PASS(depthwise_conv_mkldnn_pass, paddle::framework::ir::DepthwiseConvMKLDNNPass); REGISTER_PASS_CAPABILITY(depthwise_conv_mkldnn_pass) .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination().EQ( - "depthwise_conv2d", 0)); + paddle::framework::compatible::OpVersionComparatorCombination().LE( + "depthwise_conv2d", 1)); diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 895c396e1e614f..96c5546d21208b 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -12,13 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h" + #include #include #include #include #include "paddle/fluid/framework/ir/graph_viz_pass.h" -#include "paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -331,7 +332,7 @@ REGISTER_PASS_CAPABILITY(quant_conv2d_dequant_fuse_pass); REGISTER_PASS_CAPABILITY(tensorrt_subgraph_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d", 0) + .LE("conv2d", 1) .EQ("fc", 0) .LE("conv2d_transpose", 1) .EQ("fake_quantize_abs_max", 0) diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 08f3d609fa3e6a..bf0d87da91f534 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" + #include #include #include @@ -20,7 +22,6 @@ #include "paddle/fluid/framework/ir/subgraph_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" @@ -309,6 +310,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( min_input_shape, max_input_shape, opt_input_shape, disable_trt_plugin_fp16); trt_engine->SetUseOSS(Get("use_oss")); + trt_engine->SetWithErnie( graph->Has(framework::ir::kEmbEltwiseLayernormPass) && graph->Has(framework::ir::kMultiheadMatmulPass)); @@ -367,13 +369,13 @@ REGISTER_PASS(tensorrt_subgraph_pass, REGISTER_PASS_CAPABILITY(tensorrt_subgraph_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d", 0) + .LE("conv2d", 1) .EQ("pool2d", 0) .EQ("relu", 0) .EQ("softmax", 0) .EQ("sigmoid", 0) .EQ("hard_swish", 0) - .EQ("depthwise_conv2d", 0) + .LE("depthwise_conv2d", 1) .EQ("batch_norm", 0) .EQ("concat", 0) .EQ("tanh", 0) diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index ef8a2b38f20b99..76ff1084fa61b4 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -18,6 +18,8 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/op_version_registry.h" + #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/cudnn_helper.h" @@ -817,3 +819,36 @@ REGISTER_OP_CPU_KERNEL( conv3d_grad_grad, ops::GemmConvDoubleGradKernel, ops::GemmConvDoubleGradKernel); + +REGISTER_OP_VERSION(conv2d) + .AddCheckpoint( + R"ROC( + Upgrade conv2d, add a new attribute [use_addto]. + )ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "use_addto", + "In order to support new feature (inplace addto strategy) for " + "gradient accumulation.", + false)); + +REGISTER_OP_VERSION(depthwise_conv2d) + .AddCheckpoint( + R"ROC( + Upgrade depthwise_conv2d, add a new attribute [use_addto]. + )ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "use_addto", + "In order to support new feature (inplace addto strategy) for " + "gradient accumulation.", + false)); + +REGISTER_OP_VERSION(conv3d) + .AddCheckpoint( + R"ROC( + Upgrade conv3d, add a new attribute [use_addto]. + )ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "use_addto", + "In order to support new feature (inplace addto strategy) for " + "gradient accumulation.", + false)); From f962bd343217a080c45ace61e104609cc8ea1ffd Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 16 Nov 2020 12:25:59 +0800 Subject: [PATCH 06/56] Fix cudnn workspace limit in cudnn-8 (#28611) --- paddle/fluid/operators/conv_cudnn_helper.h | 49 +++++++++++++++++++--- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 55502eaf4e5495..2ba58a6dae5b35 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" @@ -101,6 +102,24 @@ inline int MaxBwdFilterAlgos(cudnnHandle_t cudnn_handle) { return max_algos; } +template +void ChooseAlgoByWorkspace(PerfType* perf_results, size_t perf_num, + size_t workspace_byte, AlgoType* algo) { + for (size_t i = 0; i < perf_num; ++i) { + auto result = perf_results[i]; + if (result.status == CUDNN_STATUS_SUCCESS && + result.memory < workspace_byte) { + *algo = result.algo; + VLOG(3) << " algo: " << result.algo << ", time: " << result.time + << " ms, wksp = " << result.memory + << ", status = " << result.status; + return; + } + } + VLOG(3) << "Can not find alog that requires memory < " + << static_cast(workspace_byte) / (1 << 20) << " MB"; +} + template void ChooseAlgo(const std::vector& perf_results, size_t workspace_byte, AlgoType* algo) { @@ -219,7 +238,10 @@ struct SearchAlgorithm { if (workspace_size > workspace_size_limit) { #if CUDNN_VERSION >= 8000 - workspace_size_limit = workspace_size; + // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8 + ChooseAlgoByWorkspace(perf_results.get(), + kNUM_CUDNN_FWD_ALGS, + workspace_size_limit, &algo); #else VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " "the workspace size request(" @@ -316,7 +338,6 @@ struct SearchAlgorithm { size_t workspace_size = 0; bool has_got_workspace_size = true; algo_t algo; - #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) auto& dev_ctx = ctx.template device_context(); if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { @@ -362,9 +383,10 @@ struct SearchAlgorithm { if (workspace_size > workspace_size_limit) { has_got_workspace_size = false; #if CUDNN_VERSION >= 8000 - // There is no cudnnGetConvolutionBackwardDataAlgorithm in CUDNN 8 - // version. - workspace_size_limit = workspace_size; + // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8 + ChooseAlgoByWorkspace(perf_results.get(), + kNUM_CUDNN_BWD_DATA_ALGS, + workspace_size_limit, &algo); #else VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " "the workspace size request(" @@ -493,6 +515,23 @@ struct SearchAlgorithm { workspace_size = GetWorkspaceSize(args, algo); if (workspace_size > workspace_size_limit) { workspace_size = workspace_size_limit; +#if CUDNN_VERSION >= 8000 + // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8 + ChooseAlgoByWorkspace(perf_results.get(), + kNUM_CUDNN_BWD_FILTER_ALGS, + workspace_size_limit, &algo); +#else + VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " + "the workspace size request(" + << workspace_size << ") exceeds the limit(" + << workspace_size_limit << ")"; + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + args.handle, args.idesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.wdesc.desc(), + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); +#endif } #else PADDLE_ENFORCE_CUDA_SUCCESS( From 8b97bb2e1f4e8cfe5bee0f97daac266d854b73c4 Mon Sep 17 00:00:00 2001 From: Wilber Date: Mon, 16 Nov 2020 12:42:36 +0800 Subject: [PATCH 07/56] Update cmake for arm ft and fix a bug for Predictor dtor. 
(#28586) --- paddle/fluid/inference/api/analysis_predictor.cc | 10 ++++++++-- python/CMakeLists.txt | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 20bea8e568e467..7bfdb2107c9a99 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -175,9 +175,15 @@ bool AnalysisPredictor::PrepareScope( status_is_cloned_ = true; } else { paddle::framework::InitDevices(false); - scope_.reset(new paddle::framework::Scope(), [&](framework::Scope *scope) { + scope_.reset(new paddle::framework::Scope(), [](framework::Scope *scope) { delete scope; - memory::Release(place_); +#ifdef PADDLE_WITH_CUDA + for (int dev_id = 0; dev_id < paddle::platform::GetCUDADeviceCount(); + ++dev_id) { + memory::Release(platform::CUDAPlace(dev_id)); + } +#endif + memory::Release(platform::CPUPlace()); }); status_is_cloned_ = false; } diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 34edb0280b0ba7..0be09c1ec6340a 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -106,7 +106,7 @@ if(APPLE) message(FATAL_ERROR "install_name_tool not found, please check.\n") endif() endif() -if(LINUX AND NOT WITH_SW) +if(LINUX AND NOT WITH_SW AND NOT WITH_ARM) find_program(PATCHELF_EXECUTABLE patchelf) if(NOT PATCHELF_EXECUTABLE) message(FATAL_ERROR "patchelf not found, please install it.\n" From f7dd889ca443aaf1248947a1af65107b9779370d Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Mon, 16 Nov 2020 13:52:31 +0800 Subject: [PATCH 08/56] Support squeezed label as input in paddle.metric.Accuracy (#28535) * Support squeezed label as input in paddle.metric.Accuracy * Revert cifar and fix UT --- python/paddle/metric/metrics.py | 1 + python/paddle/tests/test_metrics.py | 28 +++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index fed659562cbb0c..510b99c03008d5 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -244,6 +244,7 @@ def compute(self, pred, label, *args): Tensor: Correct mask, a tensor with shape [batch_size, topk]. 
""" pred = paddle.argsort(pred, descending=True)[:, :self.maxk] + label = paddle.reshape(label, (-1, 1)) correct = pred == label return paddle.cast(correct, dtype='float32') diff --git a/python/paddle/tests/test_metrics.py b/python/paddle/tests/test_metrics.py index f05cdf9c6da10b..b1f53168e62cec 100644 --- a/python/paddle/tests/test_metrics.py +++ b/python/paddle/tests/test_metrics.py @@ -28,6 +28,7 @@ def accuracy(pred, label, topk=(1, )): maxk = max(topk) pred = np.argsort(pred)[:, ::-1][:, :maxk] + label = label.reshape(-1, 1) correct = (pred == np.repeat(label, maxk, 1)) batch_size = label.shape[0] @@ -47,13 +48,18 @@ def convert_to_one_hot(y, C): class TestAccuracy(unittest.TestCase): - def test_acc(self): + def test_acc(self, squeeze_y=False): paddle.disable_static() x = paddle.to_tensor( np.array([[0.1, 0.2, 0.3, 0.4], [0.1, 0.4, 0.3, 0.2], [0.1, 0.2, 0.4, 0.3], [0.1, 0.2, 0.3, 0.4]])) - y = paddle.to_tensor(np.array([[0], [1], [2], [3]])) + + y = np.array([[0], [1], [2], [3]]) + if squeeze_y: + y = y.squeeze() + + y = paddle.to_tensor(y) m = paddle.metric.Accuracy(name='my_acc') @@ -61,7 +67,8 @@ def test_acc(self): self.assertEqual(m.name(), ['my_acc']) correct = m.compute(x, y) - # check results + # check shape and results + self.assertEqual(correct.shape, [4, 1]) self.assertEqual(m.update(correct), 0.75) self.assertEqual(m.accumulate(), 0.75) @@ -80,6 +87,9 @@ def test_acc(self): self.assertEqual(m.count[0], 0.0) paddle.enable_static() + def test_1d_label(self): + self.test_acc(True) + class TestAccuracyDynamic(unittest.TestCase): def setUp(self): @@ -87,12 +97,15 @@ def setUp(self): self.class_num = 5 self.sample_num = 1000 self.name = None + self.squeeze_label = False def random_pred_label(self): label = np.random.randint(0, self.class_num, (self.sample_num, 1)).astype('int64') pred = np.random.randint(0, self.class_num, (self.sample_num, 1)).astype('int32') + if self.squeeze_label: + label = label.squeeze() pred_one_hot = convert_to_one_hot(pred, self.class_num) pred_one_hot = pred_one_hot.astype('float32') @@ -123,9 +136,17 @@ def setUp(self): self.class_num = 10 self.sample_num = 1000 self.name = "accuracy" + self.squeeze_label = True class TestAccuracyStatic(TestAccuracyDynamic): + def setUp(self): + self.topk = (1, ) + self.class_num = 5 + self.sample_num = 1000 + self.name = None + self.squeeze_label = True + def test_main(self): main_prog = fluid.Program() startup_prog = fluid.Program() @@ -164,6 +185,7 @@ def setUp(self): self.class_num = 10 self.sample_num = 100 self.name = "accuracy" + self.squeeze_label = False class TestPrecision(unittest.TestCase): From c4d22c845b951438e7e84311c9e6eefa49c8f526 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Mon, 16 Nov 2020 14:29:12 +0800 Subject: [PATCH 09/56] modified timeout value for some ut (#28616) --- python/paddle/fluid/tests/book/CMakeLists.txt | 1 + python/paddle/fluid/tests/unittests/CMakeLists.txt | 10 +++++++++- python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt | 6 ++++-- python/paddle/tests/CMakeLists.txt | 2 +- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/tests/book/CMakeLists.txt b/python/paddle/fluid/tests/book/CMakeLists.txt index e78ba297bf1255..2c816a12bd3ebb 100644 --- a/python/paddle/fluid/tests/book/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/CMakeLists.txt @@ -12,3 +12,4 @@ set_tests_properties(test_image_classification PROPERTIES TIMEOUT 120) set_tests_properties(test_label_semantic_roles PROPERTIES TIMEOUT 120) 
set_tests_properties(test_machine_translation PROPERTIES TIMEOUT 120) set_tests_properties(test_rnn_encoder_decoder PROPERTIES TIMEOUT 120) +set_tests_properties(test_fit_a_line PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6fcc8b9691703c..6e78f7d90149e2 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -648,7 +648,7 @@ set_tests_properties(test_activation_nn_grad PROPERTIES TIMEOUT 120) set_tests_properties(test_empty_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_transformer PROPERTIES TIMEOUT 120) set_tests_properties(test_elementwise_div_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_regularizer_api PROPERTIES TIMEOUT 120) +set_tests_properties(test_regularizer_api PROPERTIES TIMEOUT 150) set_tests_properties(test_multiclass_nms_op PROPERTIES TIMEOUT 120) set_tests_properties(test_ir_memory_optimize_nlp PROPERTIES TIMEOUT 120) set_tests_properties(test_add_reader_dependency PROPERTIES TIMEOUT 120) @@ -754,6 +754,13 @@ set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES TIMEOUT 2 set_tests_properties(test_imperative_se_resnext PROPERTIES TIMEOUT 200) set_tests_properties(test_matmul_v2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_slice_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_strided_slice_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_translated_layer PROPERTIES TIMEOUT 120) +set_tests_properties(test_parallel_executor_inference_feed_partial_data PROPERTIES TIMEOUT 120) +set_tests_properties(test_pad3d_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_dataloader_keep_order PROPERTIES TIMEOUT 120) +set_tests_properties(test_mean_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_dataloader_unkeep_order PROPERTIES TIMEOUT 120) if(WITH_COVERAGE) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120) @@ -776,5 +783,6 @@ endif() if(WITH_GPU) set_tests_properties(test_imperative_auto_mixed_precision PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_sync_batch_norm PROPERTIES TIMEOUT 120) + set_tests_properties(test_rank_attention_op PROPERTIES TIMEOUT 120) endif() set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt index 0606594c8c25f3..ffc78d33347b70 100644 --- a/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt @@ -4,5 +4,7 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) -set_tests_properties(test_rnn_nets_static PROPERTIES TIMEOUT 120) -set_tests_properties(test_rnn_nets PROPERTIES TIMEOUT 120) +if(NOT WIN32) + set_tests_properties(test_rnn_nets_static PROPERTIES TIMEOUT 120) + set_tests_properties(test_rnn_nets PROPERTIES TIMEOUT 120) +endif() diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt index 50466be0c1b1f9..b9d05261f1ce02 100644 --- a/python/paddle/tests/CMakeLists.txt +++ b/python/paddle/tests/CMakeLists.txt @@ -47,6 +47,6 @@ set_tests_properties(test_datasets PROPERTIES TIMEOUT 120) set_tests_properties(test_dataset_wmt PROPERTIES TIMEOUT 120) 
set_tests_properties(test_vision_models PROPERTIES TIMEOUT 120) set_tests_properties(test_dataset_uci_housing PROPERTIES TIMEOUT 120) -set_tests_properties(test_dataset_imdb PROPERTIES TIMEOUT 120) +set_tests_properties(test_dataset_imdb PROPERTIES TIMEOUT 150) set_tests_properties(test_callbacks PROPERTIES TIMEOUT 120) set_tests_properties(test_pretrained_model PROPERTIES TIMEOUT 600) From a3bc3bcd4854057079f2f9447d8872c25ed3af28 Mon Sep 17 00:00:00 2001 From: Guo Sheng Date: Mon, 16 Nov 2020 14:32:58 +0800 Subject: [PATCH 10/56] Fix scaled_params append error in AdamW. (#28633) Fix no_grad setting in AdamW. test=develop --- python/paddle/optimizer/adamw.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 2cf3881d046761..0ffff675903573 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -15,6 +15,7 @@ from .optimizer import Optimizer from .adam import Adam from ..fluid import framework +from ..fluid.dygraph import base as imperative_base import paddle from paddle.fluid.dygraph.parallel import apply_collective_grads @@ -171,13 +172,14 @@ def _scale_parameters(self, params_and_grads): learning_rate = self._learning_rate() with param.block.program._optimized_guard( [param, grad]), framework.name_scope('weight decay'): + scaled_params.append( + (param, grad, param * self._coeff * learning_rate)) if param.name not in self._params_name: - scaled_params.append( - (param, grad, param * self._coeff * learning_rate)) self._params_name.add(param.name) param = param * self._coeff return scaled_params + @imperative_base.no_grad def minimize(self, loss, startup_program=None, @@ -207,6 +209,7 @@ def minimize(self, return optimize_ops, params_grads @framework.dygraph_only + @imperative_base.no_grad def step(self): if paddle.distributed.get_world_size() > 1: apply_collective_grads(self._parameter_list) @@ -227,7 +230,7 @@ def step(self): [param, grad]), framework.name_scope('weight decay'): updated_param = paddle.fluid.layers.elementwise_sub( x=param, y=scaled_param) - param.set_value(updated_param.numpy()) + paddle.fluid.layers.assign(input=updated_param, output=param) self._apply_optimize( loss=None, startup_program=None, params_grads=params_grads) From 110febdc541db8dd7e75fc3aeb614dff0fede4b7 Mon Sep 17 00:00:00 2001 From: Guo Sheng Date: Mon, 16 Nov 2020 14:33:25 +0800 Subject: [PATCH 11/56] Fix gradients with ignore_idx in softmax_with_cross_entropy (#28622) * Fix gradients with ignore_idx in softmax_with_cross_entropy. test=develop * Fix gradients with ignore_idx in softmax_with_cross_entropy on cpu. Remove softmax_with_cross_entropy from op_threshold_white_list. test=develop * Fix test_softmax_cross_entropy_op.py. 
test=develop --- .../operators/softmax_with_cross_entropy_op.cu | 13 ++++++++++--- .../fluid/operators/softmax_with_cross_entropy_op.h | 11 +++++++++-- .../unittests/test_softmax_with_cross_entropy_op.py | 6 +++--- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 3ac7a5a127b379..f86f02544dc980 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -37,11 +37,17 @@ __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels, template __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, - const int d, const int remain) { + const int d, const int remain, const int64_t* labels, + const int ignore_index) { CUDA_KERNEL_LOOP(index, num) { int idx_n = index / d; int idx_remain = index % remain; - logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; + int idx_lbl = idx_n * remain + idx_remain; + if (labels[idx_lbl] == ignore_index) { + logit_grad[index] = static_cast(0.); + } else { + logit_grad[index] *= loss_grad[idx_lbl]; + } } } @@ -260,6 +266,7 @@ struct HardLabelSoftmaxWithCrossEntropyFunctor { int idx_remain = idx % remain; // labels, loss view as [n, remain] int idx_lbl = idx_n * remain + idx_remain; + // It also would ignore labels not in range(class_num). if (idx_axis != labels_[idx_lbl]) { log_softmax_[idx] = exp_on_device(log_softmax_[idx]); } else { @@ -513,7 +520,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { int num = n * d; grid = (num + block - 1) / block; Scale<<>>(logit_grad_data, loss_grad_data, num, - d, remain); + d, remain, label_data, ignore_index); } } }; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index cebd466f361d1e..93f2552c3cee90 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -82,6 +82,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { } const bool soft_label = context.Attr("soft_label"); + auto ignore_index = context.Attr("ignore_index"); const int rank = logit_grad->dims().size(); const int axis = CanonicalAxis(context.Attr("axis"), rank); @@ -115,8 +116,14 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { for (int i = 0; i < n; ++i) { for (int j = 0; j < remain; j++) { int idx = i * remain + j; - logit_grad_data[i * d + label_data[idx] * remain + j] -= - out_grad_data[idx]; + if (label_data[idx] == ignore_index) { + for (int k = 0; k < axis_dim; ++k) { + logit_grad_data[i * d + k * remain + j] = 0; + } + } else { + logit_grad_data[i * d + label_data[idx] * remain + j] -= + out_grad_data[idx]; + } } } } diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py index df2a0a523ad1ef..0ee58d5be15e60 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py @@ -83,9 +83,9 @@ def setUp(self): self.attrs = { "numeric_stable_mode": self.numeric_stable_mode, "soft_label": self.soft_label, + "ignore_index": self.ignore_index, } - if self.ignore_index >= 0: - self.attrs['ignore_index'] = self.ignore_index + if self.axis != -1: self.attrs['axis'] = self.axis @@ -93,7 +93,7 @@ def 
test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(["Logits"], "Loss", max_relative_error=0.05) + self.check_grad(["Logits"], "Loss", max_relative_error=5e-5) class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp): From cf2c42a937137a4c6d0468ba497dac3f41e010b7 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Mon, 16 Nov 2020 14:36:35 +0800 Subject: [PATCH 12/56] fix exec nightly error on mac (#28567) --- paddle/scripts/paddle_build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 14bd5a7ae89326..4c74653b7a06aa 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -563,12 +563,12 @@ EOF if [ ${NIGHTLY_MODE:-OFF} == "ON" ]; then nightly_label="" else - nightly_label="RUN_TYPE=NIGHTLY|RUN_TYPE=DIST:NIGHTLY|RUN_TYPE=EXCLUSIVE:NIGHTLY" + nightly_label="(RUN_TYPE=NIGHTLY|RUN_TYPE=DIST:NIGHTLY|RUN_TYPE=EXCLUSIVE:NIGHTLY)" echo "=========================================" echo "Unittests with nightly labels are only run at night" echo "=========================================" fi - ctest -E "($disable_ut_quickly)" -LE "($nightly_label)" --output-on-failure -j $2 | tee $tmpfile + ctest -E "($disable_ut_quickly)" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile failed_test_lists='' collect_failed_tests mactest_error=0 From 2b1e7e5b02f6f63f6eee6b0e7dd64a8649d1a2c7 Mon Sep 17 00:00:00 2001 From: GaoWei8 <53294385+GaoWei8@users.noreply.github.com> Date: Mon, 16 Nov 2020 15:24:37 +0800 Subject: [PATCH 13/56] Polish where english doc (#28595) --- python/paddle/tensor/search.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 3da4228fc8b204..f5e0dc4c05bfb6 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -495,9 +495,6 @@ def sort(x, axis=-1, descending=False, name=None): def where(condition, x, y, name=None): """ - :alias_main: paddle.where - :alias: paddle.where,paddle.tensor.where,paddle.tensor.search.where - Return a tensor of elements selected from either $x$ or $y$, depending on $condition$. .. math:: @@ -510,28 +507,27 @@ def where(condition, x, y, name=None): Args: - condition(Variable): The condition to choose x or y. - x(Variable): x is a Tensor Variable with data type float32, float64, int32, int64. - y(Variable): y is a Tensor Variable with data type float32, float64, int32, int64. + condition(Tensor): The condition to choose x or y. + x(Tensor): x is a Tensor with data type float32, float64, int32, int64. + y(Tensor): y is a Tensor with data type float32, float64, int32, int64. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - Variable: A Tensor with the same data dype as x. + Tensor: A Tensor with the same data dype as x. Examples: .. 
code-block:: python import paddle - paddle.disable_static() x = paddle.to_tensor([0.9383, 0.1983, 3.2, 1.2]) y = paddle.to_tensor([1.0, 1.0, 1.0, 1.0]) out = paddle.where(x>1, x, y) - print(out.numpy()) + print(out) #out: [1.0, 1.0, 3.2, 1.2] """ if not in_dygraph_mode(): From c5c273c13e861c2f22ab6c639d2cafa2facfeb8c Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Mon, 16 Nov 2020 15:26:35 +0800 Subject: [PATCH 14/56] [Dy2stat] Fix Using Tuple for Transpose in Dy2stat (#28574) PaddleSeg uses tuple as parameter of transpose in dygraph code: https://github.com/PaddlePaddle/PaddleSeg/blob/release/v0.7.0/dygraph/paddleseg/models/danet.py#L152 However, in dy2stat, static code doesn't support the perm as a tuple. This PR fixed it. --- python/paddle/fluid/layers/nn.py | 21 ++++++----- .../tests/unittests/test_transpose_op.py | 35 +++++++++++++++++++ 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 760f5ce58bf268..3ac43df872e377 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5459,7 +5459,7 @@ def transpose(x, perm, name=None): Args: x (Variable): The input Tensor. It is a N-D Tensor of data types float32, float64, int32. - perm (list): Permute the input according to the data of perm. + perm (list|tuple): Permute the input according to the data of perm. name (str): The name of this layer. It is optional. Returns: @@ -5492,14 +5492,12 @@ def transpose(x, perm, name=None): .. code-block:: python - # use append_batch_size=False to avoid prepending extra - # batch size in shape - import paddle.fluid as fluid - x = fluid.layers.data(name='x', shape=[2, 3, 4], - dtype='float32', append_batch_size=False) - x_transposed = fluid.layers.transpose(x, perm=[1, 0, 2]) - print x_transposed.shape - #(3L, 2L, 4L) + import paddle + + x = paddle.randn([2, 3, 4]) + x_transposed = paddle.transpose(x, perm=[1, 0, 2]) + print(x_transposed.shape) + # [3L, 2L, 4L] """ if in_dygraph_mode(): @@ -5509,8 +5507,9 @@ def transpose(x, perm, name=None): check_variable_and_dtype( x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'transpose') - check_type(perm, 'perm', list, 'transpose') - + check_type(perm, 'perm', (list, tuple), 'transpose') + if isinstance(perm, tuple): + perm = list(perm) if len(perm) != len(x.shape): raise ValueError( "Input(perm) is the permutation of dimensions of Input(x), " diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index 56333211469db5..f72df8cbe46409 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -21,6 +21,7 @@ import paddle.fluid as fluid from paddle.fluid import Program, program_guard +paddle.enable_static() class TestTransposeOp(OpTest): def setUp(self): @@ -113,6 +114,7 @@ def initTestCase(self): class TestTransposeOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program(), Program()): x = fluid.layers.data(name='x', shape=[10, 5, 3], dtype='float64') @@ -149,6 +151,39 @@ def test_each_elem_value_check(): self.assertRaises(ValueError, test_each_elem_value_check) +class TestTransposeApi(unittest.TestCase): + def test_static_out(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name='x', shape=[2, 3, 4], dtype='float32') + x_trans1 = paddle.transpose(x, perm=[1, 0, 2]) + 
x_trans2 = paddle.transpose(x, perm=(2, 1, 0)) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + x_np = np.random.random([2, 3, 4]).astype("float32") + result1, result2 = exe.run(feed={"x": x_np}, fetch_list=[x_trans1, x_trans2]) + expected_result1 = np.transpose(x_np, [1, 0, 2]) + expected_result2 = np.transpose(x_np, (2, 1, 0)) + + np.testing.assert_array_equal(result1, expected_result1) + np.testing.assert_array_equal(result2, expected_result2) + + def test_dygraph_out(self): + # This is an old test before 2.0 API so we need to disable static + # to trigger dygraph + paddle.disable_static() + x = paddle.randn([2, 3, 4]) + x_trans1 = paddle.transpose(x, perm=[1, 0, 2]) + x_trans2 = paddle.transpose(x, perm=(2, 1, 0)) + x_np = x.numpy() + expected_result1 = np.transpose(x_np, [1, 0, 2]) + expected_result2 = np.transpose(x_np, (2, 1, 0)) + + np.testing.assert_array_equal(x_trans1.numpy(), expected_result1) + np.testing.assert_array_equal(x_trans2.numpy(), expected_result2) + # This is an old test before 2.0 API so we enable static again after + # dygraph test + paddle.enable_static() class TestTAPI(unittest.TestCase): def test_out(self): From 89d27de90fe97abe9f1e5e12a8c42895ba2b699e Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Mon, 16 Nov 2020 16:23:01 +0800 Subject: [PATCH 15/56] DataLoader support not auto collate batch (#28425) * DataLoader support not auto collate batch. test=develop --- .../fluid/dataloader/dataloader_iter.py | 34 +++++--- python/paddle/fluid/dataloader/fetcher.py | 49 +++++++----- python/paddle/fluid/reader.py | 36 ++++++++- .../test_multiprocess_dataloader_dynamic.py | 45 ++++++++++- .../test_multiprocess_dataloader_exception.py | 4 +- ...ess_dataloader_iterable_dataset_dynamic.py | 43 ++++++++++- ...cess_dataloader_iterable_dataset_static.py | 75 ++++++++++++++++++ .../test_multiprocess_dataloader_static.py | 77 +++++++++++++++++++ 8 files changed, 327 insertions(+), 36 deletions(-) diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index d32a543eb495fa..ee30484ae9a0fb 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -36,6 +36,7 @@ from ..framework import in_dygraph_mode from ..multiprocess_utils import CleanupFuncRegistrar, _cleanup_mmap, _set_SIGCHLD_handler from .fetcher import _IterableDatasetFetcher, _MapDatasetFetcher +from .batch_sampler import _InfiniteIterableSampler __all__ = ['get_worker_info'] @@ -100,11 +101,13 @@ class _DatasetKind(object): ITER = 1 @staticmethod - def create_fetcher(kind, dataset, collate_fn, drop_last): + def create_fetcher(kind, dataset, auto_collate_batch, collate_fn, drop_last): if kind == _DatasetKind.MAP: - return _MapDatasetFetcher(dataset, collate_fn, drop_last) + return _MapDatasetFetcher(dataset, auto_collate_batch, + collate_fn, drop_last) elif kind == _DatasetKind.ITER: - return _IterableDatasetFetcher(dataset, collate_fn, drop_last) + return _IterableDatasetFetcher(dataset, auto_collate_batch, + collate_fn, drop_last) else: raise NotImplementedError("unknown Dataset kind {}".format(kind)) @@ -221,8 +224,7 @@ def __init__(self, loader): self._places = loader.places self._return_list = loader.return_list self._batch_sampler = loader.batch_sampler - self._sampler_iter = iter(loader.batch_sampler) - self._collate_fn = loader.collate_fn or default_collate_fn + self._auto_collate_batch = loader.auto_collate_batch self._num_workers = loader.num_workers 
self._use_buffer_reader = loader.use_buffer_reader self._use_shared_memory = loader.use_shared_memory @@ -231,6 +233,16 @@ def __init__(self, loader): self._dataset_kind = loader.dataset_kind self._pin_memory = loader.pin_memory + if self._auto_collate_batch: + self._sampler_iter = iter(loader.batch_sampler) + self._collate_fn = loader.collate_fn or default_collate_fn + else: + if self._dataset_kind == _DatasetKind.MAP: + self._sampler_iter = iter(list(range(len(self._dataset)))) + else: + self._sampler_iter = iter(_InfiniteIterableSampler(self._dataset, 1)) + self._collate_fn = loader.collate_fn + # LoDTensorBlockingQueue instance for create_py_reader and a thread # to put mini-batch data to self._blocking_queue, mini-batch data # will be get from: @@ -257,7 +269,8 @@ def __init__(self, loader): super(_DataLoaderIterSingleProcess, self).__init__(loader) self._dataset_fetcher = _DatasetKind.create_fetcher( - self._dataset_kind, self._dataset, self._collate_fn, True) + self._dataset_kind, self._dataset, self._auto_collate_batch, + self._collate_fn, True) # NOTE: len(self._places) batch data compose as an output # iteration, set blocking_queue can cache 2 iteration datas @@ -367,7 +380,7 @@ def __del__(self): # NOTE(chenweihang): _worker_loop must be top level method to be pickled def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, - collate_fn, init_fn, worker_id, num_workers, + auto_collate_batch, collate_fn, init_fn, worker_id, num_workers, use_shared_memory): try: # NOTE: [ mmap files clear ] When the child process exits unexpectedly, @@ -388,7 +401,7 @@ def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, if init_fn is not None: init_fn(worker_id) fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset, - collate_fn, True) + auto_collate_batch, collate_fn, True) except: init_exception = Exception("init_fn failed in worker {}: " \ "{}".format(worker_id, sys.exc_info())) @@ -511,8 +524,9 @@ def _init_workers(self): target=_worker_loop, args=(self._dataset, self._dataset_kind, indices_queue, self._data_queue, self._workers_done_event, - self._collate_fn, self._worker_init_fn, i, - self._num_workers, self._use_shared_memory)) + self._auto_collate_batch, self._collate_fn, + self._worker_init_fn, i, self._num_workers, + self._use_shared_memory)) worker.daemon = True worker.start() self._workers.append(worker) diff --git a/python/paddle/fluid/dataloader/fetcher.py b/python/paddle/fluid/dataloader/fetcher.py index 001b8b931da233..9382a704223704 100644 --- a/python/paddle/fluid/dataloader/fetcher.py +++ b/python/paddle/fluid/dataloader/fetcher.py @@ -14,8 +14,9 @@ class _DatasetFetcher(object): - def __init__(self, dataset, collate_fn, drop_last): + def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last): self.dataset = dataset + self.auto_collate_batch = auto_collate_batch self.collate_fn = collate_fn self.drop_last = drop_last @@ -25,29 +26,41 @@ def fetch(self, batch_indices): class _IterableDatasetFetcher(_DatasetFetcher): - def __init__(self, dataset, collate_fn, drop_last): - super(_IterableDatasetFetcher, self).__init__(dataset, collate_fn, - drop_last) + def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last): + super(_IterableDatasetFetcher, self).__init__(dataset, auto_collate_batch, + collate_fn, drop_last) self.dataset_iter = iter(dataset) def fetch(self, batch_indices): - data = [] - for _ in batch_indices: - try: - data.append(next(self.dataset_iter)) - except StopIteration: - break - if len(data) 
== 0 or (self.drop_last and - len(data) < len(batch_indices)): - raise StopIteration - return self.collate_fn(data) + if self.auto_collate_batch: + data = [] + for _ in batch_indices: + try: + data.append(next(self.dataset_iter)) + except StopIteration: + break + if len(data) == 0 or (self.drop_last and + len(data) < len(batch_indices)): + raise StopIteration + else: + data = next(self.dataset_iter) + + if self.collate_fn: + data = self.collate_fn(data) + return data class _MapDatasetFetcher(_DatasetFetcher): - def __init__(self, dataset, collate_fn, drop_last): - super(_MapDatasetFetcher, self).__init__(dataset, collate_fn, drop_last) + def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last): + super(_MapDatasetFetcher, self).__init__(dataset, auto_collate_batch, collate_fn, drop_last) def fetch(self, batch_indices): - data = [self.dataset[idx] for idx in batch_indices] - return self.collate_fn(data) + if self.auto_collate_batch: + data = [self.dataset[idx] for idx in batch_indices] + else: + data = self.dataset[batch_indices] + + if self.collate_fn: + data = self.collate_fn(data) + return data diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 0e7fd35f5842e6..4a50b3bc0c7dc5 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -163,6 +163,21 @@ class DataLoader(object): For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler` + **Disable automatic batching** + + In certain cases such as some NLP tasks, instead of automatic batching, + handling batching manually in dataset is needed by users. For these + cases, automatic batching is disabled if both :attr:`batch_size` and + :attr:`batch_sampler` is set as None, each data got from :attr:`dataset` + should be batched data and will be processed with function define by + :attr:`collate_fn` or :attr:`default_collate_fn`. + + + .. note:: + When automatic batching is disabled, :attr:`default_collate_fn` will + do nothing to data from dataset. + + Args: dataset(Dataset): the dataset to load data from, should be an instance of subclass of :code:`paddle.io.Dataset` or @@ -185,7 +200,7 @@ class DataLoader(object): batch_sampler(BatchSampler): an instance of `paddle.io.BatchSampler` to generate batch indices to draw samples from :attr:`dataset` and combine a batch. Default None. 
- batch_size(int): sample number in a mini-batch, a substitution + batch_size(int|None): sample number in a mini-batch, a substitution parameter for :attr:`batch_sampler`, if :attr:`batch_sampler` is not set, a default `paddle.io.BatchSampler` will be used and initialize by :attr:`batch_size`, :attr:`shuffle` and @@ -358,10 +373,15 @@ def __init__(self, "batch_size/shuffle/drop_last should not be set when " \ "batch_sampler is given" self.batch_sampler = batch_sampler + self.batch_size = None + elif batch_size is None: + self.batch_sampler = None + self.batch_size = None else: - assert batch_size is not None and batch_size > 0, \ - "batch_size should be a positive value when " \ + assert batch_size > 0, \ + "batch_size should be None or a positive value when " \ "batch_sampler is not given" + self.batch_size = batch_size if isinstance(dataset, IterableDataset): self.batch_sampler = _InfiniteIterableSampler(dataset, batch_size) @@ -372,13 +392,21 @@ def __init__(self, shuffle=shuffle, drop_last=drop_last) + self.auto_collate_batch = self.batch_sampler is not None + self.pin_memory = False if in_dygraph_mode(): self.pin_memory = True if use_pinned_memory( ) is None else use_pinned_memory() def __len__(self): - return len(self.batch_sampler) + if self.dataset_kind == _DatasetKind.ITER: + raise ValueError("length of IterableDataset not supported") + else: + if self.batch_size is None: + return len(self.dataset) + else: + return len(self.batch_sampler) def __iter__(self): if self.num_workers == 0: diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py index 1bb720673e4f33..c89354adf751c6 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py @@ -27,7 +27,7 @@ from paddle.fluid.dygraph.nn import Linear from paddle.fluid.dygraph.base import to_variable -from test_multiprocess_dataloader_static import RandomDataset, prepare_places +from test_multiprocess_dataloader_static import RandomDataset, RandomBatchedDataset, prepare_places from test_multiprocess_dataloader_static import EPOCH_NUM, BATCH_SIZE, IMAGE_SIZE, SAMPLE_NUM, CLASS_NUM @@ -122,5 +122,48 @@ def test_main(self): self.assertLess(diff, 1e-2) +class TestDygraphDataLoaderWithBatchedDataset(TestDygraphDataLoader): + def run_main(self, num_workers, places): + fluid.default_startup_program().random_seed = 1 + fluid.default_main_program().random_seed = 1 + with fluid.dygraph.guard(places[0]): + fc_net = SimpleFCNet() + optimizer = fluid.optimizer.Adam(parameter_list=fc_net.parameters()) + + dataset = RandomBatchedDataset(SAMPLE_NUM, CLASS_NUM) + dataloader = DataLoader( + dataset, + num_workers=num_workers, + batch_size=None, + drop_last=True) + assert len(dataloader) == int(SAMPLE_NUM / BATCH_SIZE) + + step_list = [] + loss_list = [] + start_t = time.time() + for _ in six.moves.range(EPOCH_NUM): + step = 0 + for image, label in dataloader(): + out = fc_net(image) + loss = fluid.layers.cross_entropy(out, label) + avg_loss = fluid.layers.reduce_mean(loss) + avg_loss.backward() + optimizer.minimize(avg_loss) + fc_net.clear_gradients() + + loss_list.append(np.mean(avg_loss.numpy())) + step += 1 + step_list.append(step) + + end_t = time.time() + ret = { + "time": end_t - start_t, + "step": step_list, + "loss": np.array(loss_list) + } + print("time cost", ret['time'], 'step_list', ret['step']) + return ret + + if __name__ == 
'__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py index 6fd14b40bc9108..74fe359cd7d597 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py @@ -188,7 +188,7 @@ def _collate_fn(sample_list): indices_queue.put(None) _worker_loop(loader._dataset, 0, indices_queue, loader._data_queue, loader._workers_done_event, - _collate_fn, _init_fn, 0, 1, + True, _collate_fn, _init_fn, 0, 1, loader._use_shared_memory) self.assertTrue(False) except AssertionError: @@ -232,7 +232,7 @@ def _collate_fn(sample_list): loader._workers_done_event.set() _worker_loop(loader._dataset, 0, indices_queue, loader._data_queue, loader._workers_done_event, - _collate_fn, _init_fn, 0, 1, + True, _collate_fn, _init_fn, 0, 1, loader._use_shared_memory) self.assertTrue(True) except AssertionError: diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py index af332d8e432092..0533a0d09fa0de 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py @@ -27,7 +27,7 @@ from paddle.fluid.dygraph.nn import Linear from paddle.fluid.dygraph.base import to_variable -from test_multiprocess_dataloader_iterable_dataset_static import RandomDataset, prepare_places +from test_multiprocess_dataloader_iterable_dataset_static import RandomDataset, RandomBatchedDataset, prepare_places from test_multiprocess_dataloader_iterable_dataset_static import EPOCH_NUM, BATCH_SIZE, IMAGE_SIZE, SAMPLE_NUM, CLASS_NUM @@ -119,5 +119,46 @@ def test_main(self): 0] +class TestDygraphDataLoaderWithBatchedDataset(TestDygraphDataLoader): + def run_main(self, num_workers, places): + fluid.default_startup_program().random_seed = 1 + fluid.default_main_program().random_seed = 1 + with fluid.dygraph.guard(places[0]): + fc_net = SimpleFCNet() + optimizer = fluid.optimizer.Adam(parameter_list=fc_net.parameters()) + + dataset = RandomBatchedDataset(SAMPLE_NUM, CLASS_NUM) + dataloader = DataLoader( + dataset, + num_workers=num_workers, + batch_size=None, + drop_last=True) + + step_list = [] + loss_list = [] + start_t = time.time() + for _ in six.moves.range(EPOCH_NUM): + step = 0 + for image, label in dataloader(): + out = fc_net(image) + loss = fluid.layers.cross_entropy(out, label) + avg_loss = fluid.layers.reduce_mean(loss) + avg_loss.backward() + optimizer.minimize(avg_loss) + fc_net.clear_gradients() + + loss_list.append(np.mean(avg_loss.numpy())) + step += 1 + step_list.append(step) + + end_t = time.time() + ret = { + "time": end_t - start_t, + "step": step_list, + "loss": np.array(loss_list) + } + print("time cost", ret['time'], 'step_list', ret['step']) + return ret + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py index e64e11d156ec74..4615bf85ce69f4 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py +++ 
b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py @@ -167,5 +167,80 @@ def test_main(self): 0] +class RandomBatchedDataset(IterableDataset): + def __init__(self, sample_num, class_num): + self.sample_num = sample_num // BATCH_SIZE + self.class_num = class_num + + def __iter__(self): + for i in range(self.sample_num): + np.random.seed(i) + images = [] + labels = [] + for _ in range(BATCH_SIZE): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, self.class_num - 1, + (1, )).astype('int64') + images.append(image) + labels.append(label) + yield np.stack(images, axis=0), np.stack(labels, axis=0) + + +class TestStaticDataLoaderWithBatchedDataset(TestStaticDataLoader): + def run_main(self, num_workers, places): + scope = fluid.Scope() + with fluid.scope_guard(scope): + startup_prog, main_prog, image, label, loss = simple_fc_net_static() + + dataset = RandomBatchedDataset(SAMPLE_NUM, CLASS_NUM) + dataloader = DataLoader( + dataset, + feed_list=[image, label], + places=places, + num_workers=num_workers, + batch_size=None, + drop_last=True) + + exe = fluid.Executor(place=places[0]) + exe.run(startup_prog) + + prog = fluid.CompiledProgram(main_prog) + if len(places) > 1: + prog = prog.with_data_parallel( + loss_name=loss.name, places=places) + + step_list = [] + loss_list = [] + start_t = time.time() + for i in six.moves.range(EPOCH_NUM): + step = 0 + for d in dataloader: + assert len(d) == len(places), "{} != {}".format( + len(d), len(places)) + for i, item in enumerate(d): + image = item['image'] + label = item['label'] + assert image.shape() == [BATCH_SIZE, IMAGE_SIZE] + assert label.shape() == [BATCH_SIZE, 1] + assert image._place()._equals(places[i]) + assert label._place()._equals(places[i]) + L, = exe.run(program=prog, + feed=d, + fetch_list=[loss], + use_program_cache=True) + loss_list.append(np.mean(L)) + step += 1 + step_list.append(step) + + end_t = time.time() + ret = { + "time": end_t - start_t, + "step": step_list, + "loss": np.array(loss_list) + } + print("time cost", ret['time'], 'step_list', ret['step']) + return ret + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py index c01e2e75b8195c..5ec907c290b946 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py @@ -215,5 +215,82 @@ def test_multi_place(self): assert isinstance(d[1], list) +class RandomBatchedDataset(Dataset): + def __init__(self, sample_num, class_num): + self.sample_num = int(sample_num / BATCH_SIZE) + self.class_num = class_num + + def __getitem__(self, idx): + np.random.seed(idx) + images = [] + labels = [] + for _ in range(BATCH_SIZE): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, self.class_num - 1, (1, )).astype('int64') + images.append(image) + labels.append(label) + return np.stack(images, axis=0), np.stack(labels, axis=0) + + def __len__(self): + return self.sample_num + + +class TestStaticDataLoaderWithBatchedDataset(TestStaticDataLoader): + def run_main(self, num_workers, places): + scope = fluid.Scope() + with fluid.scope_guard(scope): + startup_prog, main_prog, image, label, loss = simple_fc_net_static() + + dataset = RandomBatchedDataset(SAMPLE_NUM, CLASS_NUM) + dataloader = DataLoader( + dataset, + 
feed_list=[image, label], + places=places, + num_workers=num_workers, + batch_size=None, + drop_last=True) + assert len(dataloader) == int(SAMPLE_NUM / BATCH_SIZE) + + exe = fluid.Executor(place=places[0]) + exe.run(startup_prog) + + prog = fluid.CompiledProgram(main_prog) + if len(places) > 1: + prog = prog.with_data_parallel( + loss_name=loss.name, places=places) + + step_list = [] + loss_list = [] + start_t = time.time() + for _ in six.moves.range(EPOCH_NUM): + step = 0 + for d in dataloader: + assert len(d) == len(places), "{} != {}".format( + len(d), len(places)) + for i, item in enumerate(d): + image = item['image'] + label = item['label'] + assert image.shape() == [BATCH_SIZE, IMAGE_SIZE] + assert label.shape() == [BATCH_SIZE, 1] + assert image._place()._equals(places[i]) + assert label._place()._equals(places[i]) + L, = exe.run(program=prog, + feed=d, + fetch_list=[loss], + use_program_cache=True) + loss_list.append(np.mean(L)) + step += 1 + step_list.append(step) + + end_t = time.time() + ret = { + "time": end_t - start_t, + "step": step_list, + "loss": np.array(loss_list) + } + print("time cost", ret['time'], 'step_list', ret['step']) + return ret + + if __name__ == '__main__': unittest.main() From b889a0cee25ad81f02e740f1f2942fecfca8e11b Mon Sep 17 00:00:00 2001 From: pangyoki Date: Mon, 16 Nov 2020 16:45:45 +0800 Subject: [PATCH 16/56] add gaussian_random op_version (#28602) --- paddle/fluid/operators/gaussian_random_op.cc | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 17a71c67b8a084..fd2f48265ca6f4 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/fill_constant_op.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -197,3 +198,19 @@ REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel, REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like, ops::CPUGaussianRandomBatchSizeLikeKernel, ops::CPUGaussianRandomBatchSizeLikeKernel); +REGISTER_OP_VERSION(gaussian_random) + .AddCheckpoint( + R"ROC( + Upgrade gaussian_random add new inputs [ShapeTensor] and [ShapeTensorList] + and modify the attribute of [shape])ROC", + paddle::framework::compatible::OpVersionDesc() + .NewInput("ShapeTensor", + "The output shape supports Tensor type. ShapeTensor is " + "dispensable.") + .NewInput("ShapeTensorList", + "The output shape supports list filled with Tensor. 
" + "ShapeTensorList is dispensable.") + .ModifyAttr( + "shape", + "Add the default value of shape, the default value is {}.", + {})); From 72e068f1ba158fdd67511193f8fa567d6d791a8a Mon Sep 17 00:00:00 2001 From: pangyoki Date: Mon, 16 Nov 2020 18:06:39 +0800 Subject: [PATCH 17/56] fix test_multinomial (#28558) * fix test_multinomial * fix test_multinomial add 0 prob --- .../tests/unittests/test_multinomial_op.py | 64 +++++++++---------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_multinomial_op.py b/python/paddle/fluid/tests/unittests/test_multinomial_op.py index b22f6b80df79a0..957c06eca89c38 100644 --- a/python/paddle/fluid/tests/unittests/test_multinomial_op.py +++ b/python/paddle/fluid/tests/unittests/test_multinomial_op.py @@ -22,6 +22,26 @@ import numpy as np +def sample_output_one_dimension(out, dim): + # count numbers of different categories + sample_prob = np.zeros(dim).astype("float32") + sample_index_prob = np.unique(out, return_counts=True) + sample_prob[sample_index_prob[0]] = sample_index_prob[1] + sample_prob /= sample_prob.sum() + return sample_prob + + +def sample_output_two_dimension(out, shape): + num_dist = shape[0] + out_list = np.split(out, num_dist, axis=0) + sample_prob = np.zeros(shape).astype("float32") + for i in range(num_dist): + sample_index_prob = np.unique(out_list[i], return_counts=True) + sample_prob[i][sample_index_prob[0]] = sample_index_prob[1] + sample_prob /= sample_prob.sum(axis=-1, keepdims=True) + return sample_prob + + class TestMultinomialOp(OpTest): def setUp(self): paddle.enable_static() @@ -39,10 +59,7 @@ def test_check_output(self): self.check_output_customized(self.verify_output) def sample_output(self, out): - # count numbers of different categories - sample_prob = np.unique(out, return_counts=True)[1].astype("float32") - sample_prob /= sample_prob.sum() - return sample_prob + return sample_output_one_dimension(out, 4) def verify_output(self, outs): # normalize the input to get the probability @@ -62,14 +79,7 @@ def init_data(self): self.attrs = {"num_samples": 100000, "replacement": True} def sample_output(self, out): - out_list = np.split(out, 3, axis=0) - count_array = [0] * 3 - for i in range(3): - count_array[i] = np.unique( - out_list[i], return_counts=True)[1].astype("float32") - sample_prob = np.stack(count_array, axis=0) - sample_prob /= sample_prob.sum(axis=-1, keepdims=True) - return sample_prob + return sample_output_two_dimension(out, [3, 4]) class TestMultinomialOp3(TestMultinomialOp): @@ -91,15 +101,12 @@ class TestMultinomialApi(unittest.TestCase): def test_dygraph(self): # input probability is a vector, and replacement is True paddle.disable_static() - x = paddle.rand([4]) + x_numpy = np.random.rand(4) + x = paddle.to_tensor(x_numpy) out = paddle.multinomial(x, num_samples=100000, replacement=True) - x_numpy = x.numpy() paddle.enable_static() - sample_prob = np.unique( - out.numpy(), return_counts=True)[1].astype("float32") - sample_prob /= sample_prob.sum() - + sample_prob = sample_output_one_dimension(out.numpy(), 4) prob = x_numpy / x_numpy.sum(axis=-1, keepdims=True) self.assertTrue( np.allclose( @@ -109,18 +116,11 @@ def test_dygraph(self): def test_dygraph2(self): # input probability is a matrix, and replacement is True paddle.disable_static() - x = paddle.rand([3, 4]) + x_numpy = np.random.rand(3, 4) + x = paddle.to_tensor(x_numpy) out = paddle.multinomial(x, num_samples=100000, replacement=True) - x_numpy = x.numpy() - - out_list = np.split(out.numpy(), 3, 
axis=0) - count_array = [0] * 3 - for i in range(3): - count_array[i] = np.unique( - out_list[i], return_counts=True)[1].astype("float32") - sample_prob = np.stack(count_array, axis=0) - sample_prob /= sample_prob.sum(axis=-1, keepdims=True) + sample_prob = sample_output_two_dimension(out.numpy(), [3, 4]) prob = x_numpy / x_numpy.sum(axis=-1, keepdims=True) self.assertTrue( np.allclose( @@ -131,9 +131,9 @@ def test_dygraph2(self): def test_dygraph3(self): # replacement is False. number of samples must be less than number of categories. paddle.disable_static() - x = paddle.rand([1000]) + x_numpy = np.random.rand(1000) + x = paddle.to_tensor(x_numpy) out = paddle.multinomial(x, num_samples=100, replacement=False) - x_numpy = x.numpy() unique_out = np.unique(out.numpy()) self.assertEqual( @@ -158,9 +158,7 @@ def test_static(self): x_np = np.random.rand(4).astype('float32') out = exe.run(train_program, feed={'x': x_np}, fetch_list=[out]) - sample_prob = np.unique(out, return_counts=True)[1].astype("float32") - sample_prob /= sample_prob.sum() - + sample_prob = sample_output_one_dimension(out, 4) prob = x_np / x_np.sum(axis=-1, keepdims=True) self.assertTrue( np.allclose( From 804271cff9f43cd06409962b3bef80827374fa25 Mon Sep 17 00:00:00 2001 From: lidanqing Date: Mon, 16 Nov 2020 11:42:01 +0100 Subject: [PATCH 18/56] Op version python mkldnn_inplace test (#28354) * add mkldnn inplace op version test * update mkldnn_inplace fuse pass * update the inplace test --- .../ir/mkldnn/mkldnn_inplace_pass.cc | 8 +++ .../test_mkldnn_inplace_fuse_pass.py | 56 +++++++++++++++++++ tools/static_mode_white_list.py | 1 + 3 files changed, 65 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_inplace_fuse_pass.py diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc index 7bd94bf55ea21f..d655837f743369 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc @@ -17,10 +17,12 @@ #include #include #include +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -215,3 +217,9 @@ void MKLDNNInPlacePass::ApplyImpl(ir::Graph* graph) const { } // namespace paddle REGISTER_PASS(mkldnn_inplace_pass, paddle::framework::ir::MKLDNNInPlacePass); +REGISTER_PASS_CAPABILITY(mkldnn_inplace_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("softmax", 0) + .EQ("elementwise_add", 0) + .EQ("tanh", 0)); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_inplace_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_inplace_fuse_pass.py new file mode 100644 index 00000000000000..4215e56de2cc73 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_inplace_fuse_pass.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import AnalysisConfig +from paddle.fluid.core import PassVersionChecker + + +class MkldnnInplacePassTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + paddle.enable_static() + data = fluid.data( + name="data", shape=[-1, 3, 100, 100], dtype="float32") + conv_out_1 = fluid.layers.conv2d( + data, num_filters=3, filter_size=3, bias_attr=False) + softmax_out = fluid.layers.softmax(conv_out_1) + relu_out = fluid.layers.relu(conv_out_1) + eltwise_out = fluid.layers.elementwise_add( + softmax_out, relu_out, axis=-1) + + self.pass_name = 'mkldnn_inplace_pass' + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [softmax_out, relu_out, eltwise_out] + self.enable_mkldnn = True + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + def test_pass_compatible(self): + self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 6a2a121cd616f9..1f153442aff6c6 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -611,6 +611,7 @@ 'test_mkldnn_matmul_op_output_fuse_pass', 'test_mkldnn_matmul_transpose_reshape_fuse_pass', 'test_mkldnn_scale_matmul_fuse_pass', + 'test_mkldnn_inplace_fuse_pass', 'test_batch_fc_op', 'test_c_comm_init_all_op', 'test_conv2d_fusion_op', From 2cb71c0cde2835e4f7e0d6862f49ab1c56f029c4 Mon Sep 17 00:00:00 2001 From: "joanna.wozna.intel" Date: Mon, 16 Nov 2020 11:43:33 +0100 Subject: [PATCH 19/56] Add checkpoint to quantize (#28612) * Add checkpoint to quantize * Change bfloat16 option --- .../framework/ir/mkldnn/cpu_bfloat16_pass.cc | 6 ++ paddle/fluid/operators/quantize_op.cc | 8 +++ .../ir/inference/inference_pass_test.py | 5 ++ .../test_mkldnn_cpu_bfloat16_pass.py | 58 +++++++++++++++++++ 4 files changed, 77 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_cpu_bfloat16_pass.py diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc index df498865245fc8..ae93025e784e38 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/string/pretty_log.h" @@ -157,3 +158,8 @@ void CPUBFloat16Pass::ApplyImpl(ir::Graph* graph) const { } // namespace paddle REGISTER_PASS(cpu_bfloat16_pass, paddle::framework::ir::CPUBFloat16Pass); + +REGISTER_PASS_CAPABILITY(cpu_bfloat16_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination().GE( + "quantize", 1)); diff --git a/paddle/fluid/operators/quantize_op.cc b/paddle/fluid/operators/quantize_op.cc index ee5829319d2a62..f21243de834177 100644 --- a/paddle/fluid/operators/quantize_op.cc +++ b/paddle/fluid/operators/quantize_op.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/quantize_op.h" +#include "paddle/fluid/framework/op_version_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -54,3 +55,10 @@ void QuantOpMaker::Make() { namespace ops = paddle::operators; REGISTER_OPERATOR(quantize, ops::QuantOp, ops::QuantOpMaker); + +REGISTER_OP_VERSION(quantize) + .AddCheckpoint( + R"ROC( Add a new attribute [bfloat16])ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "bfloat16", "If true, float32 input is converted to bfloat16", + false)); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py index 0209bb344ece7c..18715f10c5cd36 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py @@ -43,6 +43,7 @@ def __init__(self, methodName='runTest'): self.fetch_list = None self.enable_mkldnn = False + self.enable_mkldnn_bfloat16 = False self.enable_trt = False self.trt_parameters = None self.enable_lite = False @@ -125,6 +126,8 @@ def _get_analysis_config(self, self.trt_parameters.use_calib_mode) elif use_mkldnn: config.enable_mkldnn() + if self.enable_mkldnn_bfloat16: + config.enable_mkldnn_bfloat16() return config @@ -251,6 +254,8 @@ def check_output_with_option(self, len(outs) == len(mkldnn_outputs), "The number of outputs is different between CPU and MKLDNN. ") + if self.enable_mkldnn_bfloat16: + atol = 0.01 for out, mkldnn_output in zip(outs, mkldnn_outputs): self.assertTrue( np.allclose( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_cpu_bfloat16_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_cpu_bfloat16_pass.py new file mode 100644 index 00000000000000..0a4d460d1fbbf4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_cpu_bfloat16_pass.py @@ -0,0 +1,58 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +from paddle.fluid.core import PassVersionChecker + + +class TestMKLDNNCpuBfloat16Pass(InferencePassTest): + def setUp(self): + self.init_data() + with fluid.program_guard(self.main_program, self.startup_program): + x = fluid.data( + name='x', shape=[-1] + self.shape_x, dtype=self.d_type) + y = fluid.data( + name='y', shape=[-1] + self.shape_y, dtype=self.d_type) + out = fluid.layers.matmul(x, y) + out = fluid.layers.transpose(out, perm=[0, 1, 2, 3]) + out = fluid.layers.reshape(out, [0, 0, 0, 0]) + out = fluid.layers.fc(out, size=1) + + self.feeds = { + "x": + np.random.random([self.bs] + self.shape_x).astype(self.d_type), + "y": + np.random.random([self.bs] + self.shape_y).astype(self.d_type) + } + self.fetch_list = [out] + + def init_data(self): + self.bs = 8 + self.d_type = np.float32 + self.shape_x = [12, 10, 1] + self.shape_y = [12, 1, 64] + self.enable_mkldnn = True + self.enable_mkldnn_bfloat16 = True + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu, flatten=True) + self.assertTrue(PassVersionChecker.IsCompatible('cpu_bfloat16_pass')) + + +if __name__ == "__main__": + unittest.main() From ece1e4cd9de38375a2b5007fe5e8d69a521d9a7b Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Mon, 16 Nov 2020 19:19:39 +0800 Subject: [PATCH 20/56] Add weighted random sampler (#28545) * add WeightedRandomSampler. test=develop --- python/paddle/fluid/dataloader/sampler.py | 87 +++++++++++++++++- .../tests/unittests/test_batch_sampler.py | 92 +++++++++++++++++-- python/paddle/io/__init__.py | 3 +- 3 files changed, 171 insertions(+), 11 deletions(-) diff --git a/python/paddle/fluid/dataloader/sampler.py b/python/paddle/fluid/dataloader/sampler.py index 5c75fafe8b2238..7207ebcbacfdb0 100644 --- a/python/paddle/fluid/dataloader/sampler.py +++ b/python/paddle/fluid/dataloader/sampler.py @@ -16,8 +16,11 @@ from __future__ import division import numpy as np +from .. 
import core
 
-__all__ = ["Sampler", "SequenceSampler", "RandomSampler"]
+__all__ = [
+    "Sampler", "SequenceSampler", "RandomSampler", "WeightedRandomSampler"
+]
 
 
 class Sampler(object):
@@ -234,3 +237,85 @@ def __iter__(self):
 
     def __len__(self):
         return self.num_samples
+
+
+def _weighted_sample(weights, num_samples, replacement=True):
+    if isinstance(weights, core.LoDTensor):
+        weights = weights.numpy()
+    if isinstance(weights, (list, tuple)):
+        weights = np.array(weights)
+    assert isinstance(weights, np.ndarray), \
+        "weights should be paddle.Tensor, numpy.ndarray, list or tuple"
+    assert len(weights.shape) <= 2, \
+        "weights should be a 1-D or 2-D array"
+    weights = weights.reshape((-1, weights.shape[-1]))
+    assert np.all(weights >= 0.), \
+        "weights should be positive value"
+    assert not np.any(weights == np.inf), \
+        "weights should not be INF"
+    assert not np.any(weights == np.nan), \
+        "weights should not be NaN"
+
+    non_zeros = np.sum(weights > 0., axis=1)
+    assert np.all(non_zeros > 0), \
+        "weights should have positive values"
+    if not replacement:
+        assert np.all(non_zeros >= num_samples), \
+            "weights positive value number should not " \
+            "be less than num_samples when replacement=False"
+
+    weights = weights / weights.sum(axis=1)
+    rets = []
+    for i in range(weights.shape[0]):
+        ret = np.random.choice(weights.shape[1], num_samples, replacement,
+                               weights[i])
+        rets.append(ret)
+    return np.array(rets)
+
+
+class WeightedRandomSampler(Sampler):
+    """
+    Random sample with given weights (probabilities). The sample index will be in
+    the range [0, len(weights) - 1]. If :attr:`replacement` is True, an index can be
+    sampled multiple times.
+
+    Args:
+        weights(numpy.ndarray|paddle.Tensor|list|tuple): sequence of weights,
+            should be numpy array, paddle.Tensor, list or tuple
+        num_samples(int): set sample number to draw from sampler.
+        replacement(bool): Whether to draw samples with replacement, default True
+
+    Returns:
+        Sampler: a Sampler that yields sample indices randomly by the given weights
+
+    Examples:
+
+        ..
code-block:: python + + from paddle.io import WeightedRandomSampler + + sampler = WeightedRandomSampler(weights=[0.1, 0.3, 0.5, 0.7, 0.2], + num_samples=5, + replacement=True) + + for index in sampler: + print(index) + """ + + def __init__(self, weights, num_samples, replacement=True): + if not isinstance(num_samples, int) or num_samples <= 0: + raise ValueError("num_samples should be a positive integer") + if not isinstance(replacement, bool): + raise ValueError("replacement should be a boolean value") + self.weights = weights + self.num_samples = num_samples + self.replacement = replacement + + def __iter__(self): + idxs = _weighted_sample(self.weights, self.num_samples, + self.replacement) + return iter(idxs.reshape((-1)).tolist()) + + def __len__(self): + mul = np.prod(self.weights.shape) // self.weights.shape[-1] + return self.num_samples * mul diff --git a/python/paddle/fluid/tests/unittests/test_batch_sampler.py b/python/paddle/fluid/tests/unittests/test_batch_sampler.py index 4faef77dad40dd..4c323a2511f5b6 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_sampler.py +++ b/python/paddle/fluid/tests/unittests/test_batch_sampler.py @@ -16,8 +16,10 @@ import unittest +import numpy as np import paddle.fluid as fluid -from paddle.io import BatchSampler, Dataset, Sampler, SequenceSampler, RandomSampler +from paddle.io import BatchSampler, Dataset, Sampler, SequenceSampler, \ + RandomSampler, WeightedRandomSampler from paddle.io import DistributedBatchSampler @@ -195,14 +197,86 @@ def test_main(self): pass -class TestDistributedBatchSamplerWithSampler(TestBatchSampler): - def init_batch_sampler(self): - dataset = RandomDataset(1000, 10) - bs = DistributedBatchSampler( - dataset=dataset, - batch_size=self.batch_size, - drop_last=self.drop_last) - return bs +class TestWeightedRandomSampler(unittest.TestCase): + def init_probs(self, total, pos): + pos_probs = np.random.random((pos, )).astype('float32') + probs = np.zeros((total, )).astype('float32') + probs[:pos] = pos_probs + np.random.shuffle(probs) + return probs + + def test_replacement(self): + probs = self.init_probs(20, 10) + sampler = WeightedRandomSampler(probs, 30, True) + assert len(sampler) == 30 + for idx in iter(sampler): + assert probs[idx] > 0. + + def test_no_replacement(self): + probs = self.init_probs(20, 10) + sampler = WeightedRandomSampler(probs, 10, False) + assert len(sampler) == 10 + idxs = [] + for idx in iter(sampler): + assert probs[idx] > 0. 
+ idxs.append(idx) + assert len(set(idxs)) == len(idxs) + + def test_assert(self): + # all zeros + probs = np.zeros((10, )).astype('float32') + sampler = WeightedRandomSampler(probs, 10, True) + try: + for idx in iter(sampler): + pass + self.assertTrue(False) + except AssertionError: + self.assertTrue(True) + + # not enough pos + probs = self.init_probs(10, 5) + sampler = WeightedRandomSampler(probs, 10, False) + try: + for idx in iter(sampler): + pass + self.assertTrue(False) + except AssertionError: + self.assertTrue(True) + + # neg probs + probs = -1.0 * np.ones((10, )).astype('float32') + sampler = WeightedRandomSampler(probs, 10, True) + try: + for idx in iter(sampler): + pass + self.assertTrue(False) + except AssertionError: + self.assertTrue(True) + + def test_raise(self): + # float num_samples + probs = self.init_probs(10, 5) + try: + sampler = WeightedRandomSampler(probs, 2.3, True) + self.assertTrue(False) + except ValueError: + self.assertTrue(True) + + # neg num_samples + probs = self.init_probs(10, 5) + try: + sampler = WeightedRandomSampler(probs, -1, True) + self.assertTrue(False) + except ValueError: + self.assertTrue(True) + + # no-bool replacement + probs = self.init_probs(10, 5) + try: + sampler = WeightedRandomSampler(probs, 5, 5) + self.assertTrue(False) + except ValueError: + self.assertTrue(True) if __name__ == '__main__': diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py index b4e437a97dd22b..e8b07528019c51 100644 --- a/python/paddle/io/__init__.py +++ b/python/paddle/io/__init__.py @@ -27,9 +27,10 @@ 'Sampler', 'SequenceSampler', 'RandomSampler', + 'WeightedRandomSampler', ] from ..fluid.io import DataLoader from ..fluid.dataloader import Dataset, IterableDataset, BatchSampler, get_worker_info, \ TensorDataset, Sampler, SequenceSampler, RandomSampler, DistributedBatchSampler, \ - ComposeDataset, ChainDataset + ComposeDataset, ChainDataset, WeightedRandomSampler From a972c33fd7b93a24cc199ad4f3ae01ea371d3972 Mon Sep 17 00:00:00 2001 From: wangchaochaohu Date: Mon, 16 Nov 2020 19:33:33 +0800 Subject: [PATCH 21/56] refine gather OP performance for dynamic mode (#28587) --- paddle/fluid/operators/gather_op.cc | 9 +++++++++ python/paddle/tensor/manipulation.py | 5 ++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 648afe7e8215fe..162766546b3c26 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -93,6 +93,15 @@ class GatherGradOp : public framework::OperatorWithKernel { ctx, framework::GradVarName("Out")), ctx.device_context()); } + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "Axis") { + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } }; class GatherOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 4a01f7e7fa311e..adad9cfdc26671 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -785,9 +785,12 @@ def gather(x, index, axis=None, name=None): if axis is None: axis = 0 axis_tensor = axis + if not isinstance(axis, Variable) and axis == 0: + return paddle.fluid.layers.gather(input=x, index=index, overwrite=True) if not isinstance(axis, 
Variable): with device_guard("cpu"): - axis_tensor = fill_constant(shape=[1], dtype='int64', value=axis) + axis_tensor = fill_constant( + shape=[1], dtype='int64', value=axis, force_cpu=True) if in_dygraph_mode(): return core.ops.gather(x, index, axis_tensor) From 8f2656ef5ca4ab16f06d94b8ca9392d3f0f760ae Mon Sep 17 00:00:00 2001 From: wawltor Date: Mon, 16 Nov 2020 20:21:46 +0800 Subject: [PATCH 22/56] fix the gradient bug for the topk v2 fix the gradient bug for the topk v2 --- paddle/fluid/operators/top_k_function_cuda.h | 12 ++++--- .../fluid/tests/unittests/test_top_k_v2_op.py | 32 +++++++++++-------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index 57891699fd2ad7..0fd5f2ac01df3f 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -335,6 +335,7 @@ __global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad, for (size_t j = 0; j < cols; ++j) { x_grad[i * cols + j] = 0; } + __syncthreads(); for (size_t j = 0; j < k; ++j) { size_t idx = indices[i * k + j]; x_grad[i * cols + idx] = out_grad[i * k + j]; @@ -349,15 +350,16 @@ __global__ void AssignGradWithAxis(const T* grad_out, const int64_t* indices, int raw_height, int k) { // raw_height is the length of topk axis for (int i = blockIdx.x; i < pre; i += gridDim.x) { - const int& base_index = i * post * k; - const int& base_grad = i * post * raw_height; + int base_index = i * post * k; + int base_grad = i * post * raw_height; for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) { grad_in[base_grad + j] = static_cast(0); } + __syncthreads(); for (int j = threadIdx.x; j < k * post; j += blockDim.x) { - const int64_t idx_ij = indices[base_index + j]; - const int64_t in_ij = base_grad + (idx_ij * post) + (j % post); - grad_in[in_ij] = grad_out[idx_ij]; + int64_t idx_ij = indices[base_index + j]; + int64_t in_ij = base_grad + (idx_ij * post) + (j % post); + grad_in[in_ij] = grad_out[base_index + j]; } } } diff --git a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py index b9d96f329b5bb4..94dcf151150ff2 100644 --- a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py @@ -64,34 +64,38 @@ def test_check_grad(self): class TestTopkOp1(TestTopkOp): - def init_args(self): - self.k = 3 - self.axis = 0 - self.largest = True - - -class TestTopkOp2(TestTopkOp): def init_args(self): self.k = 3 self.axis = 0 self.largest = False -class TestTopkOp3(TestTopkOp): +class TestTopkOp2(TestTopkOp): def init_args(self): self.k = 4 self.axis = 0 self.largest = False -class TestTopkOp4(TestTopkOp): +class TestTopkOp3(OpTest): def init_args(self): - self.k = 4 - self.axis = 0 - self.largest = False + self.k = 6 + self.axis = 1 + self.largest = True + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float64 + self.input_data = np.random.rand(16, 100) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} -class TestTopkOp5(TestTopkOp): + +class TestTopkOp4(TestTopkOp): def init_args(self): self.k = 3 self.axis = 1 @@ -109,7 +113,7 @@ def setUp(self): self.outputs = {'Out': output, 'Indices': indices} 
-class TestTopkOp6(TestTopkOp): +class TestTopkOp5(TestTopkOp): def init_args(self): self.k = 3 self.axis = 1 From b2f7ab6636d5e7fcc3bfd655c071416f190ed619 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Mon, 16 Nov 2020 20:58:29 +0800 Subject: [PATCH 23/56] bug fix, test=develop (#28648) --- paddle/fluid/operators/collective/recv_v2_op.cu.cc | 2 +- paddle/fluid/operators/collective/send_v2_op.cu.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index f0dd8aee23588c..892056f21359dd 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -26,6 +26,7 @@ template class RecvOpV2CUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { +#if defined(PADDLE_WITH_NCCL) && NCCL_VERSION_CODE >= 2703 int rid = ctx.Attr("ring_id"); PADDLE_ENFORCE_GE( rid, 0, @@ -44,7 +45,6 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { framework::proto::VarType::Type type = framework::proto::VarType::Type(data_type); -#if defined(PADDLE_WITH_NCCL) && NCCL_VERSION_CODE >= 2703 cudaStream_t stream = nullptr; auto place = ctx.GetPlace(); auto comm = platform::NCCLCommContext::Instance().Get(rid, place); diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index 9f925b2eede027..4de3f47ccc66b3 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -26,6 +26,7 @@ template class SendOpV2CUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_NCCL) && NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); int numel = x->numel(); @@ -42,7 +43,6 @@ class SendOpV2CUDAKernel : public framework::OpKernel { "The peer (%d) for send_v2 op must be non-negative.", peer)); cudaStream_t stream = nullptr; auto place = ctx.GetPlace(); -#if defined(PADDLE_WITH_NCCL) && NCCL_VERSION_CODE >= 2703 auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); From d1e84f3e9e46f0776653d014faf318319c56679c Mon Sep 17 00:00:00 2001 From: cc <52520497+juncaipeng@users.noreply.github.com> Date: Mon, 16 Nov 2020 21:18:37 +0800 Subject: [PATCH 24/56] Add some ops for cacluating output scale, test=develop (#28644) --- .../fluid/contrib/slim/quantization/quantization_pass.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index eba881a2637aec..68bf9ecd80be4c 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -69,6 +69,10 @@ "hard_swish", "hard_sigmoid", "conv2d_transpose", + "gru", + "bilinear_interp", + "nearest_interp", + "trilinear_interp", ] # list op real input and output names, to avoid processing input such as AxisTensor. 
@@ -114,6 +118,7 @@ "scale": [["X"], ["Out"]], "hard_swish": [["X"], ["Out"]], "hard_sigmoid": [["X"], ["Out"]], + "gru": [["Input", "Weight"], ["Hidden"]], } _conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose'] From 361a53930f9162bb79af4f0d985350b44e84c762 Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Mon, 16 Nov 2020 22:26:00 +0800 Subject: [PATCH 25/56] fix doc of save/load (#28645) --- python/paddle/framework/io.py | 2 -- python/paddle/optimizer/lr.py | 6 ++++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 945c8160b47fbd..d794fce5e378dd 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -228,7 +228,6 @@ def save(obj, path): emb = paddle.nn.Embedding(10, 10) layer_state_dict = emb.state_dict() paddle.save(layer_state_dict, "emb.pdparams") - scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( @@ -320,7 +319,6 @@ def load(path, **configs): emb = paddle.nn.Embedding(10, 10) layer_state_dict = emb.state_dict() paddle.save(layer_state_dict, "emb.pdparams") - scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index e4fb54c229f212..2d5dc5d998e638 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -1434,7 +1434,8 @@ class CosineAnnealingDecay(LRScheduler): loss.backward() sgd.step() sgd.clear_gradients() - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch # train on static graph mode paddle.enable_static() @@ -1460,7 +1461,8 @@ class CosineAnnealingDecay(LRScheduler): 'y': np.random.randn(3, 4, 5).astype('float32') }, fetch_list=loss.name) - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch """ def __init__(self, From a083c76ab406093a5b6ffa9befd3c7b04c991b23 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 16 Nov 2020 22:26:44 +0800 Subject: [PATCH 26/56] adjust signal failed wait time (#28640) --- .../fluid/tests/unittests/test_imperative_signal_handler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py b/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py index d783a2cc752d2a..775bf7941aaff9 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py @@ -49,7 +49,7 @@ def __test_process__(): test_process.start() set_child_signal_handler(id(self), test_process.pid) - time.sleep(5) + time.sleep(10) except SystemError as ex: self.assertIn("Fatal", cpt.get_exception_message(ex)) exception = ex @@ -67,7 +67,7 @@ def __test_process__(): test_process.start() set_child_signal_handler(id(self), test_process.pid) - time.sleep(3) + time.sleep(10) except SystemError as ex: self.assertIn("Segmentation fault", cpt.get_exception_message(ex)) exception = ex @@ -85,7 +85,7 @@ def __test_process__(): test_process.start() set_child_signal_handler(id(self), test_process.pid) - time.sleep(3) + time.sleep(10) except SystemError as ex: self.assertIn("Bus error", cpt.get_exception_message(ex)) exception = ex From 
2cd10fc4657da0f29e4e77b6df1b25ec4707a3b2 Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Tue, 17 Nov 2020 09:38:13 +0800 Subject: [PATCH 27/56] fix 2.0 api docs (#28445) --- python/paddle/fluid/layers/nn.py | 40 +++----- python/paddle/nn/functional/activation.py | 45 ++++----- python/paddle/nn/layer/activation.py | 39 +++---- python/paddle/tensor/creation.py | 54 ++++------ python/paddle/tensor/random.py | 118 ++++++++++++++-------- python/paddle/tensor/stat.py | 28 ++--- 6 files changed, 154 insertions(+), 170 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3ac43df872e377..2feca60430dc04 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9730,15 +9730,13 @@ def swish(x, beta=1.0, name=None): return out -@deprecated(since="2.0.0", update_to="paddle.nn.functional.prelu") +@deprecated(since="2.0.0", update_to="paddle.static.nn.prelu") def prelu(x, mode, param_attr=None, name=None): """ - :api_attr: Static Graph - - Equation: + prelu activation. .. math:: - y = \max(0, x) + \\alpha * \min(0, x) + prelu(x) = max(0, x) + \\alpha * min(0, x) There are three modes for the activation: @@ -9748,34 +9746,28 @@ def prelu(x, mode, param_attr=None, name=None): channel: Elements in same channel share same alpha. element: All elements do not share alpha. Each element has its own alpha. - Args: - x (Variable): The input Tensor or LoDTensor with data type float32. + Parameters: + x (Tensor): The input Tensor or LoDTensor with data type float32. mode (str): The mode for weight sharing. - param_attr(ParamAttr|None): The parameter attribute for the learnable - weight (alpha), it can be create by ParamAttr. None by default. - For detailed information, please refer to :ref:`api_fluid_ParamAttr`. - name(str|None): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + param_attr (ParamAttr|None, optional): The parameter attribute for the learnable + weight (alpha), it can be create by ParamAttr. None by default. + For detailed information, please refer to :ref:`api_fluid_ParamAttr`. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. Returns: - Variable: - - output(Variable): The tensor or LoDTensor with the same shape as input. - The data type is float32. + Tensor: A tensor with the same shape and data type as x. Examples: .. code-block:: python - import paddle.fluid as fluid import paddle - paddle.enable_static() - from paddle.fluid.param_attr import ParamAttr - x = fluid.data(name="x", shape=[None,5,10,10], dtype="float32") - mode = 'channel' - output = fluid.layers.prelu( - x,mode,param_attr=ParamAttr(name='alpha')) + + x = paddle.to_tensor([-1., 2., 3.]) + param = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.2)) + out = paddle.static.nn.prelu(x, 'all', param) + # [-0.2, 2., 3.] """ check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'prelu') diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index fd86c2e9fa760d..e7adc7106a4f09 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -79,9 +79,8 @@ def elu(x, alpha=1.0, name=None): import paddle import paddle.nn.functional as F - import numpy as np - x = paddle.to_tensor(np.array([[-1,6],[1,15.6]])) + x = paddle.to_tensor([[-1., 6.], [1., 15.6]]) out = F.elu(x, alpha=0.2) # [[-0.12642411 6. ] # [ 1. 
15.6 ]] @@ -131,11 +130,14 @@ def gelu(x, approximate=False, name=None): import paddle import paddle.nn.functional as F - import numpy as np - x = paddle.to_tensor(np.array([[-1, 0.5],[1, 1.5]])) - out1 = F.gelu(x) # [-0.158655 0.345731 0.841345 1.39979] - out2 = F.gelu(x, True) # [-0.158808 0.345714 0.841192 1.39957] + x = paddle.to_tensor([[-1, 0.5], [1, 1.5]]) + out1 = F.gelu(x) + # [[-0.15865529, 0.34573123], + # [ 0.84134471, 1.39978933]] + out2 = F.gelu(x, True) + # [[-0.15880799, 0.34571400], + # [ 0.84119201, 1.39957154]] """ if in_dygraph_mode(): @@ -181,11 +183,8 @@ def hardshrink(x, threshold=0.5, name=None): import paddle import paddle.nn.functional as F - import numpy as np - - paddle.disable_static() - x = paddle.to_tensor(np.array([-1, 0.3, 2.5])) + x = paddle.to_tensor([-1, 0.3, 2.5]) out = F.hardshrink(x) # [-1., 0., 2.5] """ @@ -385,11 +384,8 @@ def leaky_relu(x, negative_slope=0.01, name=None): import paddle import paddle.nn.functional as F - import numpy as np - - paddle.disable_static() - x = paddle.to_tensor(np.array([-2, 0, 1], 'float32')) + x = paddle.to_tensor([-2., 0., 1.]) out = F.leaky_relu(x) # [-0.02, 0., 1.] """ @@ -1147,8 +1143,10 @@ def log_softmax(x, axis=-1, dtype=None, name=None): .. math:: - log\\_softmax[i, j] = log(softmax(x)) - = log(\\frac{\exp(X[i, j])}{\\sum_j(exp(X[i, j])}) + \\begin{aligned} + log\\_softmax[i, j] &= log(softmax(x)) \\\\ + &= log(\\frac{\\exp(X[i, j])}{\\sum_j(\\exp(X[i, j])}) + \\end{aligned} Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -1174,16 +1172,13 @@ def log_softmax(x, axis=-1, dtype=None, name=None): import paddle import paddle.nn.functional as F - import numpy as np - - paddle.disable_static() - x = np.array([[[-2.0, 3.0, -4.0, 5.0], - [3.0, -4.0, 5.0, -6.0], - [-7.0, -8.0, 8.0, 9.0]], - [[1.0, -2.0, -3.0, 4.0], - [-5.0, 6.0, 7.0, -8.0], - [6.0, 7.0, 8.0, 9.0]]], 'float32') + x = [[[-2.0, 3.0, -4.0, 5.0], + [3.0, -4.0, 5.0, -6.0], + [-7.0, -8.0, 8.0, 9.0]], + [[1.0, -2.0, -3.0, 4.0], + [-5.0, 6.0, 7.0, -8.0], + [6.0, 7.0, 8.0, 9.0]]] x = paddle.to_tensor(x) out1 = F.log_softmax(x) out2 = F.log_softmax(x, dtype='float64') diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 32979bae34d803..520762107db07e 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -70,9 +70,8 @@ class ELU(layers.Layer): .. code-block:: python import paddle - import numpy as np - x = paddle.to_tensor(np.array([[-1,6],[1,15.6]])) + x = paddle.to_tensor([[-1. ,6.], [1., 15.6]]) m = paddle.nn.ELU(0.2) out = m(x) # [[-0.12642411 6. ] @@ -166,11 +165,8 @@ class Hardshrink(layers.Layer): .. code-block:: python import paddle - import numpy as np - paddle.disable_static() - - x = paddle.to_tensor(np.array([-1, 0.3, 2.5])) + x = paddle.to_tensor([-1, 0.3, 2.5]) m = paddle.nn.Hardshrink() out = m(x) # [-1., 0., 2.5] """ @@ -293,11 +289,10 @@ class Hardtanh(layers.Layer): .. code-block:: python import paddle - import numpy as np - x = paddle.to_tensor(np.array([-1.5, 0.3, 2.5])) + x = paddle.to_tensor([-1.5, 0.3, 2.5]) m = paddle.nn.Hardtanh() - out = m(x) # # [-1., 0.3, 1.] + out = m(x) # [-1., 0.3, 1.] """ def __init__(self, min=-1.0, max=1.0, name=None): @@ -397,9 +392,8 @@ class ReLU(layers.Layer): .. code-block:: python import paddle - import numpy as np - x = paddle.to_tensor(np.array([-2, 0, 1]).astype('float32')) + x = paddle.to_tensor([-2., 0., 1.]) m = paddle.nn.ReLU() out = m(x) # [0., 0., 1.] 
""" @@ -613,7 +607,7 @@ class Hardsigmoid(layers.Layer): import paddle - m = paddle.nn.Sigmoid() + m = paddle.nn.Hardsigmoid() x = paddle.to_tensor([-4., 5., 1.]) out = m(x) # [0., 1, 0.666667] """ @@ -1016,8 +1010,10 @@ class LogSoftmax(layers.Layer): .. math:: - Out[i, j] = log(softmax(x)) - = log(\\frac{\exp(X[i, j])}{\\sum_j(exp(X[i, j])}) + \\begin{aligned} + Out[i, j] &= log(softmax(x)) \\\\ + &= log(\\frac{\\exp(X[i, j])}{\\sum_j(\\exp(X[i, j])}) + \\end{aligned} Parameters: axis (int, optional): The axis along which to perform log_softmax @@ -1035,16 +1031,13 @@ class LogSoftmax(layers.Layer): .. code-block:: python import paddle - import numpy as np - - paddle.disable_static() - x = np.array([[[-2.0, 3.0, -4.0, 5.0], - [3.0, -4.0, 5.0, -6.0], - [-7.0, -8.0, 8.0, 9.0]], - [[1.0, -2.0, -3.0, 4.0], - [-5.0, 6.0, 7.0, -8.0], - [6.0, 7.0, 8.0, 9.0]]]) + x = [[[-2.0, 3.0, -4.0, 5.0], + [3.0, -4.0, 5.0, -6.0], + [-7.0, -8.0, 8.0, 9.0]], + [[1.0, -2.0, -3.0, 4.0], + [-5.0, 6.0, 7.0, -8.0], + [6.0, 7.0, 8.0, 9.0]]] m = paddle.nn.LogSoftmax() x = paddle.to_tensor(x) out = m(x) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index a69bc64c4cf669..622ae3c584ef04 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -300,9 +300,6 @@ def ones(shape, dtype=None, name=None): def ones_like(x, dtype=None, name=None): """ - :alias_main: paddle.ones_like - :alias: paddle.tensor.ones_like, paddle.tensor.creation.ones_like - This OP returns a Tensor filled with the value 1, with the same shape and data type (use ``dtype`` if ``dtype`` is not None) as ``x``. @@ -323,18 +320,16 @@ def ones_like(x, dtype=None, name=None): Raise: TypeError: If ``dtype`` is not None and is not bool, float16, float32, - float64, int32 or int64. + float64, int32 or int64. Examples: .. code-block:: python import paddle - paddle.disable_static() - x = paddle.to_tensor([1,2,3]) - out1 = paddle.zeros_like(x) # [1., 1., 1.] - out2 = paddle.zeros_like(x, dtype='int32') # [1, 1, 1] + out1 = paddle.ones_like(x) # [1., 1., 1.] + out2 = paddle.ones_like(x, dtype='int32') # [1, 1, 1] """ return full_like(x=x, fill_value=1, dtype=dtype, name=name) @@ -380,9 +375,6 @@ def zeros(shape, dtype=None, name=None): def zeros_like(x, dtype=None, name=None): """ - :alias_main: paddle.zeros_like - :alias: paddle.tensor.zeros_like, paddle.tensor.creation.zeros_like - This OP returns a Tensor filled with the value 0, with the same shape and data type (use ``dtype`` if ``dtype`` is not None) as ``x``. @@ -403,16 +395,14 @@ def zeros_like(x, dtype=None, name=None): Raise: TypeError: If ``dtype`` is not None and is not bool, float16, float32, - float64, int32 or int64. + float64, int32 or int64. Examples: .. code-block:: python import paddle - paddle.disable_static() - - x = paddle.to_tensor([1,2,3]) + x = paddle.to_tensor([1, 2, 3]) out1 = paddle.zeros_like(x) # [0., 0., 0.] out2 = paddle.zeros_like(x, dtype='int32') # [0, 0, 0] @@ -519,9 +509,6 @@ def full(shape, fill_value, dtype=None, name=None): def arange(start=0, end=None, step=1, dtype=None, name=None): """ - :alias_main: paddle.arange - :alias: paddle.tensor.arange, paddle.tensor.creation.arange - This OP returns a 1-D Tensor with spaced values within a given interval. 
Values are generated into the half-open interval [``start``, ``end``) with @@ -552,33 +539,30 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): Returns: Tensor: A 1-D Tensor with values from the interval [``start``, ``end``) - taken with common difference ``step`` beginning from ``start``. Its - data type is set by ``dtype``. + taken with common difference ``step`` beginning from ``start``. Its + data type is set by ``dtype``. Raises: TypeError: If ``dtype`` is not int32, int64, float32, float64. - examples: - + Examples: .. code-block:: python - import paddle - - paddle.disable_static() + import paddle - out1 = paddle.arange(5) - # [0, 1, 2, 3, 4] + out1 = paddle.arange(5) + # [0, 1, 2, 3, 4] - out2 = paddle.arange(3, 9, 2.0) - # [3, 5, 7] + out2 = paddle.arange(3, 9, 2.0) + # [3, 5, 7] - # use 4.999 instead of 5.0 to avoid floating point rounding errors - out3 = paddle.arange(4.999, dtype='float32') - # [0., 1., 2., 3., 4.] + # use 4.999 instead of 5.0 to avoid floating point rounding errors + out3 = paddle.arange(4.999, dtype='float32') + # [0., 1., 2., 3., 4.] - start_var = paddle.to_tensor([3]) - out4 = paddle.arange(start_var, 7) - # [3, 4, 5, 6] + start_var = paddle.to_tensor([3]) + out4 = paddle.arange(start_var, 7) + # [3, 4, 5, 6] """ if dtype is None: diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 7e4d3d7bf9279b..934008dc969f16 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -252,16 +252,14 @@ def standard_normal(shape, dtype=None, name=None): import paddle - paddle.disable_static() - # example 1: attr shape is a list which doesn't contain Tensor. out1 = paddle.standard_normal(shape=[2, 3]) # [[-2.923464 , 0.11934398, -0.51249987], # random # [ 0.39632758, 0.08177969, 0.2692008 ]] # random # example 2: attr shape is a list which contains Tensor. - dim1 = paddle.full([1], 2, "int64") - dim2 = paddle.full([1], 3, "int32") + dim1 = paddle.to_tensor([2], 'int64') + dim2 = paddle.to_tensor([3], 'int32') out2 = paddle.standard_normal(shape=[dim1, dim2, 2]) # [[[-2.8852394 , -0.25898588], # random # [-0.47420555, 0.17683524], # random @@ -272,8 +270,7 @@ def standard_normal(shape, dtype=None, name=None): # example 3: attr shape is a Tensor, the data type must be int64 or int32. shape_tensor = paddle.to_tensor([2, 3]) - result_3 = paddle.standard_normal(shape_tensor) - + out3 = paddle.standard_normal(shape_tensor) # [[-2.878077 , 0.17099959, 0.05111201] # random # [-0.3761474, -1.044801 , 1.1870178 ]] # random @@ -281,7 +278,58 @@ def standard_normal(shape, dtype=None, name=None): return gaussian(shape=shape, mean=0.0, std=1.0, dtype=dtype, name=name) -randn = standard_normal +def randn(shape, dtype=None, name=None): + """ + This OP returns a Tensor filled with random values sampled from a standard + normal distribution with mean 0 and standard deviation 1, with ``shape`` + and ``dtype``. + + Args: + shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape`` + is a list or tuple, the elements of it should be integers or Tensors + (with the shape [1], and the data type int32 or int64). If ``shape`` + is a Tensor, it should be a 1-D Tensor(with the data type int32 or + int64). + dtype (str|np.dtype, optional): The data type of the output Tensor. + Supported data types: float32, float64. + Default is None, use global default dtype (see ``get_default_dtype`` + for details). + name (str, optional): Name for the operation (optional, default is None). 
+ For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: A Tensor filled with random values sampled from a standard + normal distribution with mean 0 and standard deviation 1, with + ``shape`` and ``dtype``. + + Examples: + .. code-block:: python + + import paddle + + # example 1: attr shape is a list which doesn't contain Tensor. + out1 = paddle.randn(shape=[2, 3]) + # [[-2.923464 , 0.11934398, -0.51249987], # random + # [ 0.39632758, 0.08177969, 0.2692008 ]] # random + + # example 2: attr shape is a list which contains Tensor. + dim1 = paddle.to_tensor([2], 'int64') + dim2 = paddle.to_tensor([3], 'int32') + out2 = paddle.randn(shape=[dim1, dim2, 2]) + # [[[-2.8852394 , -0.25898588], # random + # [-0.47420555, 0.17683524], # random + # [-0.7989969 , 0.00754541]], # random + # [[ 0.85201347, 0.32320443], # random + # [ 1.1399018 , 0.48336947], # random + # [ 0.8086993 , 0.6868893 ]]] # random + + # example 3: attr shape is a Tensor, the data type must be int64 or int32. + shape_tensor = paddle.to_tensor([2, 3]) + out3 = paddle.randn(shape_tensor) + # [[-2.878077 , 0.17099959, 0.05111201] # random + # [-0.3761474, -1.044801 , 1.1870178 ]] # random + """ + return standard_normal(shape, dtype, name) def normal(mean=0.0, std=1.0, shape=None, name=None): @@ -322,8 +370,6 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): import paddle - paddle.disable_static() - out1 = paddle.normal(shape=[2, 3]) # [[ 0.17501129 0.32364586 1.561118 ] # random # [-1.7232178 1.1545963 -0.76156676]] # random @@ -381,7 +427,7 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): Examples: - :: + .. code-block:: text Input: shape = [1, 2] @@ -423,33 +469,27 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): import paddle - paddle.disable_static() - # example 1: # attr shape is a list which doesn't contain Tensor. - result_1 = paddle.tensor.random.uniform(shape=[3, 4]) - # [[ 0.84524226, 0.6921872, 0.56528175, 0.71690357], - # [-0.34646994, -0.45116323, -0.09902662, -0.11397249], - # [ 0.433519, 0.39483607, -0.8660099, 0.83664286]] + out1 = paddle.uniform(shape=[3, 4]) + # [[ 0.84524226, 0.6921872, 0.56528175, 0.71690357], # random + # [-0.34646994, -0.45116323, -0.09902662, -0.11397249], # random + # [ 0.433519, 0.39483607, -0.8660099, 0.83664286]] # random # example 2: # attr shape is a list which contains Tensor. - dim_1 = paddle.full([1], 2, "int64") - dim_2 = paddle.full([1], 3, "int32") - result_2 = paddle.tensor.random.uniform(shape=[dim_1, dim_2]) - # [[-0.9951253, 0.30757582, 0.9899647 ], - # [ 0.5864527, 0.6607096, -0.8886161 ]] + dim1 = paddle.to_tensor([2], 'int64') + dim2 = paddle.to_tensor([3], 'int32') + out2 = paddle.uniform(shape=[dim1, dim2]) + # [[-0.9951253, 0.30757582, 0.9899647 ], # random + # [ 0.5864527, 0.6607096, -0.8886161]] # random # example 3: # attr shape is a Tensor, the data type must be int64 or int32. 
shape_tensor = paddle.to_tensor([2, 3]) - result_3 = paddle.tensor.random.uniform(shape_tensor) - # if shape_tensor's value is [2, 3] - # result_3 is: - # [[-0.8517412, -0.4006908, 0.2551912 ], - # [ 0.3364414, 0.36278176, -0.16085452]] - - + out3 = paddle.uniform(shape_tensor) + # [[-0.8517412, -0.4006908, 0.2551912 ], # random + # [ 0.3364414, 0.36278176, -0.16085452]] # random """ if dtype is None: dtype = paddle.framework.get_default_dtype() @@ -517,8 +557,6 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): import paddle - paddle.disable_static() - # example 1: # attr shape is a list which doesn't contain Tensor. out1 = paddle.randint(low=-5, high=5, shape=[3]) @@ -526,18 +564,16 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): # example 2: # attr shape is a list which contains Tensor. - dim1 = paddle.full([1], 2, "int64") - dim2 = paddle.full([1], 3, "int32") - out2 = paddle.randint(low=-5, high=5, shape=[dim1, dim2], dtype="int32") + dim1 = paddle.to_tensor([2], 'int64') + dim2 = paddle.to_tensor([3], 'int32') + out2 = paddle.randint(low=-5, high=5, shape=[dim1, dim2]) # [[0, -1, -3], # random # [4, -2, 0]] # random # example 3: # attr shape is a Tensor - shape_tensor = paddle.to_tensor(3) - result_3 = paddle.randint(low=-5, high=5, shape=shape_tensor) - + out3 = paddle.randint(low=-5, high=5, shape=shape_tensor) # [-2, 2, 3] # random # example 4: @@ -611,8 +647,6 @@ def randperm(n, dtype="int64", name=None): import paddle - paddle.disable_static() - out1 = paddle.randperm(5) # [4, 1, 2, 3, 0] # random @@ -668,15 +702,14 @@ def rand(shape, dtype=None, name=None): import paddle - paddle.disable_static() # example 1: attr shape is a list which doesn't contain Tensor. out1 = paddle.rand(shape=[2, 3]) # [[0.451152 , 0.55825245, 0.403311 ], # random # [0.22550228, 0.22106001, 0.7877319 ]] # random # example 2: attr shape is a list which contains Tensor. - dim1 = paddle.full([1], 2, "int64") - dim2 = paddle.full([1], 3, "int32") + dim1 = paddle.to_tensor([2], 'int64') + dim2 = paddle.to_tensor([3], 'int32') out2 = paddle.rand(shape=[dim1, dim2, 2]) # [[[0.8879919 , 0.25788337], # random # [0.28826773, 0.9712097 ], # random @@ -687,8 +720,7 @@ def rand(shape, dtype=None, name=None): # example 3: attr shape is a Tensor, the data type must be int64 or int32. shape_tensor = paddle.to_tensor([2, 3]) - result_3 = paddle.rand(shape_tensor) - + out3 = paddle.rand(shape_tensor) # [[0.22920267, 0.841956 , 0.05981819], # random # [0.4836288 , 0.24573246, 0.7516129 ]] # random diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 3873d893bd7c34..9e565d4e5223cd 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -56,17 +56,13 @@ def mean(x, axis=None, keepdim=False, name=None): .. code-block:: python import paddle - import numpy as np - paddle.disable_static() - - x = np.array([[[1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12]], - [[13, 14, 15, 16], - [17, 18, 19, 20], - [21, 22, 23, 24]]], 'float32') - x = paddle.to_tensor(x) + x = paddle.to_tensor([[[1., 2., 3., 4.], + [5., 6., 7., 8.], + [9., 10., 11., 12.]], + [[13., 14., 15., 16.], + [17., 18., 19., 20.], + [21., 22., 23., 24.]]]) out1 = paddle.mean(x) # [12.5] out2 = paddle.mean(x, axis=-1) @@ -145,12 +141,8 @@ def var(x, axis=None, unbiased=True, keepdim=False, name=None): .. 
code-block:: python import paddle - import numpy as np - - paddle.disable_static() - x = np.array([[1.0, 2.0, 3.0], [1.0, 4.0, 5.0]]) - x = paddle.to_tensor(x) + x = paddle.to_tensor([[1.0, 2.0, 3.0], [1.0, 4.0, 5.0]]) out1 = paddle.var(x) # [2.66666667] out2 = paddle.var(x, axis=1) @@ -208,12 +200,8 @@ def std(x, axis=None, unbiased=True, keepdim=False, name=None): .. code-block:: python import paddle - import numpy as np - - paddle.disable_static() - x = np.array([[1.0, 2.0, 3.0], [1.0, 4.0, 5.0]]) - x = paddle.to_tensor(x) + x = paddle.to_tensor([[1.0, 2.0, 3.0], [1.0, 4.0, 5.0]]) out1 = paddle.std(x) # [1.63299316] out2 = paddle.std(x, axis=1) From 65aac81191c00dcbe79cd191f595be942a8bb749 Mon Sep 17 00:00:00 2001 From: cc <52520497+juncaipeng@users.noreply.github.com> Date: Tue, 17 Nov 2020 10:07:30 +0800 Subject: [PATCH 28/56] Fix fake_quant error when cout > 1024, test=develop (#28603) --- paddle/fluid/operators/fake_dequantize_op.cu | 16 ++++----- paddle/fluid/operators/fake_quantize_op.cu | 34 ++++++++++++++------ 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index 54a92b055a39d4..a89c430c7ab24e 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ b/paddle/fluid/operators/fake_dequantize_op.cu @@ -62,14 +62,14 @@ __global__ void DequantizeOneScaleQuantAxis1(const T* in, const T* scale, T max_range, const int num, const int cin, const int cout, T* out) { - int cout_wh_size = num / cin; - int wh_size = cout_wh_size / cout; + int bid = blockIdx.x; + T s = scale[bid % cout]; - T s = scale[blockIdx.x]; - const T* in_current = in + threadIdx.x * cout_wh_size + blockIdx.x * wh_size; - T* out_current = out + threadIdx.x * cout_wh_size + blockIdx.x * wh_size; + int wh_size = num / (cin * cout); + const T* in_current = in + bid * wh_size; + T* out_current = out + bid * wh_size; - for (int i = 0; i < wh_size; i++) { + for (int i = threadIdx.x; i < wh_size; i += blockDim.x) { out_current[i] = in_current[i] * s / max_range; } } @@ -107,8 +107,8 @@ struct ChannelDequantizeFunctor { in_data, scale_factor, max_range, num, in_dims[0], out_data); } else if (quant_axis == 1) { // Dequantize weight of Cin * Cout * W * H - int grid = in_dims[1]; - int block = in_dims[0]; + int grid = in_dims[0] * in_dims[1]; + int block = 1024; DequantizeOneScaleQuantAxis1<<>>( in_data, scale_factor, max_range, num, in_dims[0], in_dims[1], out_data); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 8bc14dde863682..26dcf8bf39cf28 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -131,7 +131,7 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n, } __syncthreads(); } - if (tid == 0) { + if (tid == 0 && shared_max_data[0] > out[bid]) { out[bid] = shared_max_data[0]; } } @@ -148,20 +148,36 @@ struct FindChannelAbsMaxFunctor { quant_axis)); const int num = in_tensor.numel(); auto in_dims = in_tensor.dims(); - int channel = in_dims[quant_axis]; const T* in_data = in_tensor.data(); if (quant_axis == 0) { - int grid = channel; + int cout = in_dims[0]; + int grid = cout; int block = 1024; FindChannelAbsMaxKernelQuantAxis0< T><<>>( - in_data, num, channel, out_abs_max); + in_data, num, cout, out_abs_max); } else if (quant_axis == 1) { - int grid = in_dims[1]; - int block = in_dims[0]; - FindChannelAbsMaxKernelQuantAxis1< - T><<>>( - in_data, num, in_dims[0], in_dims[1], 
out_abs_max); + int cin = in_dims[0]; + int cout = in_dims[1]; + int grid = cout; + int max_threads = 1024; + + cudaMemset(out_abs_max, 0, sizeof(T) * cout); + + for (int i = 0; i < cin / max_threads; i++) { + int block = max_threads; + FindChannelAbsMaxKernelQuantAxis1< + T><<>>( + in_data, num, cin, cout, out_abs_max); + in_data += num / cin; + } + + int block = cin % max_threads; + if (block > 0) { + FindChannelAbsMaxKernelQuantAxis1< + T><<>>( + in_data, num, in_dims[0], in_dims[1], out_abs_max); + } } } }; From 68ee7f731250664077be675aac49b358ee93ed78 Mon Sep 17 00:00:00 2001 From: wangchaochaohu Date: Tue, 17 Nov 2020 10:15:04 +0800 Subject: [PATCH 29/56] fix overwrite for gather OP of API2.0(#28659) --- python/paddle/tensor/manipulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index adad9cfdc26671..060f9a1a919041 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -786,7 +786,7 @@ def gather(x, index, axis=None, name=None): axis = 0 axis_tensor = axis if not isinstance(axis, Variable) and axis == 0: - return paddle.fluid.layers.gather(input=x, index=index, overwrite=True) + return paddle.fluid.layers.gather(input=x, index=index, overwrite=False) if not isinstance(axis, Variable): with device_guard("cpu"): axis_tensor = fill_constant( From 57dab959ca53a42ad1bccd8fd0344f32e2074a13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Tue, 17 Nov 2020 10:30:49 +0800 Subject: [PATCH 30/56] add datanorm op new scale_w register (#28657) Co-authored-by: yaoxuefeng6 --- paddle/fluid/operators/data_norm_op.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index 5df2bbdf95144d..45e77a99e6b3eb 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -19,6 +19,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -755,3 +756,10 @@ REGISTER_OP_CPU_KERNEL( data_norm_grad, ops::DataNormGradKernel, ops::DataNormGradKernel); +REGISTER_OP_VERSION(data_norm) + .AddCheckpoint( + R"ROC( + upgrad data_norm op by adding scale_w to support scale and shift.)ROC", + paddle::framework::compatible::OpVersionDesc().NewInput( + "scale_w", + "scale_w is used to do scale duirng data_norm like batchnorm ")); From 8040fa2bca72224c66ba6700dcc7e8ae79ea0554 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 17 Nov 2020 11:43:29 +0800 Subject: [PATCH 31/56] Fix output dtype inconsistent with input (#28649) * fix output dtyp inconsistent with input * refine code --- python/paddle/fluid/tests/unittests/test_gather_op.py | 9 +++++++++ python/paddle/tensor/manipulation.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py index 2e4b52c282d567..946027a22f8838 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_op.py @@ -19,6 +19,7 @@ from op_test import OpTest import paddle import paddle.fluid as fluid +from paddle.framework import core def gather_numpy(x, index, axis): @@ -298,5 +299,13 @@ def test_index_type(): self.assertRaises(TypeError, test_index_type) +class TestCheckOutType(unittest.TestCase): + def test_out_type(self): + data = paddle.static.data(shape=[16, 10], dtype='int64', name='x') + index = paddle.static.data(shape=[4], dtype='int64', name='index') + out = paddle.gather(data, index) + self.assertTrue(out.dtype == core.VarDesc.VarType.INT64) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 060f9a1a919041..bdda90315ac9c7 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -804,7 +804,7 @@ def gather(x, index, axis=None, name=None): check_type(axis, 'axis', (int), 'gather') helper = LayerHelper('gather', **locals()) - dtype = helper.input_dtype() + dtype = helper.input_dtype('x') out = helper.create_variable_for_type_inference(dtype) helper.append_op( type="gather", From d71c3463b04c345520be6e14736c674c030d2d06 Mon Sep 17 00:00:00 2001 From: Double_V Date: Tue, 17 Nov 2020 13:20:30 +0800 Subject: [PATCH 32/56] fix pool exclusive and delete disable_static (#28655) * fix pool exclusive and delete disable_static, test=develop * fix pool1d exclusive, test=develop --- python/paddle/nn/functional/pooling.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 40166f4d36e94e..829056f5767d7c 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -200,7 +200,6 @@ def avg_pool1d(x, .. 
code-block:: python import paddle import paddle.nn.functional as F - paddle.disable_static() data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) out = F.avg_pool1d(data, kernel_size=2, stride=2, padding=0) # out shape: [1, 3, 16] @@ -253,7 +252,7 @@ def avg_pool1d(x, "use_cudnn": True, "ceil_mode": ceil_mode, "use_mkldnn": False, - "exclusive": not exclusive, + "exclusive": exclusive, "data_format": data_format, }) @@ -314,7 +313,6 @@ def avg_pool2d(x, import paddle import paddle.nn.functional as F import numpy as np - paddle.disable_static() # avg pool2d x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) out = F.avg_pool2d(x, @@ -365,7 +363,7 @@ def avg_pool2d(x, "use_cudnn": True, "ceil_mode": ceil_mode, "use_mkldnn": False, - "exclusive": not exclusive, + "exclusive": exclusive, "data_format": data_format, }) @@ -481,7 +479,7 @@ def avg_pool3d(x, "use_cudnn": True, "ceil_mode": ceil_mode, "use_mkldnn": False, - "exclusive": not exclusive, + "exclusive": exclusive, "data_format": data_format, }) @@ -538,7 +536,6 @@ def max_pool1d(x, .. code-block:: python import paddle import paddle.nn.functional as F - paddle.disable_static() data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0) # pool_out shape: [1, 3, 16] @@ -661,7 +658,6 @@ def max_pool2d(x, import paddle import paddle.nn.functional as F import numpy as np - paddle.disable_static() # max pool2d x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) out = F.max_pool2d(x, @@ -791,7 +787,7 @@ def max_pool3d(x, import paddle import paddle.nn.functional as F import numpy as np - paddle.disable_static() + # max pool3d x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) output = F.max_pool2d(x, @@ -905,7 +901,7 @@ def adaptive_avg_pool1d(x, output_size, name=None): # import paddle import paddle.nn.functional as F - paddle.disable_static() + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) pool_out = F.adaptive_average_pool1d(data, output_size=16) # pool_out shape: [1, 3, 16]) @@ -982,7 +978,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): # import paddle import numpy as np - paddle.disable_static() + input_data = np.random.rand(2, 3, 32, 32) x = paddle.to_tensor(input_data) # x.shape is [2, 3, 32, 32] @@ -1086,7 +1082,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) import paddle import numpy as np - paddle.disable_static() + input_data = np.random.rand(2, 3, 8, 32, 32) x = paddle.to_tensor(input_data) # x.shape is [2, 3, 8, 32, 32] @@ -1186,7 +1182,7 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None): # import paddle import paddle.nn.functional as F - paddle.disable_static() + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) pool_out = F.adaptive_max_pool1d(data, output_size=16) # pool_out shape: [1, 3, 16]) @@ -1266,7 +1262,7 @@ def adaptive_max_pool2d(x, output_size, return_mask=False, name=None): # import paddle import numpy as np - paddle.disable_static() + input_data = np.random.rand(2, 3, 32, 32) x = paddle.to_tensor(input_data) # x.shape is [2, 3, 32, 32] @@ -1356,7 +1352,7 @@ def adaptive_max_pool3d(x, output_size, return_mask=False, name=None): # import paddle import numpy as np - paddle.disable_static() + input_data = 
np.random.rand(2, 3, 8, 32, 32) x = paddle.to_tensor(input_data) # x.shape is [2, 3, 8, 32, 32] From bf143652ac104f3b4ad3a5084bb91c4681a16140 Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Tue, 17 Nov 2020 13:38:16 +0800 Subject: [PATCH 33/56] fix lstm OP compile error on windows (#28667) * add unittest and check unittest for windows * fix lstm OP compile error on windows --- CMakeLists.txt | 11 +---------- paddle/fluid/operators/CMakeLists.txt | 3 ++- paddle/scripts/paddle_build.bat | 7 ++++++- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2faa0a2bbbcb3f..12f5b6f8bd8976 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,7 +81,7 @@ if(WIN32) CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") - set(${flag_var} "${${flag_var}} /MP /bigobj") + set(${flag_var} "${${flag_var}} /MP") endforeach(flag_var) foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) set(${flag_var} "${${flag_var}} /w") @@ -96,15 +96,6 @@ if(WIN32) endif() endforeach(flag_var) - foreach(flag_var - CMAKE_STATIC_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS - CMAKE_EXE_LINKER_FLAGS) - set(${flag_var} "${${flag_var}} /IGNORE:4006 /IGNORE:4098 /ignore:4049 /IGNORE:4217 /IGNORE:4221") - if(${flag_var} MATCHES "/INCREMENTAL" AND NOT ${flag_var} MATCHES "/INCREMENTAL:NO") - string(REGEX REPLACE "/INCREMENTAL" "/INCREMENTAL:NO" ${flag_var} "${${flag_var}}") - endif() - endforeach(flag_var) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") else(WIN32) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index ca80ada7b6ea78..3b9d3e7e9374e6 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -64,7 +64,7 @@ if(WITH_COVERAGE OR WIN32 OR WITH_NV_JETSON) SET(OP_MKL_DEPS ${OP_MKL_DEPS} pyramid_hash_op) endif() -register_operators(EXCLUDES py_func_op warpctc_op dgc_op +register_operators(EXCLUDES py_func_op warpctc_op dgc_op lstm_op sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) if (WITH_GPU) @@ -79,6 +79,7 @@ if (WITH_GPU) else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() +op_library(lstm_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS} lstm_compute) set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 450cb7546fd4c3..ff5562a25096fd 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -265,7 +265,12 @@ echo Build third_party successfully! 
set build_times=1 :build_paddle echo Build Paddle the %build_times% time: -msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:minimal paddle.sln +if "%WITH_GPU%"=="OFF" ( + msbuild /m:%PARALLEL_PROJECT_COUNT% /p:Configuration=Release /verbosity:minimal paddle.sln +) else ( + msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:minimal paddle.sln +) + if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 if %build_times% GTR 1 ( From 82f0b5ea5c6627c6d60120c65db945d690c4f450 Mon Sep 17 00:00:00 2001 From: littletomatodonkey <2120160898@bit.edu.cn> Date: Tue, 17 Nov 2020 13:57:32 +0800 Subject: [PATCH 34/56] adapt pad const (#28585) * adapt pad const * fix comment and rm fluid import * rm stdout * fix note --- .../fluid/tests/unittests/test_pad3d_op.py | 31 ++++++++++++++++++- python/paddle/nn/functional/common.py | 6 ++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_pad3d_op.py b/python/paddle/fluid/tests/unittests/test_pad3d_op.py index c29352bb51af68..88d3d80a14c78a 100644 --- a/python/paddle/fluid/tests/unittests/test_pad3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pad3d_op.py @@ -251,7 +251,9 @@ def _get_numpy_out(self, mode, value=0, data_format="NCDHW"): - if data_format == "NCDHW": + if mode == "constant" and len(pad) == len(input_data.shape) * 2: + pad = np.reshape(pad, (-1, 2)).tolist() + elif data_format == "NCDHW": pad = [ (0, 0), (0, 0), @@ -316,6 +318,7 @@ def test_dygraph_1(self): paddle.disable_static() input_shape = (1, 2, 3, 4, 5) pad = [1, 2, 1, 1, 3, 4] + pad_3 = [1, 2, 1, 1, 3, 4, 5, 6, 7, 8] mode = "constant" value = 100 input_data = np.random.rand(*input_shape).astype(np.float32) @@ -323,6 +326,8 @@ def test_dygraph_1(self): input_data, pad, mode, value, data_format="NCDHW") np_out2 = self._get_numpy_out( input_data, pad, mode, value, data_format="NDHWC") + np_out3 = self._get_numpy_out( + input_data, pad_3, mode, value, data_format="NCDHW") tensor_data = paddle.to_tensor(input_data) y1 = F.pad(tensor_data, @@ -335,14 +340,21 @@ def test_dygraph_1(self): mode=mode, value=value, data_format="NDHWC") + y3 = F.pad(tensor_data, + pad=pad_3, + mode=mode, + value=value, + data_format="NCDHW") self.assertTrue(np.allclose(y1.numpy(), np_out1)) self.assertTrue(np.allclose(y2.numpy(), np_out2)) + self.assertTrue(np.allclose(y3.numpy(), np_out3)) def test_dygraph_2(self): paddle.disable_static() input_shape = (2, 3, 4, 5) pad = [1, 1, 3, 4] + pad_3 = [1, 2, 1, 1, 3, 4, 5, 6] mode = "constant" value = 100 input_data = np.random.rand(*input_shape).astype(np.float32) @@ -350,6 +362,8 @@ def test_dygraph_2(self): input_data, pad, mode, value, data_format="NCHW") np_out2 = self._get_numpy_out( input_data, pad, mode, value, data_format="NHWC") + np_out3 = self._get_numpy_out( + input_data, pad_3, mode, value, data_format="NCHW") tensor_data = paddle.to_tensor(input_data) tensor_pad = paddle.to_tensor(pad, dtype="int32") @@ -364,14 +378,21 @@ def test_dygraph_2(self): mode=mode, value=value, data_format="NHWC") + y3 = F.pad(tensor_data, + pad=pad_3, + mode=mode, + value=value, + data_format="NCHW") self.assertTrue(np.allclose(y1.numpy(), np_out1)) self.assertTrue(np.allclose(y2.numpy(), np_out2)) + self.assertTrue(np.allclose(y3.numpy(), np_out3)) def test_dygraph_3(self): paddle.disable_static() 
input_shape = (3, 4, 5) pad = [3, 4] + pad_3 = [3, 4, 5, 6, 7, 8] mode = "constant" value = 100 input_data = np.random.rand(*input_shape).astype(np.float32) @@ -379,6 +400,8 @@ def test_dygraph_3(self): input_data, pad, mode, value, data_format="NCL") np_out2 = self._get_numpy_out( input_data, pad, mode, value, data_format="NLC") + np_out3 = self._get_numpy_out( + input_data, pad_3, mode, value, data_format="NCL") tensor_data = paddle.to_tensor(input_data) tensor_pad = paddle.to_tensor(pad, dtype="int32") @@ -392,9 +415,15 @@ def test_dygraph_3(self): mode=mode, value=value, data_format="NLC") + y3 = F.pad(tensor_data, + pad=pad_3, + mode=mode, + value=value, + data_format="NCL") self.assertTrue(np.allclose(y1.numpy(), np_out1)) self.assertTrue(np.allclose(y2.numpy(), np_out2)) + self.assertTrue(np.allclose(y3.numpy(), np_out3)) class TestPad1dAPI(unittest.TestCase): diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 1cf3599e846b95..5c5e3f37916da1 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1158,6 +1158,9 @@ def alpha_dropout(x, p=0.5, training=True, name=None): def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): """ Pad tensor according to 'pad' and 'mode'. + If mode is 'constant' and length of pad is twice as length of x dimension, + then the padding will be started from the first dimension and moved back onto x + according to 'pad' and 'value'. If mode is 'reflect', pad[0] and pad[1] must be no greater than width-1. The height and depth dimension has the same condition. @@ -1273,6 +1276,9 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): unsqueezed_dim = [] + if mode == "constant" and isinstance(pad, list) and len(pad) == x_dim * 2: + return layers.pad(x, pad, pad_value=value) + if isinstance(pad, Variable): if data_format in ["NCL", "NCHW", "NCDHW"]: data_format = "NCDHW" From 912a5c30b44bd6d254093ee815c0596dbb94e1cd Mon Sep 17 00:00:00 2001 From: wangchaochaohu Date: Tue, 17 Nov 2020 14:36:52 +0800 Subject: [PATCH 35/56] fix the matmul_v2 test for cuda11 (#28635) --- python/paddle/fluid/tests/unittests/test_matmul_v2_op.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index 640771df23b726..a6667db6227f98 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -148,8 +148,8 @@ class TestMatMuklOp6(TestMatMulV2Op): """ def config(self): - self.x_shape = (1, 2, 100, 1) - self.y_shape = (100, ) + self.x_shape = (1, 2, 102, 1) + self.y_shape = (102, ) self.trans_x = True self.trans_y = False From 80d2024644da4a02387c6542e508b8369d7e2efc Mon Sep 17 00:00:00 2001 From: lilong12 Date: Tue, 17 Nov 2020 16:30:26 +0800 Subject: [PATCH 36/56] bug fix, test=develop (#28674) --- paddle/fluid/platform/dynload/nccl.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/paddle/fluid/platform/dynload/nccl.cc b/paddle/fluid/platform/dynload/nccl.cc index 2c40c48ee08497..cfc98561e87e9c 100644 --- a/paddle/fluid/platform/dynload/nccl.cc +++ b/paddle/fluid/platform/dynload/nccl.cc @@ -25,6 +25,14 @@ void *nccl_dso_handle; NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); +#if NCCL_VERSION_CODE >= 2212 +NCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) +#endif + +#if NCCL_VERSION_CODE >= 2703 +NCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) +#endif + } // 
namespace dynload } // namespace platform } // namespace paddle From cdc4e6620d8eb91a98c3aa5b440c369a18752f8f Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Tue, 17 Nov 2020 17:24:38 +0800 Subject: [PATCH 37/56] fix lenet num classes (#28642) --- python/paddle/vision/models/lenet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/vision/models/lenet.py b/python/paddle/vision/models/lenet.py index 119be85db54b90..2fb50fc17b9e9f 100644 --- a/python/paddle/vision/models/lenet.py +++ b/python/paddle/vision/models/lenet.py @@ -49,7 +49,8 @@ def __init__(self, num_classes=10): if num_classes > 0: self.fc = nn.Sequential( - nn.Linear(400, 120), nn.Linear(120, 84), nn.Linear(84, 10)) + nn.Linear(400, 120), + nn.Linear(120, 84), nn.Linear(84, num_classes)) def forward(self, inputs): x = self.features(inputs) From 6d8d3d4c22ba0bbed57912ca831a26e5340d1c92 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Tue, 17 Nov 2020 11:59:10 +0100 Subject: [PATCH 38/56] [oneDNN] Layer norm bf16 kernel (#28619) --- .../framework/ir/graph_pattern_detector.cc | 4 +- paddle/fluid/operators/layer_norm_op.cc | 35 ++++ .../operators/mkldnn/layer_norm_mkldnn_op.cc | 177 ++++++++++++++++++ paddle/fluid/platform/mkldnn_reuse.h | 6 + .../mkldnn/test_layer_norm_bf16_mkldnn_op.py | 146 +++++++++++++++ .../mkldnn/test_layer_norm_mkldnn_op.py | 151 +++++++++++++++ .../mkldnn/test_sum_bf16_mkldnn_op.py | 2 +- .../tests/unittests/test_layer_norm_op.py | 11 +- tools/static_mode_white_list.py | 2 + 9 files changed, 528 insertions(+), 6 deletions(-) create mode 100644 paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_layer_norm_bf16_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_layer_norm_mkldnn_op.py diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 5704dd09cf287e..5546a0e372603e 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2102,8 +2102,8 @@ PDNode *patterns::Bfloat16Placement::operator()( const std::unordered_set &bfloat16_enabled_op_types) { std::unordered_set supported_op_types = std::unordered_set({"concat", "conv2d", "fusion_gru", "gelu", - "reshape2", "softmax", "sum", - "transpose2"}); + "layer_norm", "reshape2", "softmax", + "sum", "transpose2"}); if (!bfloat16_enabled_op_types.empty()) { supported_op_types = bfloat16_enabled_op_types; } diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index 89d8b57505da24..79e3d3b90a93ae 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -15,6 +15,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/layer_norm_op.h" #include +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace operators { @@ -91,6 +95,25 @@ class LayerNormOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Variance", {left}); ctx->ShareLoD("X", "Y"); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } +#endif + + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), + layout, library); + } }; class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { @@ -134,6 +157,18 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { "greater than zero. But received [%d].", begin_norm_axis)); }); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "bfloat16"}); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); AddComment(R"DOC( Assume feature vectors exist on dimensions diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc new file mode 100644 index 00000000000000..22261e948aa7b6 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -0,0 +1,177 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +template +class LayerNormMKLDNNHandler + : public platform::MKLDNNHandlerT { + public: + LayerNormMKLDNNHandler(const std::vector& dims, const float& epsilon, + const dnnl::normalization_flags& flags, + const bool& is_test, const MKLDNNMemoryFormat fmt, + const platform::MKLDNNDeviceContext& dev_ctx, + platform::Place cpu_place, + const std::string& uniq_name) + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(dims, uniq_name)) { + if (!this->isCached()) { + auto md = dnnl::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); + if (!is_test) { + // TODO(grygielski) Delete forcing stats_md after DNNL 1.2 is introduced + auto stats_md = dnnl::memory::desc( + {begin(dims), end(dims) - 1}, platform::MKLDNNGetDataType(), + platform::MKLDNNFormatForSize(dims.size() - 1, + MKLDNNMemoryFormat::nchw)); + this->AcquireForwardPrimitiveDescriptor( + dnnl::prop_kind::forward_training, md, stats_md, epsilon, flags); + } else { + this->AcquireForwardPrimitiveDescriptor( + dnnl::prop_kind::forward_inference, md, epsilon, flags); + } + } + } + + std::shared_ptr AcquireScaleShiftMemory() { + return this->AcquireMemoryFromPrimitive("@scaleshift_mem_p"); + } + + std::shared_ptr AcquireScaleShiftMemory( + std::vector& scaleshift_data) { + // scaleshift_data comes from temporary buffer so we need to copy it into + // created memory primitivie + auto scaleshift_mem = this->AcquireMemoryFromPrimitive( + this->fwd_pd_->weights_desc(), "@scaleshift_mem_p"); + auto data_ptr = scaleshift_mem->get_data_handle(); + std::size_t num_bytes = scaleshift_data.size() * sizeof(float); + std::memcpy(data_ptr, scaleshift_data.data(), num_bytes); + return scaleshift_mem; + } + + std::shared_ptr AcquireMeanMemory(framework::Tensor* mean) { + T* mean_data = mean->mutable_data(this->place_, + this->fwd_pd_->mean_desc().get_size()); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(), + mean_data, "@mean_mem_p"); + } + + std::shared_ptr AcquireVarianceMemory( + framework::Tensor* variance) { + T* variance_data = variance->mutable_data( + this->place_, this->fwd_pd_->variance_desc().get_size()); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(), + variance_data, "@variance_mem_p"); + } +}; + +template +class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* y = ctx.Output("Y"); + + const float epsilon = ctx.Attr("epsilon"); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + const bool is_test = ctx.Attr("is_test"); + + auto& dev_ctx = + ctx.template device_context(); + + auto src_tz = paddle::framework::vectorize(x->dims()); + PADDLE_ENFORCE_EQ(begin_norm_axis, (src_tz.size() - 1), + platform::errors::InvalidArgument( + "MKL-DNN Layer Norm supports only last logical " + "axis:%d as begin_norm_axis.", + (src_tz.size() - 1))); + + y->mutable_data(ctx.GetPlace()); + const bool with_scaleshift = (scale && bias); + dnnl::normalization_flags flags{}; + + if (with_scaleshift) { + flags |= dnnl::normalization_flags::use_scale_shift; + } + + LayerNormMKLDNNHandler handler(src_tz, epsilon, flags, is_test, + x->format(), dev_ctx, ctx.GetPlace(), + ctx.OutputName("Y")); + + auto src_memory = 
handler.AcquireSrcMemory(x); + auto dst_memory = handler.AcquireDstMemory(y); + + auto layer_norm_p = handler.AcquireForwardPrimitive(); + + dnnl::stream astream(dev_ctx.GetEngine()); + std::unordered_map args; + + args.insert({DNNL_ARG_SRC, *src_memory}); + args.insert({DNNL_ARG_DST, *dst_memory}); + + if (!is_test) { + auto* mean = ctx.Output("Mean"); + auto* var = ctx.Output("Variance"); + mean->mutable_data(ctx.GetPlace()); + var->mutable_data(ctx.GetPlace()); + + auto mean_memory = handler.AcquireMeanMemory(mean); + auto variance_memory = handler.AcquireVarianceMemory(var); + + args.insert({DNNL_ARG_MEAN, *mean_memory}); + args.insert({DNNL_ARG_VARIANCE, *variance_memory}); + } + + auto scaleshift_memory = handler.AcquireScaleShiftMemory(); + if (with_scaleshift) { + if (scaleshift_memory == nullptr || !is_test) { + auto scale_tz = paddle::framework::vectorize(scale->dims()); + const unsigned int C = scale_tz[0]; + + // MKLDNN requires a single piece of memory for scale and shift/bias + // data + std::vector scaleshift_data; + scaleshift_data.reserve(2 * C); + scaleshift_data.insert(scaleshift_data.begin(), scale->data(), + scale->data() + C); + + scaleshift_data.insert(scaleshift_data.end(), bias->data(), + bias->data() + C); + + scaleshift_memory = handler.AcquireScaleShiftMemory(scaleshift_data); + } + args.insert({DNNL_ARG_SCALE_SHIFT, *scaleshift_memory}); + } + + layer_norm_p->execute(astream, args); + astream.wait(); + + y->set_layout(DataLayout::kMKLDNN); + y->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } +}; + +} // namespace operators +} // namespace paddle + +// TODO(jczaja): Enable FP32 when performance is good +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(layer_norm, MKLDNN, ::paddle::platform::CPUPlace, + ops::LayerNormMKLDNNOpKernel); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 54f8cb1dc88428..8649b90321c13b 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -190,6 +190,12 @@ class MKLDNNHandlerT { } } + std::shared_ptr AcquireMemoryFromPrimitive( + const std::string& suffix) { + return std::static_pointer_cast( + dev_ctx_.GetBlob(key_ + suffix)); + } + std::shared_ptr AcquireMemoryFromPrimitive( mkldnn::memory::desc md, void* ptr, const std::string& suffix) { const auto local_key = key_ + suffix; diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_layer_norm_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_layer_norm_bf16_mkldnn_op.py new file mode 100644 index 00000000000000..dc881a57521124 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_layer_norm_bf16_mkldnn_op.py @@ -0,0 +1,146 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# from paddle.fluid.tests.unittests.test_layer_norm_op import * +from __future__ import print_function +import unittest +import numpy as np + +from operator import mul +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle import enable_static +from functools import reduce + +from paddle.fluid.tests.unittests.mkldnn.test_layer_norm_mkldnn_op import TestLayerNormMKLDNNOp +from paddle.fluid.tests.unittests.mkldnn.test_layer_norm_mkldnn_op import _reference_layer_norm_naive +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import _set_use_system_allocator + +np.random.random(123) + +_set_use_system_allocator(True) + + +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestLayerNormBF16MKLDNNOp(TestLayerNormMKLDNNOp): + def __assert_close(self, tensor, np_array, msg, rtol=2e-02, atol=2): + self.assertTrue( + np.allclose( + np.array(tensor), np_array, rtol=rtol, atol=atol), msg) + + def check_forward(self, + shape, + begin_norm_axis, + with_scale_bias=True, + with_is_test=False): + # attr + epsilon = 0.00001 + x_shape = shape + D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) + scale_shape = [D] + + np.random.seed(123) + x = np.random.random_sample(x_shape).astype(np.float32) + x_bf16 = convert_float_to_uint16(x) + + if with_scale_bias: + scale = np.random.random_sample(scale_shape).astype(np.float32) + bias = np.random.random_sample(scale_shape).astype(np.float32) + else: + scale = np.array([]) + bias = np.array([]) + + # reference forward & backward + y, mean, variance = _reference_layer_norm_naive(x, scale, bias, epsilon, + begin_norm_axis) + + y_bf16 = convert_float_to_uint16(y) + + var_dict = locals() + var_names = ['x_bf16', 'mean', 'variance', 'y_bf16'] + if with_scale_bias: + var_names.append('scale') + var_names.append('bias') + ground_truth = {name: var_dict[name] for name in var_names} + + program = fluid.Program() + with fluid.program_guard(program): + block = program.global_block() + + # scale and bias are fp32 and other vars are of bf16 + for name in ground_truth: + if name == 'x_bf16' or name == 'y_bf16': + block.create_var( + name=name, + dtype='uint16', + shape=ground_truth[name].shape) + else: + block.create_var( + name=name, + dtype='float32', + shape=ground_truth[name].shape) + + inputs = {"X": block.var('x_bf16')} + if with_scale_bias: + inputs["Scale"] = block.var('scale') + inputs["Bias"] = block.var('bias') + + block.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": block.var('y_bf16'), + "Mean": block.var('mean'), # share the same memory + "Variance": block.var('variance'), # share the same memory + }, + attrs={ + "epsilon": epsilon, + "begin_norm_axis": begin_norm_axis, + "use_mkldnn": True, + "is_test": with_is_test + }) + + exe = fluid.Executor(core.CPUPlace()) + + input_list = ['x_bf16'] + if with_scale_bias: + input_list.append('scale') + input_list.append('bias') + + out = exe.run(program, + feed={name: var_dict[name] + for name in input_list}, + fetch_list=['y_bf16', 'mean', 'variance']) + self.__assert_close(y_bf16, out[0], "y_bf16", 2) + if not with_is_test: + self.__assert_close(mean, out[1], "mean") + self.__assert_close(variance, out[2], "variance", 1e-3) + + def test_check_forward_with_is_test(self): + self.check_forward( + shape=[2, 3, 4, 5], begin_norm_axis=3, with_is_test=True) + + # TODO (jczaja): Enable those to test when enabling training using bf16 + def 
test_check_forward_with_scale_and_bias(self): + pass + + def test_check_forward_without_scale_and_bias(self): + pass + + +if __name__ == "__main__": + enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_layer_norm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_layer_norm_mkldnn_op.py new file mode 100644 index 00000000000000..d20fb003ee93b4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_layer_norm_mkldnn_op.py @@ -0,0 +1,151 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# from paddle.fluid.tests.unittests.test_layer_norm_op import * +from __future__ import print_function +import unittest +import numpy as np + +from operator import mul +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle import enable_static +from functools import reduce + +from paddle.fluid.tests.unittests.op_test import _set_use_system_allocator + +np.random.random(123) + +_set_use_system_allocator(True) + + +def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1): + x_shape = x.shape + N = reduce(mul, x_shape[0:begin_norm_axis], 1) + D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) + x.shape = [N, D] + if scale.size == 0 and beta.size == 0: + scale = np.ones([1, D]) + beta = np.zeros([1, D]) + else: + scale = scale.reshape([1, D]) + beta = beta.reshape([1, D]) + + mean = np.mean(x, axis=1) + var = np.var(x, axis=1) + epsilon + output = scale * np.divide((x - mean.reshape([N, 1])), + (np.sqrt(var)).reshape([N, 1])) + beta + + x.shape, output.shape = x_shape, x_shape + return output, mean, var + + +class TestLayerNormMKLDNNOp(unittest.TestCase): + def setUp(self): + self.use_mkldnn = True + + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) + + def check_forward(self, + shape, + begin_norm_axis, + with_scale_bias=True, + with_is_test=False): + # attr + epsilon = 0.00001 + x_shape = shape + D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) + scale_shape = [D] + + np.random.seed(123) + x = np.random.random_sample(x_shape).astype(np.float32) + + if with_scale_bias: + scale = np.random.random_sample(scale_shape).astype(np.float32) + bias = np.random.random_sample(scale_shape).astype(np.float32) + else: + scale = np.array([]) + bias = np.array([]) + + # reference forward & backward + y, mean, variance = _reference_layer_norm_naive(x, scale, bias, epsilon, + begin_norm_axis) + + var_dict = locals() + var_names = ['x', 'mean', 'variance', 'y'] + if with_scale_bias: + var_names.append('scale') + var_names.append('bias') + ground_truth = {name: var_dict[name] for name in var_names} + + program = fluid.Program() + with fluid.program_guard(program): + block = program.global_block() + + for name in ground_truth: + block.create_var( + name=name, dtype='float32', shape=ground_truth[name].shape) + + inputs = {"X": block.var('x')} + if 
with_scale_bias: + inputs["Scale"] = block.var('scale') + inputs["Bias"] = block.var('bias') + + block.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": block.var('y'), + "Mean": block.var('mean'), # share the same memory + "Variance": block.var('variance'), # share the same memory + }, + attrs={ + "epsilon": epsilon, + "begin_norm_axis": begin_norm_axis, + "use_mkldnn": True, + "is_test": with_is_test + }) + + exe = fluid.Executor(core.CPUPlace()) + + input_list = ['x'] + if with_scale_bias: + input_list.append('scale') + input_list.append('bias') + + out = exe.run(program, + feed={name: var_dict[name] + for name in input_list}, + fetch_list=['y', 'mean', 'variance']) + self.__assert_close(y, out[0], "y") + if not with_is_test: + self.__assert_close(mean, out[1], "mean") + self.__assert_close(variance, out[2], "variance", 1e-3) + + def test_check_forward_with_scale_and_bias(self): + self.check_forward(shape=[2, 3, 4, 5], begin_norm_axis=3) + + def test_check_forward_without_scale_and_bias(self): + self.check_forward( + shape=[2, 3, 4, 5], begin_norm_axis=3, with_scale_bias=False) + + def test_check_forward_with_is_test(self): + self.check_forward( + shape=[2, 3, 4, 5], begin_norm_axis=3, with_is_test=True) + + +if __name__ == "__main__": + enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_sum_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_sum_bf16_mkldnn_op.py index 05d739ae1f3f34..c71baad0c7040a 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_sum_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_sum_bf16_mkldnn_op.py @@ -25,7 +25,7 @@ @unittest.skipIf(not core.supports_bfloat16(), "place does not support BF16 evaluation") -class TestSumMKLDNN(TestSumOp): +class TestSumBF16MKLDNN(TestSumOp): def setUp(self): self.op_type = "sum" self.use_mkldnn = True diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index 8df7ea35ec1164..d2c07c185dd992 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -117,8 +117,12 @@ def check_forward_backward(self, begin_norm_axis, has_scale=True, has_bias=True, - y_grad_scale=1.0): - def test_with_place(place, shape, begin_norm_axis): + y_grad_scale=1.0, + use_mkldnn=False): + def test_with_place(place, + shape, + begin_norm_axis, + use_mkldnn=use_mkldnn): # attr epsilon = 0.00001 x_shape = shape @@ -181,7 +185,8 @@ def test_with_place(place, shape, begin_norm_axis): }, attrs={ "epsilon": epsilon, - "begin_norm_axis": begin_norm_axis + "begin_norm_axis": begin_norm_axis, + "use_mkldnn": use_mkldnn }) # generate backward op_desc grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 1f153442aff6c6..5fe1cc722e8753 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -293,6 +293,8 @@ 'test_label_smooth_op', 'test_lamb_op', 'test_layer_norm_op', + 'test_layer_norm_mkldnn_op', + 'test_layer_norm_bf16_mkldnn_op', 'test_layer_norm_op_v2', 'test_learning_rate_scheduler', 'test_linear_interp_op', From e4f9415338d27fff9bf34424acdd8c19608be5c6 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Tue, 17 Nov 2020 20:17:14 +0800 Subject: [PATCH 39/56] update doc, test=document_fix (#28498) --- python/paddle/distributed/collective.py | 6 ------ python/paddle/tensor/manipulation.py | 3 
--- 2 files changed, 9 deletions(-) diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index b631f7bbe9d110..cb3c37975ddf44 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -107,7 +107,6 @@ def broadcast(tensor, src, group=0): import paddle from paddle.distributed import init_parallel_env - paddle.disable_static() paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) init_parallel_env() if paddle.distributed.ParallelEnv().local_rank == 0: @@ -165,7 +164,6 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=0): from paddle.distributed import ReduceOp from paddle.distributed import init_parallel_env - paddle.disable_static() paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) init_parallel_env() if paddle.distributed.ParallelEnv().local_rank == 0: @@ -240,7 +238,6 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=0): import paddle from paddle.distributed import init_parallel_env - paddle.disable_static() paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) init_parallel_env() if paddle.distributed.ParallelEnv().local_rank == 0: @@ -323,7 +320,6 @@ def all_gather(tensor_list, tensor, group=0): import paddle from paddle.distributed import init_parallel_env - paddle.disable_static() paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) init_parallel_env() tensor_list = [] @@ -397,7 +393,6 @@ def scatter(tensor, tensor_list=None, src=0, group=0): import paddle from paddle.distributed import init_parallel_env - paddle.disable_static() paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) init_parallel_env() if paddle.distributed.ParallelEnv().local_rank == 0: @@ -463,7 +458,6 @@ def barrier(group=0): import paddle from paddle.distributed import init_parallel_env - paddle.disable_static() paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) init_parallel_env() paddle.distributed.barrier() diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index bdda90315ac9c7..a0e5e681c76e9f 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -1098,7 +1098,6 @@ def tile(x, repeat_times, name=None): import paddle - paddle.disable_static() data = paddle.to_tensor([1, 2, 3], dtype='int32') out = paddle.tile(data, repeat_times=[2, 1]) np_out = out.numpy() @@ -1193,8 +1192,6 @@ def expand_as(x, y, name=None): import paddle - paddle.disable_static() - data_x = paddle.to_tensor([1, 2, 3], 'int32') data_y = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], 'int32') out = paddle.expand_as(data_x, data_y) From b6f86b849138d56a2e170a619979e81420bdea19 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Tue, 17 Nov 2020 20:47:42 +0800 Subject: [PATCH 40/56] Fix Using "isinstance" in Loop, test=develop (#28641) Fix a bug that used in PaddleGAN model which used `isinstance` in a for loop --- .../dygraph_to_static/loop_transformer.py | 23 ++++ .../dygraph_to_static/test_isinstance.py | 112 ++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/dygraph_to_static/test_isinstance.py diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py index 0b6c7c45b3804f..b25ff8360be0ca 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py @@ -22,6 +22,7 @@ 
from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.static_analysis import NodeVarType from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor +from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code from paddle.fluid.dygraph.dygraph_to_static.utils import generate_name_node from paddle.fluid.dygraph.dygraph_to_static.utils import get_attribute_full_name from paddle.fluid.dygraph.dygraph_to_static.utils import ForNodeVisitor @@ -84,6 +85,9 @@ def __init__(self, root_node): self.condition_vars = defaultdict(set) self.in_condition = False + # Some names are types, we shouldn't record them as loop var names. + self.type_vars = set() + self.static_analysis_visitor = StaticAnalysisVisitor(root_node) self.node_to_wrapper_map = self.static_analysis_visitor.get_node_to_wrapper_map( ) @@ -249,6 +253,18 @@ def visit_While(self, node): self.generic_visit(node) self.current_loop.pop() + def visit_Call(self, node): + # Store type var names such as "isinstance(x, some_type_names)" and + # Remove them later + if isinstance(node.func, gast.Name) and node.func.id == 'isinstance': + type_node = node.args[1] + if isinstance(type_node, gast.Tuple): + for element in type_node.elts: + self.type_vars.add(ast_to_source_code(element)) + else: + self.type_vars.add(ast_to_source_code(type_node)) + self.generic_visit(node) + def _var_nodes_to_names(self, node_set, ctx_filter_set=None): ret = set() for node in node_set: @@ -290,6 +306,7 @@ def _remove_unnecessary_vars(self, loop_vars, loop_node): Remove unnecessary vars from before_loop_vars, after_loop_vars or in_loop_vars about loop_node. 1. Remove target vars of gast.For from before_loop_vars or after_loop_vars. 2. Remove vars only in gast.comprehension. + 3. Remove vars that are type names, for example: "isinstance(x, var_type_name)" :param loop_vars: before_loop_vars, after_loop_vars or in_loop_vars of loop_node. :param loop_node: Current loop node. """ @@ -361,6 +378,12 @@ def _remove_unnecessary_vars(self, loop_vars, loop_node): target_vars_of_for_node.add(var) removed_vars = target_vars_of_for_node | vars_of_list_generator + + # 3. Remove var type names which are stored in self.type_vars + for var in loop_vars: + if ast_to_source_code(var) in self.type_vars: + removed_vars.add(var) + return loop_vars - removed_vars diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_isinstance.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_isinstance.py new file mode 100644 index 00000000000000..a838ac6842aba9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_isinstance.py @@ -0,0 +1,112 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +import numpy as np +import unittest + +import paddle +import paddle.nn as nn + + +class SimpleReturnLayer(nn.Layer): + def forward(self, x): + return x + + +class AddAttrLayer(nn.Layer): + def __init__(self): + super(AddAttrLayer, self).__init__() + self.attr = None + + def forward(self, x): + out = x + self.attr + return out + + +class IsInstanceLayer(nn.Layer): + def __init__(self, layer): + super(IsInstanceLayer, self).__init__() + self.layer = layer + + @paddle.jit.to_static + def forward(self, x): + if isinstance(self.layer, (AddAttrLayer, )): + self.layer.attr = x + res = self.layer(x) + return res + + +class SequentialLayer(nn.Layer): + def __init__(self, layers): + super(SequentialLayer, self).__init__() + self.layers = nn.LayerList(layers) + + @paddle.jit.to_static + def forward(self, x): + res = x + for layer in self.layers: + if isinstance(layer, AddAttrLayer): + layer.attr = x + res = layer(res) + return res + + +def train(model, to_static): + prog_trans = paddle.jit.ProgramTranslator.get_instance() + prog_trans.enable(to_static) + + x = paddle.ones(shape=[2, 3], dtype='int32') + out = model(x) + + return out.numpy() + + +class TestIsinstance(unittest.TestCase): + def test_isinstance_simple_return_layer(self): + model = IsInstanceLayer(SimpleReturnLayer()) + self._test_model(model) + + def test_isinstance_add_attr_layer(self): + model = IsInstanceLayer(AddAttrLayer()) + self._test_model(model) + + def test_sequential_layer(self): + layers = [] + for i in range(5): + layers.append(SimpleReturnLayer()) + layers.append(AddAttrLayer()) + model = SequentialLayer(layers) + self._test_model(model) + + def _test_model(self, model): + st_out = train(model, to_static=True) + dy_out = train(model, to_static=False) + self.assertTrue( + np.allclose(dy_out, st_out), + msg="dy_out:\n {}\n st_out:\n{}".format(dy_out, st_out)) + + +if __name__ == "__main__": + unittest.main() From 11e32baf1e0f83374e3563e7541aab085e18b1cc Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 18 Nov 2020 00:18:17 +0800 Subject: [PATCH 41/56] Add matmtl_v2 to amp list (#28693) * add matmtl_v2 to amp list * support dygraph --- python/paddle/fluid/contrib/mixed_precision/fp16_lists.py | 1 + python/paddle/fluid/dygraph/amp/auto_cast.py | 1 + 2 files changed, 2 insertions(+) diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index a9f080c514dff0..8c467a4969e295 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -74,6 +74,7 @@ def _update_list(self): white_list = { 'conv2d', 'matmul', + 'matmul_v2', 'mul', } diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index ffb4d9f16f29f3..4ff08337875c03 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -27,6 +27,7 @@ WHITE_LIST = { 'conv2d', 'matmul', + 'matmul_v2', 'mul', } From 5050e761b83446b9642cf1fe586c7ae59c5ec2d8 Mon Sep 17 00:00:00 2001 From: Bai Yifan Date: Wed, 18 Nov 2020 11:20:04 
+0800 Subject: [PATCH 42/56] Support user-defined activation/weight quantize and preprocess. (#28570) * support user-defined quant and preprocess --- .../slim/quantization/imperative/qat.py | 46 +++- .../slim/quantization/imperative/quant_nn.py | 118 ++++++--- .../tests/test_imperative_qat_user_defined.py | 248 ++++++++++++++++++ 3 files changed, 373 insertions(+), 39 deletions(-) create mode 100644 python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 7fc177e7ad7654..cae24177232675 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -59,7 +59,11 @@ def __init__(self, weight_quantize_type='abs_max', activation_quantize_type='moving_average_abs_max', moving_rate=0.9, - quantizable_layer_type=['Conv2D', 'Linear']): + quantizable_layer_type=['Conv2D', 'Linear'], + weight_preprocess_layer=None, + act_preprocess_layer=None, + weight_quantize_layer=None, + act_quantize_layer=None): """ The constructor for ImperativeQuantAware. @@ -81,7 +85,28 @@ def __init__(self, quantizable_op_type(list[str]): List the type of layers that will be quantized. Default is ['Conv2D', 'Linear']. The quantizable_op_type in QuantizationFreezePass and ConvertToInt8Pass must be the same as this. - + weight_preprocess_layer(paddle.nn.Layer, optional): A paddle Layer that defines how to preprocess + weight before quantization. Using this can quickly test if user's + preprocess method works or not. The input is non-quantized + weight and function returns processed weight to be quantized. + If None, the weight will be quantized directly. Default is None. + act_preprocess_layer(paddle.nn.Layer, optional): A paddle Layer that defines how to preprocess + activation before quantization. Using this can quickly test if user's + preprocess method works or not. The input is non-quantized + activation and function returns processed activation to be quantized. + If None, the activation will be quantized directly. Default is None. + weight_quantize_layer(paddle.nn.Layer, optional): A paddle Layer that defines how to quantize weight. + Using this can quickly test if user's quantization method works or not. + In this layer, user should both define quantization method and + dequantization method, that is, the function's input is non-quantized + weight and returns dequantized weight. If None, will use + quantization op defined by 'weight_quantize_type'. Default is None. + act_quantize_layer(paddle.nn.Layer, optional): A paddle Layer that defines how to quantize activation. + Using this can quickly test if user's quantization method works or not. + In this layer, user should both define quantization method and + dequantization method, that is, the function's input is non-quantized + activation and returns dequantized activation. If None, will use + quantization op defined by 'activation_quantize_type'. Default is None. Examples: .. 
code-block:: python @@ -118,6 +143,19 @@ def __init__(self, self._activation_bits = activation_bits self._moving_rate = moving_rate + self._weight_pre_layer = weight_preprocess_layer + self._act_pre_layer = act_preprocess_layer + self._weight_quant_layer = weight_quantize_layer + self._act_quant_layer = act_quantize_layer + + t_check = lambda method: method is None or issubclass(method, dygraph.layers.Layer) + assert t_check( + self._weight_pre_layer), "weight_preprocess should be nn.Layer" + assert t_check(self._act_pre_layer), "act_preprocess should be nn.Layer" + assert t_check( + self._weight_quant_layer), "weight_quantize should be nn.Layer" + assert t_check(self._act_quant_layer), "act_quantize should be nn.Layer" + quant_type = { 'abs_max', 'moving_average_abs_max', 'channel_wise_abs_max' } @@ -189,7 +227,9 @@ def _get_quantized_counterpart(self, layer): quantized_layer = quant_nn.__dict__[quantized_counterpart[index]]( layer, self._weight_bits, self._activation_bits, self._moving_rate, - self._weight_quantize_type, self._activation_quantize_type) + self._weight_quantize_type, self._activation_quantize_type, + self._weight_pre_layer, self._act_pre_layer, + self._weight_quant_layer, self._act_quant_layer) return quantized_layer diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py index bbaae56439eb66..79138febd0ce87 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py @@ -332,7 +332,11 @@ def __init__(self, activation_bits=8, moving_rate=0.9, weight_quantize_type='abs_max', - activation_quantize_type='abs_max'): + activation_quantize_type='abs_max', + weight_pre_layer=None, + act_pre_layer=None, + weight_quant_layer=None, + act_quant_layer=None): super(QuantizedConv2D, self).__init__() # For Conv2D self._groups = getattr(layer, '_groups') @@ -347,26 +351,44 @@ def __init__(self, self.bias = getattr(layer, 'bias') # For FakeQuant self._conv2d_quant_axis = 0 - self._fake_quant_weight = _get_fake_quant_type( - weight_quantize_type, - name=self.weight.name, - moving_rate=moving_rate, - quant_bits=weight_bits, - dtype=self._dtype, - quant_on_weight=True, - channel_num=self.weight.shape[self._conv2d_quant_axis], - quant_axis=self._conv2d_quant_axis) - self._fake_quant_input = _get_fake_quant_type( - activation_quantize_type, - name=layer.full_name(), - moving_rate=moving_rate, - quant_bits=activation_bits, - dtype=self._dtype, - quant_on_weight=False) + + if weight_quant_layer is not None: + self._fake_quant_weight = weight_quant_layer() + else: + self._fake_quant_weight = _get_fake_quant_type( + weight_quantize_type, + name=self.weight.name, + moving_rate=moving_rate, + quant_bits=weight_bits, + dtype=self._dtype, + quant_on_weight=True, + channel_num=self.weight.shape[self._conv2d_quant_axis], + quant_axis=self._conv2d_quant_axis) + if act_quant_layer is not None: + self._fake_quant_input = act_quant_layer() + else: + self._fake_quant_input = _get_fake_quant_type( + activation_quantize_type, + name=layer.full_name(), + moving_rate=moving_rate, + quant_bits=activation_bits, + dtype=self._dtype, + quant_on_weight=False) + + self._act_preprocess = act_pre_layer( + ) if act_pre_layer is not None else None + self._weight_preprocess = weight_pre_layer( + ) if weight_pre_layer is not None else None def forward(self, input): + if self._act_preprocess is not None: + input = 
self._act_preprocess(input) quant_input = self._fake_quant_input(input) - quant_weight = self._fake_quant_weight(self.weight) + + weight = self.weight + if self._weight_preprocess is not None: + weight = self._weight_preprocess(self.weight) + quant_weight = self._fake_quant_weight(weight) if in_dygraph_mode() and self._l_type == 'conv2d': attrs = ('strides', self._stride, 'paddings', self._padding, @@ -428,7 +450,11 @@ def __init__(self, activation_bits=8, moving_rate=0.9, weight_quantize_type='abs_max', - activation_quantize_type='abs_max'): + activation_quantize_type='abs_max', + weight_pre_layer=None, + act_pre_layer=None, + weight_quant_layer=None, + act_quant_layer=None): super(QuantizedLinear, self).__init__() # For Linear self._act = getattr(layer, '_act') @@ -437,26 +463,46 @@ def __init__(self, self.bias = getattr(layer, 'bias') # For FakeQuant self._linear_quant_axis = 1 - self._fake_quant_weight = _get_fake_quant_type( - weight_quantize_type, - name=self.weight.name, - moving_rate=moving_rate, - quant_bits=weight_bits, - dtype=self._dtype, - quant_on_weight=True, - channel_num=self.weight.shape[self._linear_quant_axis], - quant_axis=self._linear_quant_axis) - self._fake_quant_input = _get_fake_quant_type( - activation_quantize_type, - name=layer.full_name(), - moving_rate=moving_rate, - quant_bits=activation_bits, - dtype=self._dtype, - quant_on_weight=False) + + if weight_quant_layer is not None: + self._fake_quant_weight = weight_quant_layer() + else: + self._fake_quant_weight = _get_fake_quant_type( + weight_quantize_type, + name=self.weight.name, + moving_rate=moving_rate, + quant_bits=weight_bits, + dtype=self._dtype, + quant_on_weight=True, + channel_num=self.weight.shape[self._linear_quant_axis], + quant_axis=self._linear_quant_axis) + + if act_quant_layer is not None: + self._fake_quant_input = act_quant_layer() + else: + self._fake_quant_input = _get_fake_quant_type( + activation_quantize_type, + name=layer.full_name(), + moving_rate=moving_rate, + quant_bits=activation_bits, + dtype=self._dtype, + quant_on_weight=False) + + self._act_preprocess = act_pre_layer( + ) if act_pre_layer is not None else None + self._weight_preprocess = weight_pre_layer( + ) if weight_pre_layer is not None else None def forward(self, input): + if self._act_preprocess is not None: + input = self._act_preprocess(input) quant_input = self._fake_quant_input(input) - quant_weight = self._fake_quant_weight(self.weight) + + weight = self.weight + if self._weight_preprocess is not None: + weight = self._weight_preprocess(self.weight) + quant_weight = self._fake_quant_weight(weight) + if in_dygraph_mode(): pre_bias = _varbase_creator(dtype=input.dtype) core.ops.matmul(quant_input, quant_weight, pre_bias, 'transpose_X', diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py new file mode 100644 index 00000000000000..29b69bbe0f8ea2 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py @@ -0,0 +1,248 @@ +# copyright (c) 2020 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. 
+# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +from __future__ import print_function + +import os +import numpy as np +import random +import unittest +import logging +import paddle +import paddle.nn as nn +from paddle.optimizer import Adam +from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware +from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass +from paddle.nn import Sequential +from paddle.fluid.dygraph import Conv2D +from paddle.nn import Pool2D +from paddle.fluid.dygraph import Linear +from paddle.fluid.log_helper import get_logger + +os.environ["CPU_NUM"] = "1" + +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') + + +class PACT(nn.Layer): + def __init__(self, init_value=20): + super(PACT, self).__init__() + alpha_attr = paddle.ParamAttr( + name=self.full_name() + ".pact", + initializer=paddle.nn.initializer.Constant(value=init_value)) + self.alpha = self.create_parameter( + shape=[1], attr=alpha_attr, dtype='float32') + + def forward(self, x): + out_left = paddle.nn.functional.relu(x - self.alpha) + out_right = paddle.nn.functional.relu(-self.alpha - x) + x = x - out_left + out_right + return x + + +class CustomQAT(nn.Layer): + def __init__(self): + super(CustomQAT, self).__init__() + attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=1.0)) + self.u_param = self.create_parameter( + shape=[1], attr=attr, dtype='float32') + self.l_param = self.create_parameter( + shape=[1], attr=attr, dtype='float32') + self.alpha_param = self.create_parameter( + shape=[1], attr=attr, dtype='float32') + self.upper = self.create_parameter( + shape=[1], attr=attr, dtype='float32') + self.upper.stop_gradient = True + self.lower = self.create_parameter( + shape=[1], attr=attr, dtype='float32') + self.lower.stop_gradient = True + + def forward(self, x): + def clip(x, upper, lower): + x = x + paddle.nn.functional.relu(lower - x) + x = x - paddle.nn.functional.relu(x - upper) + return x + + def phi_function(x, mi, alpha, delta): + s = 1 / (1 - alpha) + k = paddle.log(2 / alpha - 1) * (1 / delta) + x = (paddle.tanh((x - mi) * k)) * s + return x + + def dequantize(x, lower_bound, delta, interval): + x = ((x + 1) / 2 + interval) * delta + lower_bound + return x + + bit = 8 + bit_range = 2**bit - 1 + + paddle.assign(self.upper * 0.9 + self.u_param * 0.1, self.upper) + paddle.assign(self.lower * 0.9 + self.l_param * 0.1, self.lower) + x = clip(x, self.upper, self.lower) + delta = (self.upper - self.lower) / bit_range + interval = (x - self.lower) / delta + mi = (interval + 0.5) * delta + self.l_param + x = phi_function(x, mi, self.alpha_param, delta) + x = dequantize(x, self.l_param, delta, interval) + return x + + +class ImperativeLenet(paddle.nn.Layer): + def __init__(self, num_classes=10, classifier_activation='softmax'): + super(ImperativeLenet, self).__init__() + self.features = Sequential( + Conv2D( + num_channels=1, + num_filters=6, + filter_size=3, + stride=1, + padding=1), + Pool2D( + pool_size=2, pool_type='max', pool_stride=2), + Conv2D( + num_channels=6, + num_filters=16, + filter_size=5, + stride=1, + padding=0), + Pool2D( + 
pool_size=2, pool_type='max', pool_stride=2)) + + self.fc = Sequential( + Linear( + input_dim=400, output_dim=120), + Linear( + input_dim=120, output_dim=84), + Linear( + input_dim=84, output_dim=num_classes, + act=classifier_activation)) + + def forward(self, inputs): + x = self.features(inputs) + + x = paddle.flatten(x, 1) + x = self.fc(x) + return x + + +class TestUserDefinedActPreprocess(unittest.TestCase): + def setUp(self): + _logger.info("test act_preprocess") + self.imperative_qat = ImperativeQuantAware(act_preprocess_layer=PACT) + + def test_quant_aware_training(self): + imperative_qat = self.imperative_qat + seed = 1 + np.random.seed(seed) + paddle.static.default_main_program().random_seed = seed + paddle.static.default_startup_program().random_seed = seed + lenet = ImperativeLenet() + fixed_state = {} + param_init_map = {} + for name, param in lenet.named_parameters(): + p_shape = param.numpy().shape + p_value = param.numpy() + if name.endswith("bias"): + value = np.zeros_like(p_value).astype('float32') + else: + value = np.random.normal( + loc=0.0, scale=0.01, + size=np.product(p_shape)).reshape(p_shape).astype('float32') + fixed_state[name] = value + param_init_map[param.name] = value + lenet.set_dict(fixed_state) + + imperative_qat.quantize(lenet) + adam = Adam(learning_rate=0.001, parameters=lenet.parameters()) + dynamic_loss_rec = [] + + def train(model): + adam = Adam(learning_rate=0.001, parameters=model.parameters()) + epoch_num = 1 + for epoch in range(epoch_num): + model.train() + for batch_id, data in enumerate(train_reader()): + x_data = np.array([x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape(-1, 1) + + img = paddle.to_tensor(x_data) + label = paddle.to_tensor(y_data) + out = model(img) + acc = paddle.metric.accuracy(out, label, k=1) + loss = nn.functional.loss.cross_entropy(out, label) + avg_loss = paddle.mean(loss) + avg_loss.backward() + adam.minimize(avg_loss) + model.clear_gradients() + if batch_id % 50 == 0: + _logger.info( + "Train | At epoch {} step {}: loss = {:}, acc= {:}". 
+ format(epoch, batch_id, + avg_loss.numpy(), acc.numpy())) + break + + def test(model): + model.eval() + avg_acc = [[], []] + for batch_id, data in enumerate(test_reader()): + x_data = np.array([x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape(-1, 1) + + img = paddle.to_tensor(x_data) + label = paddle.to_tensor(y_data) + + out = model(img) + acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1) + acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5) + avg_acc[0].append(acc_top1.numpy()) + avg_acc[1].append(acc_top5.numpy()) + if batch_id % 100 == 0: + _logger.info( + "Test | step {}: acc1 = {:}, acc5 = {:}".format( + batch_id, acc_top1.numpy(), acc_top5.numpy())) + + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=512, drop_last=True) + test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=512) + train(lenet) + test(lenet) + + +class TestUserDefinedWeightPreprocess(TestUserDefinedActPreprocess): + def setUp(self): + _logger.info("test weight_preprocess") + self.imperative_qat = ImperativeQuantAware(weight_preprocess_layer=PACT) + + +class TestUserDefinedActQuantize(TestUserDefinedActPreprocess): + def setUp(self): + _logger.info("test act_quantize") + self.imperative_qat = ImperativeQuantAware(act_quantize_layer=CustomQAT) + + +class TestUserDefinedWeightQuantize(TestUserDefinedActPreprocess): + def setUp(self): + _logger.info("test weight_quantize") + self.imperative_qat = ImperativeQuantAware( + weight_quantize_layer=CustomQAT) + + +if __name__ == '__main__': + unittest.main() From 358d6bc90f0c4b463d66bc7ae25116b0f814dfb0 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 18 Nov 2020 12:46:35 +0800 Subject: [PATCH 43/56] Fix test_weight_decay_extend random failed on windows (#28643) * add debuging code * change seed & add debug message --- .../contrib/tests/test_weight_decay_extend.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py index 65d400c63262bf..9eb2fe6cbd1a15 100644 --- a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py +++ b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py @@ -24,6 +24,8 @@ paddle.enable_static() +SEED = 2020 + def fake_imdb_reader(word_dict_size, sample_num, @@ -87,6 +89,11 @@ def bow_net(data, class TestWeightDecay(unittest.TestCase): def setUp(self): + # set seed + np.random.seed(SEED) + paddle.seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + # configs self.word_dict_len = 5147 batch_size = 2 reader = fake_imdb_reader(self.word_dict_len, batch_size * 100) @@ -114,8 +121,6 @@ def run_program(self, place, feed_list): return param_sum def check_weight_decay(self, place, model): - paddle.seed(1) - paddle.framework.random._manual_program_seed(1) main_prog = fluid.framework.Program() startup_prog = fluid.framework.Program() @@ -137,8 +142,6 @@ def check_weight_decay(self, place, model): return param_sum def check_weight_decay2(self, place, model): - paddle.seed(1) - paddle.framework.random._manual_program_seed(1) main_prog = fluid.framework.Program() startup_prog = fluid.framework.Program() @@ -173,7 +176,12 @@ def test_weight_decay(self): param_sum2 = self.check_weight_decay2(place, model) for i in range(len(param_sum1)): - assert np.isclose(a=param_sum1[i], b=param_sum2[i], rtol=5e-5) + self.assertTrue( + 
np.allclose(param_sum1[i], param_sum2[i]), + "Current place: {}, i: {}, sum1: {}, sum2: {}".format( + place, i, param_sum1[i][~np.isclose(param_sum1[ + i], param_sum2[i])], param_sum2[i][~np.isclose( + param_sum1[i], param_sum2[i])])) if __name__ == '__main__': From f78211d08233b6b66b706979f867a462d8aa8920 Mon Sep 17 00:00:00 2001 From: chalsliu <45041955+chalsliu@users.noreply.github.com> Date: Wed, 18 Nov 2020 12:51:06 +0800 Subject: [PATCH 44/56] Add delta file for precision test --- tools/get_pr_ut.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index 970f89551c579b..b166573ffe4db5 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -65,6 +65,12 @@ def get_pr_ut(self): else: ut_list.extend(file_ut_map.get(f)) ut_list = list(set(ut_list)) + cmd = 'wget -q --no-check-certificate https://sys-p0.bj.bcebos.com/prec/prec_delta' + os.system(cmd) + with open('prec_delta') as delta: + for ut in delta: + ut_list.append(ut.rstrip('\r\n')) + return ' '.join(ut_list) From 858ffa0c8b6ff6c10b7f62a0a47d56fa7e37362f Mon Sep 17 00:00:00 2001 From: Guo Sheng Date: Wed, 18 Nov 2020 13:04:10 +0800 Subject: [PATCH 45/56] Fix the dropout setting when not initialized in rnn_op. (#28561) test=develop --- paddle/fluid/operators/rnn_op.cu.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/rnn_op.cu.cc b/paddle/fluid/operators/rnn_op.cu.cc index 568db79722324f..f38bfd5968884c 100644 --- a/paddle/fluid/operators/rnn_op.cu.cc +++ b/paddle/fluid/operators/rnn_op.cu.cc @@ -89,15 +89,16 @@ class RNNDescriptors { // ------------------- cudnn dropout descriptors --------------------- size_t state_size; - if (!is_test_ && !dropout_state->IsInitialized()) { + bool is_initialized = dropout_state->IsInitialized(); + if (!is_test_ && !is_initialized) { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); dropout_state->mutable_data({static_cast(state_size)}, place); } - dropout_desc_.descriptor(handle, place, dropout_state->IsInitialized(), - dropout_prob_, is_test_ ? nullptr : dropout_state, - seed_, state_size); + dropout_desc_.descriptor(handle, place, is_initialized, dropout_prob_, + is_test_ ? 
nullptr : dropout_state, seed_, + state_size); // ------------------- cudnn rnn descriptors --------------------- #if CUDNN_VERSION >= 6000 From 7eeb99fe025c0946014956300930461bf3ad8fe9 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 18 Nov 2020 13:09:21 +0800 Subject: [PATCH 46/56] Add basic hook classes for dygraph & implement reduce hook (#28584) * add base hook classes and reduce hook impl * fix constructor typo * polish comment format * refactor baisc hook class design * polish design details --- paddle/fluid/imperative/basic_engine.cc | 21 +- paddle/fluid/imperative/basic_engine.h | 3 + .../fluid/imperative/gradient_accumulator.cc | 9 +- .../fluid/imperative/gradient_accumulator.h | 49 ++++ paddle/fluid/imperative/hooks.h | 233 +++++++++++++++++ paddle/fluid/imperative/op_base.h | 2 +- paddle/fluid/imperative/tests/CMakeLists.txt | 1 + paddle/fluid/imperative/tests/test_hooks.cc | 240 ++++++++++++++++++ paddle/fluid/imperative/variable_wrapper.h | 82 ++++++ 9 files changed, 637 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/imperative/hooks.h create mode 100644 paddle/fluid/imperative/tests/test_hooks.cc diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 9ad30506b2c3a0..e9214a8fea8174 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -114,6 +114,16 @@ void BasicEngine::PrepareGradAccumulators(const OpBase& op) { accumulator->IncreaseRefCnt(); + if (var->HasLeafHooks()) { + VLOG(3) << "Grad variable wrapper (" << var->Name() + << ") has leaf grad hooks."; + PADDLE_ENFORCE_NE(var->HasGradNode(), true, + platform::errors::PermissionDenied( + "Only leaf Tensor's gradient can append hook to " + "Gradientaccumulator.")); + accumulator->SetPostHooks(var->GetLeafHooks()); + } + VLOG(3) << "Prepare to acccumulate variable grad " << var->Name() << "(" << var.get() << ") with reference count " << accumulator->RefCnt(); @@ -204,6 +214,7 @@ void BasicEngine::Execute() { var->Name())); if (!var->OverridedStopGradient() && iter->second->RefCnt() == 1) { + no_need_run_accumulators_.emplace_back(iter->second.get()); continue; } @@ -220,12 +231,19 @@ void BasicEngine::Execute() { cur_op.place()); } - // Step 2: Sum Gradient + // Step 2: Sum Gradient & Call Accumulator Hooks + for (auto* accumulator : no_need_run_accumulators_) { + if (accumulator->HasPostHooks()) { + accumulator->CallBackwardPostHooks(); + } + } + for (auto& pair : need_accu_var_list_) { pair.first->Add(std::move(pair.second), cur_op.id()); } need_accu_var_list_.clear(); + no_need_run_accumulators_.clear(); VLOG(3) << "Remove op after op " << cur_op.Type() << " runs"; if (!retain_graph_) { @@ -258,6 +276,7 @@ void BasicEngine::Clear() { node_deps_.clear(); accumulators_.clear(); need_accu_var_list_.clear(); + no_need_run_accumulators_.clear(); } } // namespace imperative diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index 0906dd4f9236ec..92e7fe7eb8cd79 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -49,6 +49,9 @@ class BasicEngine : public Engine { accumulators_; std::vector>> need_accu_var_list_; + // Accumulators that does not need to perform accumulation operations, + // the ref_cnt_=1, corresponding to need_accu_var_list_ + std::vector no_need_run_accumulators_; bool retain_graph_; }; diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 
07f1868b7fa299..00fd18e5e2564c 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -401,13 +401,15 @@ void EagerGradientAccumulator::Add(std::shared_ptr var, } } } - ++cur_cnt_; if (var_->Var().IsType()) { var_->SetType(framework::proto::VarType::LOD_TENSOR); } else if (var_->Var().IsType()) { var_->SetType(framework::proto::VarType::SELECTED_ROWS); } + + // Increase count & call post hooks + IncreaseCurCnt(); } void SortedGradientAccumulator::Add(std::shared_ptr var, @@ -520,6 +522,11 @@ void SortedGradientAccumulator::Add(std::shared_ptr var, } else if (var_->Var().IsType()) { var_->SetType(framework::proto::VarType::SELECTED_ROWS); } + + // call post hooks + if (HasPostHooks()) { + CallBackwardPostHooks(); + } } } // namespace imperative diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index a8ccb2a38d3c33..2d0cc6e8921590 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -17,6 +17,8 @@ #include #include #include + +#include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/layer.h" namespace paddle { @@ -35,9 +37,43 @@ class GradientAccumulator { inline size_t RefCnt() const { return ref_cnt_; } + /* Hook related methods */ + inline bool HasPostHooks() const { return !post_hooks_.expired(); } + + void SetPostHooks(const std::shared_ptr& hooks) { + PADDLE_ENFORCE_NOT_NULL( + hooks, platform::errors::InvalidArgument( + "The hook set to GradientAccumulator is nullptr.")); + + auto shared_hooks = post_hooks_.lock(); + if (shared_hooks != hooks) { + PADDLE_ENFORCE_EQ( + shared_hooks, nullptr, + platform::errors::PermissionDenied( + "Cannot set post hooks twice to GradientAccumulator.")); + post_hooks_ = hooks; + } + } + + // call backward post hooks, such as reduce hook + void CallBackwardPostHooks() { + PADDLE_ENFORCE_NE( + post_hooks_.expired(), true, + platform::errors::NotFound( + "The post hooks of GradientAccumulator for Tensor `%s` expired.", + var_->Name())); + auto shared_hooks = post_hooks_.lock(); + for (const auto& hook : shared_hooks->backward_hooks()) { + VLOG(3) << "call gradient accumulator backward hooks."; + (*hook)(var_); + } + } + protected: VariableWrapper* var_; size_t ref_cnt_{0}; + + std::weak_ptr post_hooks_; }; class EagerGradientAccumulator : public GradientAccumulator { @@ -47,6 +83,19 @@ class EagerGradientAccumulator : public GradientAccumulator { void Add(std::shared_ptr var, size_t trace_id, bool unchange_input) override; + private: + inline bool AccumulateCompleted() const { return cur_cnt_ == ref_cnt_; } + + void IncreaseCurCnt() { + ++cur_cnt_; + VLOG(3) << "IncreaseCurCnt: cur_cnt " << cur_cnt_ << ", ref_cnt " + << ref_cnt_; + // After all tmp gradient being accumulated to grad var, run hooks + if (AccumulateCompleted() && HasPostHooks()) { + CallBackwardPostHooks(); + } + } + private: size_t cur_cnt_{0}; }; diff --git a/paddle/fluid/imperative/hooks.h b/paddle/fluid/imperative/hooks.h new file mode 100644 index 00000000000000..1211ec6ae6c7bd --- /dev/null +++ b/paddle/fluid/imperative/hooks.h @@ -0,0 +1,233 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#include "paddle/fluid/imperative/type_defs.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace imperative {
+
+class VariableWrapper;
+
+/** [ Basic hook classes ]
+ *
+ * @brief OpBasePreHook is executed before the grad OpBase is executed,
+ * taking the input of the current grad OpBase as input, and
+ * executing python hooks (user-defined) or C++ hooks (developer-defined)
+ * to achieve the purpose of custom operations on the interior VarBase
+ * gradient.
+ *
+ * @note OpBasePreHook will not change the input gradient VarBase.
+ *
+ * @note [Why need to be OpBase `PreHook`, why not `PostHook`?]
+ *
+ * If set OpBase post hook, when the op executed end, the op's output
+ * gradient may not be the final state, because it may need other op's
+ * gradient output to accumulated to it. But before op can be executed,
+ * the gradient output must have been accumulated to final value.
+ *
+ * @note [Why only can be used for interior VarBase?]
+ *
+ * Because the leaf VarBase's GradVarBase has no GradOpNode, so leaf
+ * GradVarBase has no next OpBase to executed, so if need to deal with
+ * the leaf GradVarBase, cannot use OpBasePreHook. For this case, we
+ * deal with by GradAccumulatorPostHook.
+ */
+class OpBasePreHook {
+ public:
+  virtual ~OpBasePreHook() = default;
+  virtual VariableWrapperList operator()(
+      const VariableWrapperList& grad_inputs) = 0;
+};
+
+/**
+ * @brief GradAccumulatorPostHook is the Hook that operates on the current
+ * gradient after the GradientAccumulator has accumulated the gradient.
+ * Leaf GradVarBase has no next OpBase, if we want to register hook
+ * for it, we also need to wait until the leaf GradVarBase accumulation
+ * is completed, so we can add post hook to GradientAccumulator.
+ *
+ * @note GradAccumulatorPostHook will change the grad VarBase value.
+ *
+ * @note Only allow leaf VarBase hold GradientAccumulatorPostHook.
+ */
+class GradAccumulatorPostHook {
+ public:
+  virtual ~GradAccumulatorPostHook() = default;
+  virtual void operator()(VariableWrapper* var) = 0;
+};
+
+/** [ Hook for cpp functions ]
+ *
+ * Here we design three C++ hooks:
+ * 1. CppOpBasePreHook (Implement later):
+ *    - used for developer-defined C++ interior VarBase hooks
+ * 2. CppGradAccumulatorPostHook (Implement later):
+ *    - used for developer-defined C++ leaf VarBase hooks
+ * 3. LambdaGradAccumulatorPostHook:
+ *    - used for VarBase reduce in parallel training
+ *
+ * @note [Why need two types of GradAccumulatorPostHook? ]
+ *
+ * There are two types of gradient accumulation:
+ * 1. Gradient accumulation in same batch
+ * 2. Gradient accumulation across batches
+ * The order of execution between Hooks and gradient accumulation:
+ *
+ *         [ Gradient accumulation in same batch]
+ *                        |
+ *            [ leaf GradVarBase hooks ]
+ *                        |
+ *         [ Gradient accumulation across batches ]
+ *                        |
+ *          [ Gradient reduce / allreduce]
+ *
+ * Because we currently intend to accumulate these two gradient
+ * accumulation in one GradientAccumulator, we must distinguish between
+ * two types of hooks.
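+ *
+ * As a rough usage sketch (illustrative only; the concrete call sites live
+ * in variable_wrapper.h and tests/test_hooks.cc of this patch), a reduce-style
+ * post hook is attached by the framework to a leaf VarBase through its
+ * VariableWrapper:
+ *
+ *   x->SharedVar()->AddGradVarLeafBackwardHook(
+ *       std::unique_ptr<LambdaGradAccumulatorPostHook>(
+ *           new LambdaGradAccumulatorPostHook([](VariableWrapper* grad) {
+ *             // e.g. allreduce / rescale the accumulated leaf gradient
+ *           })));
+ *
+ * The GradientAccumulator then invokes such hooks once the leaf gradient
+ * has been fully accumulated (see GradientAccumulator::CallBackwardPostHooks).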
+ * + * And the LambdaGradAccumulatorPostHook does not allow users to register + * directly, and is currently only used to support the reduce strategy of + * parallel multi-card training. + */ +class LambdaGradAccumulatorPostHook : public GradAccumulatorPostHook { + public: + explicit LambdaGradAccumulatorPostHook( + std::function fn) + : fn_(std::move(fn)) {} + + void operator()(VariableWrapper* var) override { fn_(var); } + + private: + std::function fn_; +}; + +/* Hooks for python function: in pybind/imperative.cc */ + +/** Add Python Hooks later: + * - PyOpBasePreHook (Implement later): used for user-defined interior python + * VarBase hooks + * - PyGradAccumulatorPostHook (Implement later): used for user-defined leaf + * python VarBase hooks + */ + +/** [ Hook Pipeline classes ] + * + * @note [Why need hook pipeline classes?] + * + * There are 2 purposes for adding Hook pipeline here: + * + * 1. Make the code implementation cleaner. + * + * If there are no Hook pipeline, we need to add 3 hook vector into + * VariableWrapper, 1 hook vector into OpBase, 2 hook vector into + * GradientAccumulator, like: + * + * - VariableWrapper: + * std::vector> + * interior_var_hooks_; + * std::vector> + * leaf_var_hooks_; + * std::vector> + * backward_hooks_; + * + * - OpBase: + * std::vector> + * interior_var_hooks_; + * + * - GradientAccumulator: + * std::vector> + * leaf_var_hooks_; + * std::vector> + * backward_hooks_; + * + * This seems more complicated, and std::vector> + * is not easy to destruct. + * + * 2. Make the code easier to understand. + * + * From these two packages, we can clearly understand that we + * have two types of Hooks, respectively for the interior + * gradient var and leaf gradient var inside the backward + * calculation graph. + */ + +class InteriorVarHookPipeline { + public: + InteriorVarHookPipeline() = default; + + void add_hook(std::unique_ptr&& hook) { + hooks_.emplace_back(std::move(hook)); + } + + const std::vector>& hooks() const { + return hooks_; + } + + std::vector>& hooks() { return hooks_; } + + private: + std::vector> hooks_; + + DISABLE_COPY_AND_ASSIGN(InteriorVarHookPipeline); +}; + +class LeafVarHookPipeline { + public: + LeafVarHookPipeline() = default; + + void add_hook(std::unique_ptr&& hook) { + hooks_.emplace_back(std::move(hook)); + } + + const std::vector>& hooks() const { + return hooks_; + } + + std::vector>& hooks() { + return hooks_; + } + + void add_backward_hook(std::unique_ptr&& hook) { + backward_hooks_.emplace_back(std::move(hook)); + } + + const std::vector>& backward_hooks() + const { + return backward_hooks_; + } + + std::vector>& backward_hooks() { + return backward_hooks_; + } + + private: + std::vector> hooks_; + // NOTE: the `backward` here means the `whole backward process`, + // the `backward_hooks_` need to be executed after the `whole backward + // process`. 
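+  // A typical backward hook is the data-parallel reduce hook registered via
+  // VariableWrapper::AddGradVarLeafBackwardHook; it may only run after the
+  // leaf gradient has been fully accumulated by the GradientAccumulator.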
+ std::vector> backward_hooks_; + + DISABLE_COPY_AND_ASSIGN(LeafVarHookPipeline); +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index a4b57c404ce00b..36185af3a25257 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -176,7 +176,7 @@ class OpBase { platform::Place place_; size_t id_{-1UL}; - std::vector> backward_hooks_; + std::weak_ptr pre_hooks_; }; class GradOpNode { diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index e3c82474e09e2d..a8de1e6b039268 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -11,3 +11,4 @@ cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy se cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy) cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split activation_op place) cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) +cc_test(test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy) diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc new file mode 100644 index 00000000000000..7bf5f876681bab --- /dev/null +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/basic_engine.h" +#include "paddle/fluid/imperative/hooks.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/memory/memcpy.h" + +namespace platform = paddle::platform; +namespace framework = paddle::framework; +namespace memory = paddle::memory; + +DECLARE_bool(sort_sum_gradient); + +namespace paddle { +namespace imperative { + +using vb_vector = std::vector>; +using var_pair = std::pair; + +TEST(TestHooks, TestGradVarLeafBackwardHook) { + // 1. 
prepare + Tracer tracer; + std::shared_ptr x(new VarBase(true, "x")); + std::shared_ptr y(new VarBase(true, "y")); + std::shared_ptr out(new VarBase(true, "out")); + x->SetOverridedStopGradient(false); + y->SetOverridedStopGradient(false); + + platform::CPUPlace place; + std::vector src_data(10, 2.0); + std::vector x_dims = {2, 5}; + std::vector y_dims = {5, 2}; + + auto* x_tensor = x->MutableVar()->GetMutable(); + auto* y_tensor = y->MutableVar()->GetMutable(); + + x_tensor->Resize(framework::make_ddim(x_dims)); + auto* mutable_x = x_tensor->mutable_data(place); + memory::Copy(place, mutable_x, place, src_data.data(), + sizeof(float) * src_data.size()); + + y_tensor->Resize(framework::make_ddim(y_dims)); + auto* mutable_y = y_tensor->mutable_data(place); + memory::Copy(place, mutable_y, place, src_data.data(), + sizeof(float) * src_data.size()); + + var_pair x_pair = var_pair("X", vb_vector(1, x)); + var_pair y_pair = var_pair("Y", vb_vector(1, y)); + var_pair out_pair = var_pair("Out", vb_vector(1, out)); + + NameVarBaseMap ins = {x_pair, y_pair}; + NameVarBaseMap outs = {out_pair}; + framework::AttributeMap mul_attr_map; + mul_attr_map["use_mkldnn"] = false; + + // add GradAccumulatorPostHook + auto x_var_wrapper = x->SharedVar(); + x_var_wrapper->AddGradVarLeafBackwardHook( + std::unique_ptr( + new LambdaGradAccumulatorPostHook([=](VariableWrapper* grad) { + auto* grad_tensor = + grad->MutableVar()->GetMutable(); + for (int i = 0; i < grad_tensor->numel(); ++i) { + grad_tensor->mutable_data(place)[i] *= 2.0; + } + }))); + + // 2. forward + tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); + + ASSERT_EQ(x->GradVarBase()->GradOpNum(), 0UL); + ASSERT_EQ(y->GradVarBase()->GradOpNum(), 0UL); + ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); + + // 3. backward + BasicEngine engine; + engine.Init(out.get()); + engine.Execute(); + + framework::LoDTensor x_grad; + framework::TensorCopySync(x->GradVar().Get(), place, + &x_grad); + for (int i = 0; i < x_grad.numel(); ++i) { + ASSERT_EQ(x_grad.data()[i], 8.0); + } + + framework::LoDTensor y_grad; + framework::TensorCopySync(y->GradVar().Get(), place, + &y_grad); + + for (int i = 0; i < y_grad.numel(); ++i) { + ASSERT_EQ(y_grad.data()[i], 4.0); + } +} + +void GradVarLeafBackwardHookWithGradAccmulatedTest() { + // 1. 
prepare + Tracer tracer; + std::shared_ptr x(new VarBase(true, "x")); + std::shared_ptr y(new VarBase(true, "y")); + std::shared_ptr z(new VarBase(true, "z")); + std::shared_ptr out_xy(new VarBase(true, "out_xy")); + std::shared_ptr out_xz(new VarBase(true, "out_xz")); + std::shared_ptr out(new VarBase(true, "out")); + x->SetOverridedStopGradient(false); + y->SetOverridedStopGradient(false); + z->SetOverridedStopGradient(false); + + platform::CPUPlace place; + std::vector src_data(10, 2.0); + std::vector x_dims = {2, 5}; + std::vector y_dims = {5, 2}; + std::vector z_dims = {5, 2}; + + auto* x_tensor = x->MutableVar()->GetMutable(); + auto* y_tensor = y->MutableVar()->GetMutable(); + auto* z_tensor = z->MutableVar()->GetMutable(); + + x_tensor->Resize(framework::make_ddim(x_dims)); + auto* mutable_x = x_tensor->mutable_data(place); + memory::Copy(place, mutable_x, place, src_data.data(), + sizeof(float) * src_data.size()); + + y_tensor->Resize(framework::make_ddim(y_dims)); + auto* mutable_y = y_tensor->mutable_data(place); + memory::Copy(place, mutable_y, place, src_data.data(), + sizeof(float) * src_data.size()); + + z_tensor->Resize(framework::make_ddim(z_dims)); + auto* mutable_z = z_tensor->mutable_data(place); + memory::Copy(place, mutable_z, place, src_data.data(), + sizeof(float) * src_data.size()); + + // add GradAccumulatorPostHook + auto x_var_wrapper = x->SharedVar(); + x_var_wrapper->AddGradVarLeafBackwardHook( + std::unique_ptr( + new LambdaGradAccumulatorPostHook([=](VariableWrapper* grad) { + auto* grad_tensor = + grad->MutableVar()->GetMutable(); + for (int i = 0; i < grad_tensor->numel(); ++i) { + grad_tensor->mutable_data(place)[i] *= 2.0; + } + }))); + + // 2. forward + var_pair x_pair = var_pair("X", vb_vector(1, x)); + var_pair y_pair = var_pair("Y", vb_vector(1, y)); + var_pair out_xy_pair = var_pair("Out", vb_vector(1, out_xy)); + NameVarBaseMap ins = {x_pair, y_pair}; + NameVarBaseMap outs = {out_xy_pair}; + framework::AttributeMap mul_attr_map; + mul_attr_map["use_mkldnn"] = false; + tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); + + var_pair z_pair = var_pair("Y", vb_vector(1, z)); + var_pair out_xz_pair = var_pair("Out", vb_vector(1, out_xz)); + ins = {x_pair, z_pair}; + outs = {out_xz_pair}; + tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); + + var_pair xy_pair = var_pair("X", vb_vector(1, out_xy)); + var_pair xz_pair = var_pair("Y", vb_vector(1, out_xz)); + var_pair out_pair = var_pair("Out", vb_vector(1, out)); + ins = {xy_pair, xz_pair}; + outs = {out_pair}; + framework::AttributeMap add_attr_map; + tracer.TraceOp("elementwise_add", ins, outs, add_attr_map, place, true); + + ASSERT_EQ(x->GradVarBase()->GradOpNum(), 0UL); + ASSERT_EQ(y->GradVarBase()->GradOpNum(), 0UL); + ASSERT_EQ(z->GradVarBase()->GradOpNum(), 0UL); + ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); + + // 3. 
backward + BasicEngine engine; + engine.Init(out.get()); + engine.Execute(); + + framework::LoDTensor x_grad; + framework::TensorCopySync(x->GradVar().Get(), place, + &x_grad); + for (int i = 0; i < x_grad.numel(); ++i) { + ASSERT_EQ(x_grad.data()[i], 16.0); + } + + framework::LoDTensor y_grad; + framework::TensorCopySync(y->GradVar().Get(), place, + &y_grad); + + for (int i = 0; i < y_grad.numel(); ++i) { + ASSERT_EQ(y_grad.data()[i], 4.0); + } + + framework::LoDTensor z_grad; + framework::TensorCopySync(z->GradVar().Get(), place, + &z_grad); + + for (int i = 0; i < z_grad.numel(); ++i) { + ASSERT_EQ(z_grad.data()[i], 4.0); + } +} + +TEST(TestHooks, TestGradVarLeafBackwardHookWithGradAccmulated) { + GradVarLeafBackwardHookWithGradAccmulatedTest(); +} + +TEST(TestHooks, TestGradVarLeafBackwardHookWithSortedGradAccmulated) { + FLAGS_sort_sum_gradient = true; + GradVarLeafBackwardHookWithGradAccmulatedTest(); + FLAGS_sort_sum_gradient = false; +} + +} // namespace imperative +} // namespace paddle + +USE_OP(mul); +USE_OP(mul_grad); +USE_OP(elementwise_add); +USE_OP(elementwise_add_grad); diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index d730ddc12d1053..e9b1ccc860df0f 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -16,11 +16,16 @@ #include #include +#include + #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/imperative/hooks.h" namespace paddle { namespace imperative { +class InteriorVarHookPipeline; +class LeafVarHookPipeline; class VarBase; class GradOpNode; @@ -133,6 +138,42 @@ class VariableWrapper { } } + /* Hook related method: only can be call by GradVarBase */ + + bool HasInteriorHooks() const { return interior_hooks_ != nullptr; } + + bool HasLeafHooks() const { return leaf_hooks_ != nullptr; } + + void AddGradVarInteriorHook(std::unique_ptr&& hook) { + auto interior_hooks = GetGradVarInteriorHooksSafely(); + interior_hooks->add_hook(std::move(hook)); + } + + void AddGradVarLeafHook(std::unique_ptr&& hook) { + auto leaf_hooks = GetGradVarLeafHooksSafely(); + leaf_hooks->add_hook(std::move(hook)); + } + + void AddGradVarLeafBackwardHook( + std::unique_ptr&& hook) { + auto leaf_hooks = GetGradVarLeafHooksSafely(); + leaf_hooks->add_backward_hook(std::move(hook)); + } + + const std::shared_ptr& GetInteriorHooks() const { + return interior_hooks_; + } + + std::shared_ptr& GetInteriorHooks() { + return interior_hooks_; + } + + const std::shared_ptr& GetLeafHooks() const { + return leaf_hooks_; + } + + std::shared_ptr& GetLeafHooks() { return leaf_hooks_; } + private: void SetGradVar(const std::shared_ptr& var) { auto shared_var = grad_var_.lock(); @@ -159,6 +200,41 @@ class VariableWrapper { } } + /* Hook related private methods */ + std::shared_ptr GetGradVarSafely() const { + auto shared_grad_var = grad_var_.lock(); + PADDLE_ENFORCE_NOT_NULL( + shared_grad_var, + platform::errors::PermissionDenied( + "Cannot add gradient hook on Tensor without gradient.")); + return shared_grad_var; + } + + std::shared_ptr& GetGradVarInteriorHooksSafely() { + auto shared_grad_var = GetGradVarSafely(); + PADDLE_ENFORCE_EQ(HasGradNode(), true, + platform::errors::PermissionDenied( + "Only interior Tensor in backward can register " + "interior gradient hook.")); + if (shared_grad_var->interior_hooks_ == nullptr) { + shared_grad_var->interior_hooks_ = + std::make_shared(); + } + return shared_grad_var->interior_hooks_; + } + + std::shared_ptr& GetGradVarLeafHooksSafely() 
{ + auto shared_grad_var = GetGradVarSafely(); + PADDLE_ENFORCE_EQ( + HasGradNode(), false, + platform::errors::PermissionDenied( + "Only leaf Tensor in backward can register leaf gradient hook.")); + if (shared_grad_var->leaf_hooks_ == nullptr) { + shared_grad_var->leaf_hooks_ = std::make_shared(); + } + return shared_grad_var->leaf_hooks_; + } + private: framework::Variable var_; std::string name_; @@ -173,6 +249,12 @@ class VariableWrapper { std::weak_ptr grad_var_; std::weak_ptr grad_node_; + + // NOTE: only grad var can hold hooks now + // only interior var can hold interior hooks + std::shared_ptr interior_hooks_; + // only leaf var can hold leaf hooks + std::shared_ptr leaf_hooks_; }; } // namespace imperative From db2e6cee620113590ef6f29eb65c495b7bab2d19 Mon Sep 17 00:00:00 2001 From: Shibo Tao <62922815+T8T9@users.noreply.github.com> Date: Wed, 18 Nov 2020 14:05:10 +0800 Subject: [PATCH 47/56] add two paddle-2.0 apis: paddle.static.io.save_inference_model and paddle.static.io.load_inference_model (#28606) * add two apis: paddle.static.io.save_inference_model and paddle.static.io.load_inference_mode, which are campatible with paddle.fluid.io.save_inference_model and paddle.fluid.io.load_inference_model respectively. * add unittest for new save_inference_model and load_inference_model. test=develop * enhance doc. test=develop * add paddle.enable_static() to test_inference_model_io.py. test=develop --- python/paddle/fluid/io.py | 61 +++- .../tests/unittests/rnn/test_rnn_nets.py | 8 +- .../unittests/test_inference_model_io.py | 116 +++++- python/paddle/static/__init__.py | 4 +- python/paddle/static/io.py | 335 ++++++++++++++++++ 5 files changed, 507 insertions(+), 17 deletions(-) create mode 100644 python/paddle/static/io.py diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index bb55aeb70d1f2d..29a6dcb13551a7 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -43,6 +43,8 @@ from .dataloader import * from . import core from .. import compat as cpt +from paddle.utils import deprecated +from paddle.fluid.framework import static_only batch = paddle.batch @@ -82,7 +84,10 @@ def is_parameter(var): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + + paddle.enable_static() param = fluid.default_main_program().global_block().var('fc.w') res = fluid.io.is_parameter(param) """ @@ -103,7 +108,10 @@ def is_persistable(var): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + + paddle.enable_static() param = fluid.default_main_program().global_block().var('fc.b') res = fluid.io.is_persistable(param) """ @@ -137,7 +145,10 @@ def get_program_parameter(program): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + + paddle.enable_static() data = fluid.data(name="img", shape=[64, 784]) w = fluid.layers.create_parameter(shape=[784, 200], dtype='float32', name='fc_w') b = fluid.layers.create_parameter(shape=[200], dtype='float32', name='fc_b') @@ -162,7 +173,10 @@ def get_program_persistable_vars(program): Examples: .. 
code-block:: python + import paddle import paddle.fluid as fluid + + paddle.enable_static() data = fluid.data(name="img", shape=[64, 784]) w = fluid.layers.create_parameter(shape=[784, 200], dtype='float32', name='fc_w') b = fluid.layers.create_parameter(shape=[200], dtype='float32', name='fc_b') @@ -202,7 +216,7 @@ def _load_program_scope(main=None, startup=None, scope=None): yield -def _get_valid_program(main_program): +def _get_valid_program(main_program=None): if main_program is None: main_program = default_main_program() elif isinstance(main_program, CompiledProgram): @@ -268,8 +282,10 @@ def save_vars(executor, Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() main_prog = fluid.Program() startup_prog = fluid.Program() with fluid.program_guard(main_prog, startup_prog): @@ -417,8 +433,11 @@ def save_params(executor, dirname, main_program=None, filename=None): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + + paddle.enable_static() params_path = "./my_paddle_model" image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32') label = fluid.data(name='label', shape=[None, 1], dtype='int64') @@ -465,7 +484,10 @@ def _save_distributed_persistables(executor, dirname, main_program): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + + paddle.enable_static() exe = fluid.Executor(fluid.CPUPlace()) param_path = "./my_paddle_model" t = distribute_transpiler.DistributeTranspiler() @@ -634,8 +656,10 @@ def save_persistables(executor, dirname, main_program=None, filename=None): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() dir_path = "./my_paddle_model" file_name = "persistables" image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32') @@ -711,8 +735,10 @@ def load_vars(executor, Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() main_prog = fluid.Program() startup_prog = fluid.Program() with fluid.program_guard(main_prog, startup_prog): @@ -946,8 +972,10 @@ def load_params(executor, dirname, main_program=None, filename=None): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() exe = fluid.Executor(fluid.CPUPlace()) param_path = "./my_paddle_model" prog = fluid.default_main_program() @@ -995,8 +1023,10 @@ def load_persistables(executor, dirname, main_program=None, filename=None): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() exe = fluid.Executor(fluid.CPUPlace()) param_path = "./my_paddle_model" prog = fluid.default_main_program() @@ -1034,7 +1064,10 @@ def _load_distributed_persistables(executor, dirname, main_program=None): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + + paddle.enable_static() exe = fluid.Executor(fluid.CPUPlace()) param_path = "./my_paddle_model" t = distribute_transpiler.DistributeTranspiler() @@ -1160,7 +1193,8 @@ def append_fetch_ops(inference_program, attrs={'col': i}) -@dygraph_not_support +@static_only +@deprecated(since="2.0.0", update_to="paddle.static.save_inference_model") def save_inference_model(dirname, feeded_var_names, target_vars, @@ -1226,8 +1260,10 @@ def save_inference_model(dirname, Examples: .. 
code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() path = "./infer_model" # User defined network, here a softmax regession example @@ -1370,7 +1406,8 @@ def save_inference_model(dirname, return target_var_name_list -@dygraph_not_support +@static_only +@deprecated(since="2.0.0", update_to="paddle.static.load_inference_model") def load_inference_model(dirname, executor, model_filename=None, @@ -1422,9 +1459,11 @@ def load_inference_model(dirname, Examples: .. code-block:: python + import paddle import paddle.fluid as fluid import numpy as np + paddle.enable_static() # Build the model main_prog = fluid.Program() startup_prog = fluid.Program() @@ -1540,7 +1579,10 @@ def get_parameter_value(para, executor): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + + paddle.enable_static() exe = fluid.Executor(fluid.CPUPlace()) param = fluid.default_main_program().global_block().var('fc.w') p = fluid.io.get_parameter_value(param, exe) @@ -1578,7 +1620,10 @@ def get_parameter_value_by_name(name, executor, program=None): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + + paddle.enable_static() exe = fluid.Executor(fluid.CPUPlace()) p = fluid.io.get_parameter_value('fc.w', exe) """ @@ -1686,8 +1731,10 @@ def save(program, model_path): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() prog = fluid.default_main_program() fluid.save( prog, "./temp") @@ -1753,8 +1800,10 @@ def load(program, model_path, executor=None, var_list=None): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() prog = fluid.default_main_program() fluid.save( prog, "./temp") @@ -1914,7 +1963,10 @@ def load_program_state(model_path, var_list=None): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + + paddle.enable_static() x = fluid.data( name="x", shape=[10, 10], dtype='float32') y = fluid.layers.fc( x, 10) z = fluid.layers.fc( y, 10) @@ -2047,7 +2099,10 @@ def set_program_state(program, state_dict): Examples: .. 
code-block:: python + import paddle import paddle.fluid as fluid + + paddle.enable_static() x = fluid.data( name="x", shape=[10, 10], dtype='float32') y = fluid.layers.fc( x, 10) z = fluid.layers.fc( y, 10) diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py index 87bdee8a91d21b..639605a64ed289 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py @@ -323,10 +323,7 @@ def forward(self, input): exe = paddle.static.Executor(place) [inference_program, feed_target_names, fetch_targets] = paddle.static.load_inference_model( - dirname="./inference", - executor=exe, - model_filename="%s_infer.pdmodel" % mode, - params_filename="%s_infer.pdiparams" % mode) + "./inference/%s_infer" % mode, exe) results = exe.run(inference_program, feed={feed_target_names[0]: x.numpy()}, fetch_list=fetch_targets) @@ -345,3 +342,6 @@ def load_tests(loader, tests, pattern): for test_class in [TestSimpleRNN, TestLSTM, TestGRU]: suite.addTest(test_class(time_major, direction, device)) return suite + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py index aa408aedf66e16..a82bc3f0f62028 100644 --- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py +++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py @@ -23,6 +23,7 @@ import paddle.fluid as fluid import warnings +import paddle import paddle.fluid.executor as executor import paddle.fluid.layers as layers import paddle.fluid.optimizer as optimizer @@ -30,15 +31,17 @@ from paddle.fluid.framework import Program, program_guard from paddle.fluid.io import save_inference_model, load_inference_model, save_persistables from paddle.fluid.transpiler import memory_optimize +paddle.enable_static() -class TestBook(unittest.TestCase): - class InferModel(object): - def __init__(self, list): - self.program = list[0] - self.feed_var_names = list[1] - self.fetch_vars = list[2] +class InferModel(object): + def __init__(self, list): + self.program = list[0] + self.feed_var_names = list[1] + self.fetch_vars = list[2] + +class TestBook(unittest.TestCase): def test_fit_line_inference_model(self): MODEL_DIR = "./tmp/inference_model" UNI_MODEL_DIR = "./tmp/inference_model1" @@ -88,10 +91,10 @@ def test_fit_line_inference_model(self): six.moves.reload_module(executor) # reload to build a new scope - model_0 = self.InferModel(load_inference_model(MODEL_DIR, exe)) + model_0 = InferModel(load_inference_model(MODEL_DIR, exe)) with open(os.path.join(UNI_MODEL_DIR, 'model'), "rb") as f: model_str = f.read() - model_1 = self.InferModel( + model_1 = InferModel( load_inference_model(None, exe, model_str, params_str)) for model in [model_0, model_1]: @@ -192,6 +195,103 @@ def test_save_inference_model(self): [MODEL_DIR, ["x", "y"], [avg_cost], [], cp_prog]) +class TestSaveInferenceModelNew(unittest.TestCase): + def test_save_and_load_inference_model(self): + MODEL_DIR = "./tmp/inference_model5" + init_program = fluid.default_startup_program() + program = fluid.default_main_program() + + # fake program without feed/fetch + with program_guard(program, init_program): + x = layers.data(name='x', shape=[2], dtype='float32') + y = layers.data(name='y', shape=[1], dtype='float32') + + y_predict = layers.fc(input=x, size=1, act=None) + + cost = layers.square_error_cost(input=y_predict, label=y) + 
avg_cost = layers.mean(cost) + + sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost, init_program) + + place = core.CPUPlace() + exe = executor.Executor(place) + exe.run(init_program, feed={}, fetch_list=[]) + + tensor_x = np.array([[1, 1], [1, 2], [5, 2]]).astype("float32") + tensor_y = np.array([[-2], [-3], [-7]]).astype("float32") + for i in six.moves.xrange(3): + exe.run(program, + feed={'x': tensor_x, + 'y': tensor_y}, + fetch_list=[avg_cost]) + + self.assertRaises(ValueError, paddle.static.save_inference_model, + None, ['x', 'y'], [avg_cost], exe) + self.assertRaises(ValueError, paddle.static.save_inference_model, + MODEL_DIR + "/", [x, y], [avg_cost], exe) + self.assertRaises(ValueError, paddle.static.save_inference_model, + MODEL_DIR, ['x', 'y'], [avg_cost], exe) + self.assertRaises(ValueError, paddle.static.save_inference_model, + MODEL_DIR, 'x', [avg_cost], exe) + self.assertRaises(ValueError, paddle.static.save_inference_model, + MODEL_DIR, [x, y], ['avg_cost'], exe) + self.assertRaises(ValueError, paddle.static.save_inference_model, + MODEL_DIR, [x, y], 'avg_cost', exe) + + model_path = MODEL_DIR + "_isdir.pdmodel" + os.makedirs(model_path) + self.assertRaises(ValueError, paddle.static.save_inference_model, + MODEL_DIR + "_isdir", [x, y], [avg_cost], exe) + os.rmdir(model_path) + + params_path = MODEL_DIR + "_isdir.pdmodel" + os.makedirs(params_path) + self.assertRaises(ValueError, paddle.static.save_inference_model, + MODEL_DIR + "_isdir", [x, y], [avg_cost], exe) + os.rmdir(params_path) + + paddle.static.io.save_inference_model(MODEL_DIR, [x, y], [avg_cost], exe) + + self.assertTrue(os.path.exists(MODEL_DIR + ".pdmodel")) + self.assertTrue(os.path.exists(MODEL_DIR + ".pdiparams")) + + expected = exe.run(program, + feed={'x': tensor_x, + 'y': tensor_y}, + fetch_list=[avg_cost])[0] + + six.moves.reload_module(executor) # reload to build a new scope + + self.assertRaises(ValueError, paddle.static.load_inference_model, + None, exe) + self.assertRaises(ValueError, paddle.static.load_inference_model, + MODEL_DIR + "/", exe) + self.assertRaises(ValueError, paddle.static.load_inference_model, + [MODEL_DIR], exe) + self.assertRaises(ValueError, paddle.static.load_inference_model, + MODEL_DIR, exe, pserver_endpoints=None) + self.assertRaises(ValueError, paddle.static.load_inference_model, + MODEL_DIR, exe, unsupported_param=None) + self.assertRaises((TypeError, ValueError), paddle.static.load_inference_model, + None, exe, model_filename="illegal", params_filename="illegal") + + model = InferModel(paddle.static.io.load_inference_model(MODEL_DIR, exe)) + + outs = exe.run(model.program, + feed={ + model.feed_var_names[0]: tensor_x, + model.feed_var_names[1]: tensor_y + }, + fetch_list=model.fetch_vars) + actual = outs[0] + + self.assertEqual(model.feed_var_names, ["x", "y"]) + self.assertEqual(len(model.fetch_vars), 1) + self.assertEqual(expected, actual) + + + class TestLoadInferenceModelError(unittest.TestCase): def test_load_model_not_exist(self): place = core.CPUPlace() diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index a6ce4379824f07..bca045852fd06e 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -23,6 +23,8 @@ ] from . 
import nn +from .io import save_inference_model +from .io import load_inference_model from ..fluid import Scope #DEFINE_ALIAS from .input import data #DEFINE_ALIAS from .input import InputSpec #DEFINE_ALIAS @@ -48,8 +50,6 @@ from ..fluid.param_attr import WeightNormParamAttr #DEFINE_ALIAS from ..fluid.io import save #DEFINE_ALIAS from ..fluid.io import load #DEFINE_ALIAS -from ..fluid.io import save_inference_model #DEFINE_ALIAS -from ..fluid.io import load_inference_model #DEFINE_ALIAS from ..fluid.io import load_program_state #DEFINE_ALIAS from ..fluid.io import set_program_state #DEFINE_ALIAS from ..fluid.layers import create_parameter #DEFINE_ALIAS diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py new file mode 100644 index 00000000000000..b30dfa8429fd97 --- /dev/null +++ b/python/paddle/static/io.py @@ -0,0 +1,335 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + + +import errno +import inspect +import logging +import os +import six + +import paddle +from paddle.fluid import core, Variable, CompiledProgram, program_guard, default_main_program, Program +from paddle.fluid.framework import static_only +from paddle.fluid import layers + +from paddle.fluid.io import _get_valid_program, save_vars, _save_distributed_persistables +from paddle.fluid.io import prepend_feed_ops, append_fetch_ops, save_persistables +from paddle.fluid.io import load_persistables, _endpoints_replacement +from paddle.fluid.log_helper import get_logger + + +__all__ = [ + 'save_inference_model', + 'load_inference_model', +] + +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') + + +def _check_args(caller, args, supported_args=[], deprecated_args=[]): + for arg in args: + if arg in deprecated_args: + raise ValueError("argument '{}' in function '{}' is deprecated, only {} are supported.".format(arg, caller, supported_args)) + elif arg not in supported_args: + raise ValueError( + "function '{}' doesn't support argument '{}',\n only {} are supported.".format(caller, arg, supported_args)) + + +@static_only +def save_inference_model(path_prefix, feed_vars, fetch_vars, executor): + """ + :api_attr: Static Graph + + Save current model and its parameters to given path. i.e. + Given path_prefix = "/path/to/modelname", after invoking + save_inference_model(path_prefix, feed_vars, fetch_vars, executor), + you will find two files named modelname.pdmodel and modelname.pdiparams + under "/path/to", which represent your model and parameters respectively. + + Args: + path_prefix(str): Directory path to save model + model name without suffix. + feed_vars(Variable | list[Variable]): Variables needed by inference. + fetch_vars(Variable | list[Variable]): Variables returned by inference. + executor(Executor): The executor that saves the inference model. You can refer + to :ref:`api_guide_executor_en` for more details. 
+ Returns: + None + + Raises: + ValueError: If `feed_vars` is not a Variable or a list of Variable, an exception is thrown. + ValueError: If `fetch_vars` is not a Variable or a list of Variable, an exception is thrown. + + Examples: + .. code-block:: python + + import paddle + import paddle.fluid as fluid + + paddle.enable_static() + + path_prefix = "./infer_model" + + # User defined network, here a softmax regession example + image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + feeder = fluid.DataFeeder(feed_list=[image, label], place=fluid.CPUPlace()) + predict = fluid.layers.fc(input=image, size=10, act='softmax') + + loss = fluid.layers.cross_entropy(input=predict, label=label) + avg_loss = fluid.layers.mean(loss) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + + # Feed data and train process + + # Save inference model. Note we don't save label and loss in this example + paddle.static.io.save_inference_model(path_prefix, [image], [predict], exe) + + # In this example, the save_inference_mode inference will prune the default + # main program according to the network's input node (img) and output node(predict). + # The pruned inference program is going to be saved in file "./infer_model.pdmodel" + # and parameters are going to be saved in file "./infer_model.pdiparams". + + """ + # check path_prefix, set model_path and params_path + if not isinstance(path_prefix, six.string_types): + raise ValueError("'path_prefix' should be a string.") + if path_prefix.endswith("/"): + raise ValueError("'path_prefix' should not be a directory") + path_prefix = os.path.normpath(path_prefix) + path_prefix = os.path.abspath(path_prefix) + try: + # mkdir may conflict if pserver and trainer are running on the same machine + dirname = os.path.dirname(path_prefix) + os.makedirs(dirname) + except OSError as e: + if e.errno != errno.EEXIST: + raise + model_path = path_prefix + ".pdmodel" + params_path = path_prefix + ".pdiparams" + if os.path.isdir(model_path): + raise ValueError("'{}' is an existing directory.".format(model_path)) + if os.path.isdir(params_path): + raise ValueError("'{}' is an existing directory.".format(params_path)) + + # verify feed_vars + if not isinstance(feed_vars, list): + feed_vars = [feed_vars] + if not feed_vars or not all([isinstance(var, Variable) for var in feed_vars]): + raise ValueError("'feed_vars' should be a Variable or a list of Variable.") + + # verify fetch_vars + if not isinstance(fetch_vars, list): + fetch_vars = [fetch_vars] + if not fetch_vars or not all([isinstance(var, Variable) for var in fetch_vars]): + raise ValueError("'fetch_vars' should be a Variable or a list of Variable.") + + main_program = _get_valid_program() + # remind users to set auc_states to 0 if auc op were found. + for op in main_program.global_block().ops: + # clear device of Op + device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() + op._set_attr(device_attr_name, "") + if op.type == 'auc': + warnings.warn("Be sure that you have set auc states to 0 before saving inference model.") + break + + # fix the bug that the activation op's output as target will be pruned. + # will affect the inference performance. + # TODO(Superjomn) add an IR pass to remove 1-scale op. 
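+    # Each fetch var is aliased through a scale(x, 1.0) op named
+    # "save_infer_model/scale_{i}", so that the pruning pass below keeps these
+    # outputs as targets even when a fetched var is the direct output of an
+    # activation op.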
+ with program_guard(main_program): + uniq_fetch_vars = [] + for i, var in enumerate(fetch_vars): + var = layers.scale(var, 1., name="save_infer_model/scale_{}".format(i)) + uniq_fetch_vars.append(var) + fetch_vars = uniq_fetch_vars + + # save model + origin_program = main_program.clone() + main_program = main_program.clone() + global_block = main_program.global_block() + remove_op_idx = [] + for i, op in enumerate(global_block.ops): + op.desc.set_is_target(False) + if op.type == "feed" or op.type == "fetch": + remove_op_idx.append(i) + for idx in remove_op_idx[::-1]: + global_block._remove_op(idx) + main_program.desc.flush() + + feed_var_names = [var.name for var in feed_vars] + main_program = main_program._prune_with_input( + feeded_var_names=feed_var_names, targets=fetch_vars) + main_program = main_program._inference_optimize(prune_read_op=True) + fetch_var_names = [var.name for var in fetch_vars] + prepend_feed_ops(main_program, feed_var_names) + append_fetch_ops(main_program, fetch_var_names) + main_program.desc._set_version() + paddle.fluid.core.save_op_version_info(main_program.desc) + with open(model_path, "wb") as f: + f.write(main_program.desc.serialize_to_string()) + main_program._copy_dist_param_info_from(origin_program) + + # save params + dirname = os.path.dirname(params_path) + basename = os.path.basename(params_path) + save_persistables(executor, dirname, main_program, basename) + + +@static_only +def load_inference_model(path_prefix, executor, **configs): + """ + :api_attr: Static Graph + + Load inference model from a given path. By this API, you can get the model + structure(Inference Program) and model parameters. + + Args: + path_prefix(str | None): One of the following: + - Directory path to save model + model name without suffix. + - Set to None when reading the model from memory. + executor(Executor): The executor to run for loading inference model. + See :ref:`api_guide_executor_en` for more details about it. + + Returns: + list: The return of this API is a list with three elements: + (program, feed_target_names, fetch_targets). The `program` is a + ``Program`` (refer to :ref:`api_guide_Program_en`), which is used for inference. + The `feed_target_names` is a list of ``str``, which contains names of variables + that need to feed data in the inference program. The `fetch_targets` is a list of + ``Variable`` (refer to :ref:`api_guide_Program_en`). It contains variables from which + we can get inference results. + + Raises: + ValueError: If `path_prefix.pdmodel` or `path_prefix.pdiparams` doesn't exist. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + import numpy as np + + paddle.enable_static() + + # Build the model + startup_prog = fluid.default_startup_program() + main_prog = fluid.default_main_program() + with fluid.program_guard(main_prog, startup_prog): + image = fluid.layers.data(name="img", shape=[64, 784], append_batch_size=False) + w = fluid.layers.create_parameter(shape=[784, 200], dtype='float32') + b = fluid.layers.create_parameter(shape=[200], dtype='float32') + hidden_w = fluid.layers.matmul(x=image, y=w) + hidden_b = fluid.layers.elementwise_add(hidden_w, b) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + + # Save the inference model + path_prefix = "./infer_model" + paddle.static.io.save_inference_model(path_prefix, [image], [hidden_b], exe) + + [inference_program, feed_target_names, fetch_targets] = ( + paddle.static.io.load_inference_model(path_prefix, exe)) + tensor_img = np.array(np.random.random((1, 64, 784)), dtype=np.float32) + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + + # In this example, the inference program was saved in file + # "./infer_model.pdmodel" and parameters were saved in file + # " ./infer_model.pdiparams". + # By the inference program, feed_target_names and + # fetch_targets, we can use an executor to run the inference + # program to get the inference result. + """ + # check configs + supported_args = ('model_filename', 'params_filename') + deprecated_args = ('pserver_endpoints',) + caller = inspect.currentframe().f_code.co_name + _check_args(caller, configs, supported_args, deprecated_args) + + # load from memory + if path_prefix is None: + _logger.warning("Load inference model from memory is deprecated.") + model_filename = configs.get('model_filename', None) + params_filename = configs.get('params_filename', None) + if params_filename is None: + raise ValueError( + "params_filename cannot be None when path_prefix is None." + ) + load_dirname = path_prefix + program_desc_str = model_filename + params_filename = params_filename + # load from file + else: + # check and norm path_prefix + if not isinstance(path_prefix, six.string_types): + raise ValueError("'path_prefix' should be a string.") + if path_prefix.endswith("/"): + raise ValueError("'path_prefix' should not be a directory") + path_prefix = os.path.normpath(path_prefix) + path_prefix = os.path.abspath(path_prefix) + + # set model_path and params_path in new way, + # path_prefix represents a file path without suffix in this case. + if not configs: + model_path = path_prefix + ".pdmodel" + params_path = path_prefix + ".pdiparams" + # set model_path and params_path in old way for compatible, + # path_prefix represents a directory path. + else: + model_filename = configs.get('model_filename', None) + params_filename = configs.get('params_filename', None) + # set model_path + if model_filename is None: + model_path = os.path.join(path_prefix, "__model__") + else: + model_path = os.path.join(path_prefix, model_filename + ".pdmodel") + if not os.path.exists(model_path): + model_path = os.path.join(path_prefix, model_filename) + # set params_path + if params_filename is None: + params_path = os.path.join(path_prefix, "") + else: + params_path = os.path.join(path_prefix, params_filename + ".pdiparams") + if not os.path.exists(params_path): + params_path = os.path.join(path_prefix, params_filename) + _logger.warning("The old way to load inference model is deprecated." 
+ " model path: {}, params path: {}".format(model_path, params_path)) + with open(model_path, "rb") as f: + program_desc_str = f.read() + load_dirname = os.path.dirname(params_path) + params_filename = os.path.basename(params_path) + + program = Program.parse_from_string(program_desc_str) + if not core._is_program_version_supported(program._version()): + raise ValueError("Unsupported program version: %d\n" % + program._version()) + # Binary data also need versioning. + load_persistables(executor, load_dirname, program, params_filename) + + feed_target_names = program.desc.get_feed_target_names() + fetch_target_names = program.desc.get_fetch_target_names() + fetch_targets = [ + program.global_block().var(name) for name in fetch_target_names + ] + + return [program, feed_target_names, fetch_targets] + From 532e4bbf2a62e98fadce4eda6b9f07f235b40399 Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Wed, 18 Nov 2020 15:04:14 +0800 Subject: [PATCH 48/56] fix docs (#28683) --- python/paddle/nn/functional/conv.py | 3 +-- python/paddle/nn/functional/loss.py | 2 -- python/paddle/nn/layer/loss.py | 2 -- python/paddle/utils/download.py | 2 +- python/paddle/vision/datasets/folder.py | 2 +- python/paddle/vision/transforms/functional.py | 10 +++------- 6 files changed, 6 insertions(+), 15 deletions(-) diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 6df1ce368c1b0b..1b0441b0a8cca4 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -211,7 +211,7 @@ def conv1d(x, [[0, 3, 4], [2, 9, 7], [5, 6, 8]]]).astype(np.float32) - paddle.disable_static() + x_var = paddle.to_tensor(x) w_var = paddle.to_tensor(w) y_var = F.conv1d(x_var, w_var) @@ -673,7 +673,6 @@ def conv1d_transpose(x, import paddle.nn.functional as F import numpy as np - paddle.disable_static() # shape: (1, 2, 4) x=np.array([[[4, 0, 9, 7], [8, 0, 9, 2,]]]).astype(np.float32) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index ae04cdcc931eca..fa0789b762041f 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -895,8 +895,6 @@ def kl_div(input, label, reduction='mean', name=None): import numpy as np import paddle.nn.functional as F - paddle.disable_static() - shape = (5, 20) input = np.random.uniform(-10, 10, shape).astype('float32') target = np.random.uniform(-10, 10, shape).astype('float32') diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index fdeed0ae49dfd6..b16dcae7b63292 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -773,8 +773,6 @@ class KLDivLoss(fluid.dygraph.Layer): import numpy as np import paddle.nn as nn - paddle.disable_static() - shape = (5, 20) x = np.random.uniform(-10, 10, shape).astype('float32') target = np.random.uniform(-10, 10, shape).astype('float32') diff --git a/python/paddle/utils/download.py b/python/paddle/utils/download.py index d8c0a2fc8c2845..7ba208574353fa 100644 --- a/python/paddle/utils/download.py +++ b/python/paddle/utils/download.py @@ -123,7 +123,7 @@ def get_weights_path_from_url(url, md5sum=None): Examples: .. 
code-block:: python - from paddle.incubate.hapi.download import get_weights_path_from_url + from paddle.utils.download import get_weights_path_from_url resnet18_pretrained_weight_url = 'https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams' local_weight_path = get_weights_path_from_url(resnet18_pretrained_weight_url) diff --git a/python/paddle/vision/datasets/folder.py b/python/paddle/vision/datasets/folder.py index d005bc4f19ebb8..8b17da9c9236bb 100644 --- a/python/paddle/vision/datasets/folder.py +++ b/python/paddle/vision/datasets/folder.py @@ -306,7 +306,7 @@ def __getitem__(self, index): index (int): Index Returns: - tuple: (sample, target) where target is class_index of the target class. + sample of specific index. """ path = self.samples[index] sample = self.loader(path) diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py index 7391ae322e3598..67dff85f57014b 100644 --- a/python/paddle/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -39,7 +39,7 @@ __all__ = [ 'to_tensor', 'hflip', 'vflip', 'resize', 'pad', 'rotate', 'to_grayscale', 'crop', 'center_crop', 'adjust_brightness', 'adjust_contrast', 'adjust_hue', - 'to_grayscale', 'normalize' + 'normalize' ] @@ -283,13 +283,11 @@ def center_crop(img, output_size): return F_cv2.center_crop(img, output_size) -def hflip(img, backend='pil'): +def hflip(img): """Horizontally flips the given Image or np.array. Args: img (PIL.Image|np.array): Image to be flipped. - backend (str, optional): The image proccess backend type. Options are `pil`, - `cv2`. Default: 'pil'. Returns: PIL.Image or np.array: Horizontall flipped image. @@ -576,8 +574,6 @@ def to_grayscale(img, num_output_channels=1): Args: img (PIL.Image|np.array): Image to be converted to grayscale. - backend (str, optional): The image proccess backend type. Options are `pil`, - `cv2`. Default: 'pil'. Returns: PIL.Image or np.array: Grayscale version of the image. @@ -624,7 +620,7 @@ def normalize(img, mean, std, data_format='CHW', to_rgb=False): this option will be igored. Default: False. Returns: - Tensor: Normalized mage. Data format is same as input img. + np.ndarray or Tensor: Normalized mage. Data format is same as input img. Examples: .. code-block:: python From 01a14e1be209b3300be1f36a27152cfd429533a4 Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Wed, 18 Nov 2020 15:26:50 +0800 Subject: [PATCH 49/56] Add with_pool args for vgg (#28684) * add arg for vgg --- python/paddle/vision/models/resnet.py | 2 +- python/paddle/vision/models/vgg.py | 42 +++++++++++++++++---------- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/python/paddle/vision/models/resnet.py b/python/paddle/vision/models/resnet.py index 8cf797f1719e99..1f44e0bc6dfeb1 100644 --- a/python/paddle/vision/models/resnet.py +++ b/python/paddle/vision/models/resnet.py @@ -245,7 +245,7 @@ def forward(self, x): x = self.layer3(x) x = self.layer4(x) - if self.with_pool > 0: + if self.with_pool: x = self.avgpool(x) if self.num_classes > 0: diff --git a/python/paddle/vision/models/vgg.py b/python/paddle/vision/models/vgg.py index 00f6cccbdfe9f1..f6b4c75e84f013 100644 --- a/python/paddle/vision/models/vgg.py +++ b/python/paddle/vision/models/vgg.py @@ -36,9 +36,10 @@ class VGG(nn.Layer): `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_ Args: - features (nn.Layer): vgg features create by function make_layers. 
- num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer + features (nn.Layer): Vgg features create by function make_layers. + num_classes (int): Output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 1000. + with_pool (bool): Use pool before the last three fc layer or not. Default: True. Examples: .. code-block:: python @@ -54,24 +55,35 @@ class VGG(nn.Layer): """ - def __init__(self, features, num_classes=1000): + def __init__(self, features, num_classes=1000, with_pool=True): super(VGG, self).__init__() self.features = features - self.avgpool = nn.AdaptiveAvgPool2D((7, 7)) - self.classifier = nn.Sequential( - nn.Linear(512 * 7 * 7, 4096), - nn.ReLU(), - nn.Dropout(), - nn.Linear(4096, 4096), - nn.ReLU(), - nn.Dropout(), - nn.Linear(4096, num_classes), ) + self.num_classes = num_classes + self.with_pool = with_pool + + if with_pool: + self.avgpool = nn.AdaptiveAvgPool2D((7, 7)) + + if num_classes > 0: + self.classifier = nn.Sequential( + nn.Linear(512 * 7 * 7, 4096), + nn.ReLU(), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(), + nn.Dropout(), + nn.Linear(4096, num_classes), ) def forward(self, x): x = self.features(x) - x = self.avgpool(x) - x = paddle.flatten(x, 1) - x = self.classifier(x) + + if self.with_pool: + x = self.avgpool(x) + + if self.num_classes > 0: + x = paddle.flatten(x, 1) + x = self.classifier(x) + return x From 628fb29c1b5c3c791f39e6bd906b28fbce61a6dd Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Wed, 18 Nov 2020 16:44:10 +0800 Subject: [PATCH 50/56] modified the sys adress of quickly disable file (#28660) --- tools/get_quick_disable_lt.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/get_quick_disable_lt.py b/tools/get_quick_disable_lt.py index 9b41f5e78085e5..1e3d7178922728 100644 --- a/tools/get_quick_disable_lt.py +++ b/tools/get_quick_disable_lt.py @@ -20,7 +20,11 @@ def download_file(): """Get disabled unit tests""" ssl._create_default_https_context = ssl._create_unverified_context - url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut') + sysstr=sys.platform + if sysstr == 'win32': + url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_win') + else: + url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut') f = requests.get(url) data = f.text status_code = f.status_code From e880c90c5a091e4a331b09b43987142b02f61ac1 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Wed, 18 Nov 2020 16:44:31 +0800 Subject: [PATCH 51/56] fix error when setting ut timeout value (#28696) --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6e78f7d90149e2..e527ba613ba554 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -621,11 +621,14 @@ endif() if (WITH_DISTRIBUTE) set_tests_properties(test_communicator_half_async PROPERTIES TIMEOUT 120) endif() + if (WITH_DISTRIBUTE AND NOT APPLE) + if(WITH_GPU) + set_tests_properties(test_c_comm_init_op PROPERTIES TIMEOUT 120) + set_tests_properties(test_fleet_checkpoint PROPERTIES TIMEOUT 120) + set_tests_properties(test_launch PROPERTIES TIMEOUT 120) + endif() set_tests_properties(test_fleet_launch PROPERTIES TIMEOUT 120) - set_tests_properties(test_c_comm_init_op PROPERTIES TIMEOUT 120) - 
set_tests_properties(test_fleet_checkpoint PROPERTIES TIMEOUT 120) - set_tests_properties(test_launch PROPERTIES TIMEOUT 120) endif() # setting timeout value as 15S From 8c75b2554aeb591d5696083258ec3b89edcf5b8b Mon Sep 17 00:00:00 2001 From: xiaoting <31891223+tink2123@users.noreply.github.com> Date: Wed, 18 Nov 2020 18:59:33 +0800 Subject: [PATCH 52/56] Support Tensor for attr_scale and attr_size (#28677) * update interpolate, test=develop * fix coverage, test=develop --- .../unittests/test_bilinear_interp_v2_op.py | 63 +++++++++++++++++++ python/paddle/nn/functional/common.py | 18 ++++-- 2 files changed, 77 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py index 58312979c523bd..2ff32b2f95bb44 100755 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py @@ -623,5 +623,68 @@ def test_case(self): self.assertTrue(np.allclose(out.numpy(), expect_res)) +class TestBilinearInterpOpAPI_dy2(unittest.TestCase): + def test_case(self): + import paddle + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + with fluid.dygraph.guard(place): + input_data = np.random.random((2, 3, 6, 6)).astype("float32") + size_np = np.array([12, 12]).astype("int64") + input_x = paddle.to_tensor(input_data) + size = paddle.to_tensor(size_np) + expect_res = bilinear_interp_np( + input_data, out_h=12, out_w=12, align_corners=False) + out = interpolate( + x=input_x, size=size, mode="bilinear", align_corners=False) + self.assertTrue(np.allclose(out.numpy(), expect_res)) + + +class TestBilinearInterpOpAPI_dy3(unittest.TestCase): + def test_case(self): + import paddle + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + with fluid.dygraph.guard(place): + input_data = np.random.random((2, 3, 6, 6)).astype("float32") + size_1 = np.array([12]).astype("int64") + input_x = paddle.to_tensor(input_data) + size = paddle.to_tensor(size_1) + expect_res = bilinear_interp_np( + input_data, out_h=12, out_w=12, align_corners=False) + out = interpolate( + x=input_x, + size=[size, size], + mode="bilinear", + align_corners=False) + self.assertTrue(np.allclose(out.numpy(), expect_res)) + + +class TestBilinearInterpOpAPI_dy4(unittest.TestCase): + def test_case(self): + import paddle + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + with fluid.dygraph.guard(place): + input_data = np.random.random((2, 3, 6, 6)).astype("float32") + scale_np = np.array([2, 2]).astype("int64") + input_x = paddle.to_tensor(input_data) + scale = paddle.to_tensor(scale_np) + expect_res = bilinear_interp_np( + input_data, out_h=12, out_w=12, align_corners=False) + out = interpolate( + x=input_x, + scale_factor=scale, + mode="bilinear", + align_corners=False) + self.assertTrue(np.allclose(out.numpy(), expect_res)) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 5c5e3f37916da1..e4f145cf4234fc 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -366,10 +366,18 @@ def _is_list_or_turple_(data): if out_shape is not None and scale is not None: raise ValueError("Only one of size or scale_factor should be defined.") if out_shape is not None: - if isinstance(out_shape, Variable): + + if 
isinstance(out_shape, Variable) and not in_dygraph_mode(): out_shape.stop_gradient = True inputs['OutSize'] = out_shape + else: + if in_dygraph_mode(): + if isinstance(out_shape, Variable): + out_shape = list(out_shape.numpy()) + for i, dim in enumerate(out_shape): + if isinstance(dim, Variable): + out_shape[i] = dim.numpy()[0] if not (_is_list_or_turple_(out_shape)): raise TypeError("size should be a list or tuple or Variable.") # Validate the shape @@ -435,6 +443,8 @@ def _is_list_or_turple_(data): attrs['out_w'] = out_shape[2] else: + if in_dygraph_mode() and isinstance(scale, Variable): + scale = list(scale.numpy()) if isinstance(scale, Variable): scale.stop_gradient = True inputs["Scale"] = scale @@ -1240,7 +1250,7 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): y = F.pad(x, [2, 3], value=1, mode='constant', data_format="NCL") print(y) # [[[1. 1. 1. 2. 3. 1. 1. 1.]]] - + # example 2 x_shape = (1, 1, 2, 3) x = paddle.arange(np.prod(x_shape), dtype="float32").reshape(x_shape) + 1 @@ -1364,7 +1374,7 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8): Examples: .. code-block:: text - + Case 0: x1 = [[0.8024077 0.9927354 0.27238318 0.8344984 ] [0.48949873 0.5797396 0.65444374 0.66510963] @@ -1380,7 +1390,7 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8): Code Examples: .. code-block:: python - + import paddle import paddle.nn as nn import numpy as np From 5a9f6889c19eeea06733632b676d233691b2cff1 Mon Sep 17 00:00:00 2001 From: JZ-LIANG <38102074+JZ-LIANG@users.noreply.github.com> Date: Wed, 18 Nov 2020 19:38:02 +0800 Subject: [PATCH 53/56] [Sharding] add new features (#28568) * add lars to fleet meta optimizer * add lamb to proto * add lamb to fleet meta optimizer * fixed syntax bug * fixed syntax bug * fixed syntax error in lamb, add config setter of lamb in distributed_strategy * trigger unitest to rerun * add new unitest func for lamb * revise unitest for lars and lamb * revise dgc meta unitest * revise lars document in distribute_strategy * revise lars lamb document in distributed_strategy.py * revise lars lamb document in distributed_strategy.py * add weight decay exclude logic to lars * restore optimzier.py * restore optimizer.py as develop except lars * add epsilon and exclude fn to distributed_sttrategy * add lars epsilon * revise unitest for fleet lars and lamb * revise lars lamb unitest for CI coverage * revise lars argument api * revise lars argument api * revise lars argument api * revise api doc of lars * fix op role * add sharding save and add_sync_comm_for_test function * add comm_analyse to utlis * revise sharding_utils * add sharding saving unittest * revise sharding utils for unittest --- .../fleet/meta_optimizers/sharding/utils.py | 139 +++++++++++++++++- .../tests/unittests/dist_sharding_save.py | 90 ++++++++++++ .../unittests/test_dist_sharding_save.py | 79 ++++++++++ .../test_fleet_sharding_meta_optimizer.py | 22 +++ 4 files changed, 324 insertions(+), 6 deletions(-) mode change 100644 => 100755 python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py create mode 100755 python/paddle/fluid/tests/unittests/dist_sharding_save.py create mode 100755 python/paddle/fluid/tests/unittests/test_dist_sharding_save.py mode change 100644 => 100755 python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py old mode 100644 new mode 100755 index 51435ebb9e5e95..2aa4bdd68c9907 --- 
a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -11,13 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import paddle from paddle.fluid import core from functools import reduce from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY import re +import os def check_broadcast(block): @@ -126,11 +127,25 @@ def check_allreduce_sum(block): return +def get_valid_op_role(block, insert_idx): + """ + return OpRole.Forward or OpRole.Backward + """ + op_role = block.ops[insert_idx].attr('op_role') + if (insert_idx >= len(block.ops)) or ( + op_role in [int(OpRole.Backward), int(OpRole.Optimize)]): + return OpRole.Backward + if op_role in [int(OpRole.Forward), int(OpRole.Loss)]: + return OpRole.Forward + + return get_valid_op_role(block, insert_idx + 1) + + def insert_sync_calc_op(block, insert_idx, calc_dep_vars): """ _insert_sync_calc_op """ - op_role = block.ops[insert_idx].attr('op_role') + op_role = get_valid_op_role(block, insert_idx) block._insert_op_without_sync( insert_idx, type='c_sync_calc_stream', @@ -144,7 +159,7 @@ def insert_sync_comm_ops(block, insert_idx, nrings, comm_dep_vars): """ _insert_sync_comm_ops """ - op_role = block.ops[insert_idx].attr('op_role') + op_role = get_valid_op_role(block, insert_idx) for i in range(nrings): block._insert_op_without_sync( insert_idx, @@ -160,7 +175,7 @@ def insert_fill_constant_ops(block, insert_idx, fill_constant_vars): """ _add_fill_constant_ops """ - op_role = block.ops[insert_idx].attr('op_role') + op_role = get_valid_op_role(block, insert_idx) for broadcast_name in fill_constant_vars: broadcast_var = block.var(broadcast_name) block._insert_op_without_sync( @@ -180,7 +195,7 @@ def insert_cast_ops(block, insert_idx, cast_ops): """ _add_cast_ops """ - op_role = block.ops[insert_idx].attr('op_role') + op_role = get_valid_op_role(block, insert_idx) for fp16_name, fp32_name in cast_ops.items(): block._insert_op_without_sync( insert_idx, @@ -217,7 +232,7 @@ def insert_broadcast_ops(block, insert_idx, nrings, broadcast2root): _add_broadcast_ops """ ring_id = -1 - op_role = block.ops[insert_idx].attr('op_role') + op_role = get_valid_op_role(block, insert_idx) for broadcast_name, root_device in broadcast2root: ring_id = (ring_id + 1) % nrings block._insert_op_without_sync( @@ -272,3 +287,115 @@ def insert_scale_loss_grad_ops(block, scale=1.0): outputs={'Out': loss_grad_var}, attrs={'scale': scale, OP_ROLE_KEY: OpRole.Backward}) + + +def comm_analyse(main_program): + """ + Analyse the parameter size that need to be broadcast/allreduce during sharding training + """ + reduce_vars = {} + broadcast_vars = {} + block = main_program.global_block() + for op in block.ops: + if op.type == "c_broadcast": + var_name = op.desc.input_arg_names()[0] + broadcast_vars[var_name] = get_var_size(block.var(var_name)) + elif op.type == "c_allreduce_sum": + var_name = op.desc.input_arg_names()[0] + reduce_vars[var_name] = get_var_size(block.var(var_name)) + + varsize_count = {} + gap = 1 + + for k, v in broadcast_vars.items(): + print("broadcast: {}: {} KB".format(k, v)) + if (int(v / gap) in varsize_count): + varsize_count[int(v / gap)] += 1 + else: + varsize_count[int(v / gap)] = 1 + + for k, v in reduce_vars.items(): + 
print("allreduce: {}: {} KB".format(k, v)) + if (int(v / gap) in varsize_count): + varsize_count[int(v / gap)] += 1 + else: + varsize_count[int(v / gap)] = 1 + + with open("nccl_size.txt", 'w') as f: + sorted_varsize = sorted(varsize_count.items(), key=lambda x: x[0]) + for varsize, count in sorted_varsize: + print("NCCL size {}~{} KB: {}".format(varsize, varsize + 1, count)) + f.write("NCCL size {}~{} KB: {}\n".format(varsize, varsize + 1, + count)) + + +def add_sync_comm_for_test(program, dist_strategy): + """ + When clone a test prog by clone from the sharding main prog, + part of the sync_comm op maybe be pruned by mistake, this function + add the sync_comm op for the test prog. + + """ + #NOTE (liangjianzhong): only support one comm stream by now, use more than one + # comm streams will cause error. should be revise in future. + + block = program.global_block() + not_sync_vars = set([]) + for op in block.ops: + if op.type in ["c_broadcast", "c_allreduce"]: + for input_name in op.desc.input_arg_names(): + not_sync_vars.add(input_name) + if op.type == "c_sync_comm_stream": + for input_name in op.desc.input_arg_names(): + not_sync_vars.remove(input_name) + if not_sync_vars: + for nccl_id in range(dist_strategy.nccl_comm_num): + block.append_op( + type='c_sync_comm_stream', + inputs={'X': list(not_sync_vars)}, + outputs={'Out': list(not_sync_vars)}, + attrs={ + 'ring_id': nccl_id, + 'op_role': core.op_proto_and_checker_maker.OpRole.Forward + }) + return + + +def sharding_save_persistables(exe, dirname, main_program, filename=None): + """ + When use sharding, part of persistable vars are unique and are partitioned in different ranks, + and part of persistable vars are duplicated and exist in all the ranks with different values. + This function handles the model saving for sharding training. + """ + + def is_opt_vars(var): + # NOTE(liangjianzhong): The checks should be updated when add new compatible optimizer + # now only Momentum and adam are compatible with sharding + checks = [ + "_moment1_0", "_moment2_0", "_beta1_pow_acc_0", "_beta2_pow_acc_0", + "_velocity_0" + ] + for check in checks: + if var.name.endswith(check): + return True + return False + + def is_trainable(var): + return isinstance(var, + paddle.fluid.framework.Parameter) and var.trainable + + def sharding_predicate(var): + return is_trainable(var) or is_opt_vars(var) + + if int(os.environ.get('PADDLE_TRAINER_ID', 0)) == 0: + paddle.fluid.io.save_persistables( + exe, dirname, main_program=main_program, filename=None) + else: + paddle.fluid.io.save_vars( + exe, + dirname, + main_program=main_program, + predicate=sharding_predicate, + filename=None) + + return diff --git a/python/paddle/fluid/tests/unittests/dist_sharding_save.py b/python/paddle/fluid/tests/unittests/dist_sharding_save.py new file mode 100755 index 00000000000000..05578c9e4a57f8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_sharding_save.py @@ -0,0 +1,90 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +import paddle.fluid as fluid +from test_dist_base import TestDistRunnerBase, runtime_main +from dist_mnist import cnn_model +# from paddle.fluid.incubate.fleet.collective import fleet +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker +from paddle.distributed.fleet.meta_optimizers.sharding.utils import sharding_save_persistables + +import os +import six +import sys +import pickle + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + +def runtime_main(): + import paddle.distributed.fleet as fleet + + # model definition + train_prog = paddle.fluid.Program() + startup_prog = paddle.fluid.Program() + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + with fluid.program_guard(train_prog, startup_prog): + with fluid.unique_name.guard(): + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, + size=64, + act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.sharding = True + strategy.sharding_configs = {"fuse_broadcast_MB": 0.2} + + optimizer = paddle.fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + # execution + device_id = int(os.getenv("FLAGS_selected_gpus", "0")) + place = fluid.CUDAPlace(device_id) + exe = fluid.Executor(place) + exe.run(startup_prog) + dirname="./ut_sharding_save_model" + sharding_save_persistables(exe, dirname, main_program=train_prog, filename=None) + + out_losses=[] + if six.PY2: + print(pickle.dumps(out_losses)) + else: + sys.stdout.buffer.write(pickle.dumps(out_losses)) + +if __name__ == "__main__": + #NOTE(liangjianzhong): dist unittest should be imlpement using runtime_main in test_dist_base.py + # but the runtime_main in test_dist_base.py use the fleet, DistributedStrategy from + # paddle.fluid.incubate.fleet.collective which is not support by sharding (paddle.distributed.fleet). + # this should be update in future. + # runtime_main(TestDistMnist2x2) + runtime_main() + \ No newline at end of file diff --git a/python/paddle/fluid/tests/unittests/test_dist_sharding_save.py b/python/paddle/fluid/tests/unittests/test_dist_sharding_save.py new file mode 100755 index 00000000000000..b4620d7a0c5a8f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_sharding_save.py @@ -0,0 +1,79 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
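Since sharding_save_persistables delegates to fluid.io.save_persistables on rank 0 (and fluid.io.save_vars on the other ranks), the checkpoint written by the runner above should be readable with the plain fluid loader. A hedged sketch, with exe and train_prog as placeholders and the directory name taken from the runner:

    import paddle.fluid as fluid

    # Restore the variables written by dist_sharding_save.py; rank 0 wrote the
    # complete persistable set, so loading against the full program resolves
    # every variable name (illustrative only, not part of this patch).
    fluid.io.load_persistables(exe, "./ut_sharding_save_model", main_program=train_prog)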
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import shutil +import os +import unittest +from test_dist_base import TestDistBase +import paddle + +paddle.enable_static() + + +class TestDistMnistFleetSave(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl2_mode = True + self._gpu_fleet_api = True + self._sharding_save = True + self._enforce_place = "GPU" + + + def _rm_temp_files(self, dirname): + shutil.rmtree(dirname) + + def _test_saved_files(self, dirname): + + sharding_save_files = sorted(os.listdir(dirname)) + + check_files = ['fc_0.b_0', 'fc_0.b_0_velocity_0', 'fc_0.w_0', 'fc_0.w_0_velocity_0', 'fc_1.b_0', + 'fc_1.b_0_velocity_0', 'fc_1.w_0', 'fc_1.w_0_velocity_0', 'fc_2.b_0', + 'fc_2.b_0_velocity_0', 'fc_2.w_0', 'fc_2.w_0_velocity_0', 'learning_rate_0'] + + if sharding_save_files != check_files: + self._rm_temp_files(dirname) + raise ValueError("Test Failed.") + self._rm_temp_files(dirname) + + return True + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=True, + need_envs={}, + log_name=""): + required_envs = self._get_required_envs(check_error_log, need_envs) + + tr0_losses, tr1_losses = self._run_cluster_nccl2( + model_file, + required_envs, + False, + check_error_log, + log_name=log_name) + + dirname = './ut_sharding_save_model' + self._test_saved_files(dirname) + + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place("dist_sharding_save.py", delta=1e-5) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py old mode 100644 new mode 100755 index 6a9f3e3ba7bf35..063ff726b10e4b --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -17,8 +17,11 @@ import os import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker +import paddle.fluid.core as core +import paddle.fluid as fluid from fleet_meta_optimizer_base import TestFleetMetaOptimizer +from paddle.distributed.fleet.meta_optimizers.sharding.utils import add_sync_comm_for_test, sharding_save_persistables, comm_analyse paddle.enable_static() @@ -270,6 +273,25 @@ def test_sharding_gradient_clip(self): 'momentum' ]) + def test_sharding_clone_for_test(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'sharding') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + comm_analyse(train_prog) + test_prog = train_prog.clone(for_test=True) + add_sync_comm_for_test(test_prog, strategy) + ops = [op.type for op in test_prog.global_block().ops] + + self.assertEqual(ops, ['fill_constant', 'fill_constant', 'fill_constant', 'c_sync_calc_stream', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'mul', + 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'softmax', + 'cross_entropy2', 'mean']) + + + + if __name__ == "__main__": unittest.main() From 20b12765982d7ba152b0bf90bfd6cdb71bd6cd55 Mon Sep 17 00:00:00 2001 From: wawltor 
Date: Wed, 18 Nov 2020 20:58:50 +0800 Subject: [PATCH 54/56] faster the compare ops dygraph model speed faster the compare ops dygraph model speed --- .../fluid/tests/unittests/test_compare_op.py | 9 ++ python/paddle/tensor/logic.py | 94 +++++++++++++++++-- 2 files changed, 97 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py index 25ae65aa7c968b..63a43432b4e555 100644 --- a/python/paddle/fluid/tests/unittests/test_compare_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_op.py @@ -93,6 +93,15 @@ def test_api(self): fetch_list=[out]) self.assertEqual((res == self.real_result).all(), True) + def test_dynamic_api(self): + paddle.disable_static() + x = paddle.to_tensor(self.input_x) + y = paddle.to_tensor(self.input_y) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + self.assertEqual((out.numpy() == self.real_result).all(), True) + paddle.enable_static() + def test_broadcast_api_1(self): paddle.enable_static() with program_guard(Program(), Program()): diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index da08270d742e54..839ecaa1fbaecd 100644 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -216,7 +216,20 @@ def equal(x, y, name=None): result1 = paddle.equal(x, y) print(result1.numpy()) # result1 = [True False False] """ - out = fluid.layers.equal(x, y, name=name, cond=None) + if in_dygraph_mode(): + return core.ops.equal(x, y) + + check_variable_and_dtype(x, "x", ["float32", "float64", "int32", "int64"], + "equal") + check_variable_and_dtype(y, "y", ["float32", "float64", "int32", "int64"], + "equal") + helper = LayerHelper("equal", **locals()) + out = helper.create_variable_for_type_inference(dtype='bool') + out.stop_gradient = True + + helper.append_op( + type='equal', inputs={'X': [x], + 'Y': [y]}, outputs={'Out': [out]}) return out @@ -247,7 +260,22 @@ def greater_equal(x, y, name=None): result1 = paddle.greater_equal(x, y) print(result1.numpy()) # result1 = [True False True] """ - out = fluid.layers.greater_equal(x, y, name=name, cond=None) + if in_dygraph_mode(): + return core.ops.greater_equal(x, y) + + check_variable_and_dtype(x, "x", ["float32", "float64", "int32", "int64"], + "greater_equal") + check_variable_and_dtype(y, "y", ["float32", "float64", "int32", "int64"], + "greater_equal") + helper = LayerHelper("greater_equal", **locals()) + out = helper.create_variable_for_type_inference(dtype='bool') + out.stop_gradient = True + + helper.append_op( + type='greater_equal', + inputs={'X': [x], + 'Y': [y]}, + outputs={'Out': [out]}) return out @@ -278,7 +306,22 @@ def greater_than(x, y, name=None): result1 = paddle.greater_than(x, y) print(result1.numpy()) # result1 = [False False True] """ - out = fluid.layers.greater_than(x, y, name=name, cond=None) + if in_dygraph_mode(): + return core.ops.greater_than(x, y) + + check_variable_and_dtype(x, "x", ["float32", "float64", "int32", "int64"], + "greater_than") + check_variable_and_dtype(y, "y", ["float32", "float64", "int32", "int64"], + "greater_than") + helper = LayerHelper("greater_than", **locals()) + out = helper.create_variable_for_type_inference(dtype='bool') + out.stop_gradient = True + + helper.append_op( + type='greater_than', + inputs={'X': [x], + 'Y': [y]}, + outputs={'Out': [out]}) return out @@ -310,7 +353,20 @@ def less_equal(x, y, name=None): result1 = paddle.less_equal(x, y) print(result1.numpy()) # result1 = [True True False] """ - out = 
fluid.layers.less_equal(x, y, name=name, cond=None) + if in_dygraph_mode(): + return core.ops.less_equal(x, y) + + check_variable_and_dtype(x, "x", ["float32", "float64", "int32", "int64"], + "less_equal") + check_variable_and_dtype(y, "y", ["float32", "float64", "int32", "int64"], + "less_equal") + helper = LayerHelper("less_equal", **locals()) + out = helper.create_variable_for_type_inference(dtype='bool') + out.stop_gradient = True + + helper.append_op( + type='less_equal', inputs={'X': [x], + 'Y': [y]}, outputs={'Out': [out]}) return out @@ -342,7 +398,20 @@ def less_than(x, y, name=None): result1 = paddle.less_than(x, y) print(result1.numpy()) # result1 = [False True False] """ - out = fluid.layers.less_than(x, y, force_cpu=False, name=name, cond=None) + if in_dygraph_mode(): + return core.ops.less_than(x, y) + + check_variable_and_dtype(x, "x", ["float32", "float64", "int32", "int64"], + "less_than") + check_variable_and_dtype(y, "y", ["float32", "float64", "int32", "int64"], + "less_than") + helper = LayerHelper("less_than", **locals()) + out = helper.create_variable_for_type_inference(dtype='bool') + out.stop_gradient = True + + helper.append_op( + type='less_than', inputs={'X': [x], + 'Y': [y]}, outputs={'Out': [out]}) return out @@ -375,7 +444,20 @@ def not_equal(x, y, name=None): result1 = paddle.not_equal(x, y) print(result1.numpy()) # result1 = [False True True] """ - out = fluid.layers.not_equal(x, y, name=name, cond=None) + if in_dygraph_mode(): + return core.ops.not_equal(x, y) + + check_variable_and_dtype(x, "x", ["float32", "float64", "int32", "int64"], + "not_equal") + check_variable_and_dtype(y, "y", ["float32", "float64", "int32", "int64"], + "not_equal") + helper = LayerHelper("not_equal", **locals()) + out = helper.create_variable_for_type_inference(dtype='bool') + out.stop_gradient = True + + helper.append_op( + type='not_equal', inputs={'X': [x], + 'Y': [y]}, outputs={'Out': [out]}) return out From 19226ba8d682f1b41dfaef97761784415f1b8e0c Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Wed, 18 Nov 2020 21:09:46 +0800 Subject: [PATCH 55/56] Simplify the timeline, to remove the prefix of each event. 
(#28723) --- tools/timeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/timeline.py b/tools/timeline.py index 44c1c09b803dfc..119018380b551c 100644 --- a/tools/timeline.py +++ b/tools/timeline.py @@ -58,7 +58,7 @@ def _create_event(self, ph, category, name, pid, tid, timestamp): event = {} event['ph'] = ph event['cat'] = category - event['name'] = name + event['name'] = name.replace("ParallelExecutor::Run/", "") event['pid'] = pid event['tid'] = tid event['ts'] = timestamp From 3d09929b1f28b978a5f34dc6139546c4d7def323 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 18 Nov 2020 22:05:41 +0800 Subject: [PATCH 56/56] Add check for non-dispensable input (#28666) * Add check for non-dispensable input * fix typo --- paddle/fluid/pybind/op_function.h | 16 ++++++++++++++-- paddle/fluid/pybind/op_function_generator.cc | 7 ++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/pybind/op_function.h b/paddle/fluid/pybind/op_function.h index 70b321f658cd2c..1e20ac958b9bbb 100644 --- a/paddle/fluid/pybind/op_function.h +++ b/paddle/fluid/pybind/op_function.h @@ -36,9 +36,15 @@ namespace pybind { static inline std::shared_ptr CastPyHandleToVarBase( const std::string& op_type, const std::string& arg_name, int arg_idx, - const py::handle& handle) { + const py::handle& handle, bool dispensable = false) { PyObject* py_obj = handle.ptr(); // get underlying PyObject if (!py_obj || py_obj == Py_None) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be Tensor, but got " + "%s", + op_type, arg_name, arg_idx, Py_TYPE(py_obj)->tp_name)); + } return nullptr; } try { @@ -54,9 +60,15 @@ static inline std::shared_ptr CastPyHandleToVarBase( static inline std::vector> CastPyHandleToVarBaseList(const std::string& op_type, const std::string& arg_name, int arg_idx, - const py::handle& handle) { + const py::handle& handle, bool dispensable = false) { PyObject* py_obj = handle.ptr(); // get underlying PyObject if (!py_obj || py_obj == Py_None) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be Tensor, but got " + "%s", + op_type, arg_name, arg_idx, Py_TYPE(py_obj)->tp_name)); + } return {}; } std::vector> result; diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 10914cf0ab7ba2..0f5ce841559462 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -166,10 +166,10 @@ const char* OUT_VAR_TYPE = R"(std::shared_ptr)"; const char* OUT_VAR_LIST_TYPE = R"(std::vector>)"; const char* CAST_VAR_TEMPLATE = R"( - auto %s = CastPyHandleToVarBase("%s", "%s", %d, %s);)"; + auto %s = CastPyHandleToVarBase("%s", "%s", %d, %s, %s);)"; const char* CAST_VAR_LIST_TEMPLATE = R"( - auto %s = CastPyHandleToVarBaseList("%s", "%s", %d, %s);)"; + auto %s = CastPyHandleToVarBaseList("%s", "%s", %d, %s, %s);)"; const char* ARG_TEMPLATE = R"(const %s& %s)"; @@ -263,9 +263,10 @@ GenerateOpFunctions(const std::string& module_name) { input_args_num++; const auto in_cast_type = input.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE; + auto dispensable = input.dispensable() ? "true" : "false"; ins_cast_str += paddle::string::Sprintf(in_cast_type, in_name, op_type, in_name, - arg_idx++, TempName(in_name)); + arg_idx++, TempName(in_name), dispensable); if (input.dispensable()) { const auto in_template = input.duplicable()